def IsEligibleSrc(self, src_treep):
    """Return True iff src_treep can act as a mapping source.

    A source tree is eligible when it contains no variables and, after
    stopword filtering, still has at least one leaf but no more than
    self.max_src_phrase_length leaves.
    """
    # Trees with variables are never eligible sources.
    if src_treep.HasVariables():
        return False
    leaves = filter_tokens(GetLeaves(src_treep))
    # Non-empty after filtering, and within the configured phrase length.
    return bool(leaves) and len(leaves) <= self.max_src_phrase_length
def GetSimilarity(self, src_treep, trg_treep):
    """Score the mapping of src_treep onto trg_treep.

    If 'predicate' is within the roles of the target URI, then the
    relation is labelled as self.predicate_relation; otherwise as
    self.entity_relation. Assuming the best possible cost to be 1.0 for
    the transformation of each source or target token, this cost
    function cannot give costs below that. When the ngram ratio is 1.0
    (perfect match of source into target; note it is asymmetric) the
    per-token cost is (2.1 - ngram_ratio) = 1.1; lower ngram ratios
    give higher costs. Mappings whose ratio falls below
    self.ngram_min_ratio are discarded.

    Returns a list with a single Similarity, or an empty list.
    """
    if not self.IsEligible(src_treep, trg_treep):
        return []
    source_tokens = GetLeaves(src_treep)
    target_tokens = GetLeaves(trg_treep)
    uri = target_tokens[-1]
    # Leaf counts are taken *before* the splitting/filtering below.
    num_source = len(source_tokens)
    num_target = len(target_tokens)
    # Only the final URI contributes target tokens to the ngram ratio.
    target_tokens = SplitLeavesBy([uri], self.trg_token_separators)
    source_tokens = filter_tokens(source_tokens)
    target_tokens = filter_tokens(target_tokens)
    ngram_ratio = get_ngram_ratio(source_tokens, target_tokens)
    if ngram_ratio < self.ngram_min_ratio:
        return []
    cost = (2.1 - ngram_ratio) * (num_source + 1)
    if num_target == 1:
        relation = self.GetURIRole(uri)
        if not relation:
            return []
    else:
        # More than one target leaf: treat as a bridge, with a penalty.
        cost += self.extra_cost
        relation = self.bridge_relation
    if (relation in [self.entity_relation, self.bridge_relation]
            and not IsPlausibleEntityPhrase(src_treep)):
        return []
    return [Similarity(cost, relation, src_treep, trg_treep)]
def GetSimilarityCost(self, src_words, trg_words, common_indices):
    """Return the best (lowest) cost among the candidate lexicon entries.

    The cost per token is guaranteed to be between 0 and 1 (except for
    bridges). For every candidate entry index:
    1. Splits words according to token separators,
    2. Filters out stopwords in source (e.g. determiners or
       prepositions) and target (e.g. "fb:", "en", "m"),
    3. Computes the ngram ratio between source and target tokens,
    4. Multiplies the ratio's complement (2.0 - ratio) by the number of
       source words + 1,
    5. If there are two target URIs, adds self.extra_cost.

    Returns float('inf') when common_indices is empty (no candidates).
    """
    num_src_words = len(src_words)
    num_trg_words = len(trg_words)
    src_words = filter_tokens(src_words)
    # Only the last target word (the URI) is split and compared.
    trg_words = SplitLeavesBy(trg_words[-1:], self.trg_ngram_token_separators)
    trg_words = filter_tokens(trg_words)
    costs = []
    for common_index in common_indices:
        stored_src_words = self.src_trg_cost[common_index][0].split()
        stored_trg_words = self.src_trg_cost[common_index][1].split()
        stored_trg_words = SplitLeavesBy(
            stored_trg_words, self.trg_ngram_token_separators)
        stored_src_words = filter_tokens(stored_src_words)
        stored_trg_words = filter_tokens(stored_trg_words)
        src_ngram_ratio = get_ngram_ratio(src_words, stored_src_words)
        trg_ngram_ratio = get_ngram_ratio(trg_words, stored_trg_words)
        # Average the source-side and target-side ratios.
        ngram_ratio = (src_ngram_ratio + trg_ngram_ratio) / 2
        cost = (2.0 - ngram_ratio) * (num_src_words + 1)
        if num_trg_words == 2:
            # Bridge penalty. BUG FIX: was self.ExtraCost, which would
            # raise AttributeError — the attribute is named extra_cost
            # elsewhere in this class (see GetSimilarity).
            cost += self.extra_cost
        costs.append(cost)
    # min() on an empty list raises ValueError; report "no match" instead.
    return min(costs, default=float('inf'))