def IsEligibleSrc(self, src_treep):
    """A source tree pattern is eligible if it contains no variables and
    its filtered leaves are non-empty and within the phrase-length limit."""
    if src_treep.HasVariables():
        return False
    src_leaves = filter_tokens(GetLeaves(src_treep))
    if not src_leaves:
        return False
    if len(src_leaves) > self.max_src_phrase_length:
        return False
    return True
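GetLeaves and filter_tokens are not defined in this snippet. Below is a minimal sketch of what they are assumed to do (collect the terminal tokens of a tree pattern, then drop stopword-like tokens); the treep.leaves() accessor and the stopword set are purely illustrative:

STOPWORDS = {'the', 'a', 'an', 'of', 'in', 'to', 'fb:', 'en', 'm'}  # illustrative

def GetLeaves(treep):
    # Assumed to return the terminal tokens of the tree pattern,
    # left to right; the leaves() accessor is hypothetical.
    return list(treep.leaves())

def filter_tokens(tokens):
    # Assumed to drop stopword-like tokens: determiners and prepositions
    # on the source side, URI scaffolding ("fb:", "en", "m") on the target.
    return [t for t in tokens if t.lower() not in STOPWORDS]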
def GetSimilarity(self, src_treep, trg_treep):
    """
    If 'predicate' is among the roles of the target URI, the relation is
    labelled as self.predicate_relation; otherwise, as self.entity_relation.
    Assuming the best possible cost for transforming each source or target
    token is 1.0, this cost function cannot give costs below that. When the
    ngram ratio is 1.0 (a perfect match of source into target; note that
    the ratio is asymmetric), the cost to transform each token is
    (2.1 - ngram_ratio) = 1.1. Lower ngram ratios give higher costs.
    The minimum ngram ratio for a mapping to be eligible is
    self.ngram_min_ratio.
    """
    similarities = []
    if not self.IsEligible(src_treep, trg_treep):
        return similarities
    src_leaves = GetLeaves(src_treep)
    trg_leaves = GetLeaves(trg_treep)
    uri = trg_leaves[-1]
    num_src_leaves = len(src_leaves)
    num_trg_leaves = len(trg_leaves)
    # Only the URI (the last target leaf) is kept and split into tokens.
    trg_leaves = SplitLeavesBy([uri], self.trg_token_separators)
    src_leaves = filter_tokens(src_leaves)
    trg_leaves = filter_tokens(trg_leaves)
    ngram_ratio = get_ngram_ratio(src_leaves, trg_leaves)
    if ngram_ratio >= self.ngram_min_ratio:
        cost = (2.1 - ngram_ratio) * (num_src_leaves + 1)
        if num_trg_leaves == 1:
            relation = self.GetURIRole(uri)
            if not relation:
                return similarities
        else:
            # More than one target leaf signals a bridging construction.
            cost += self.extra_cost
            relation = self.bridge_relation
        if (relation in [self.entity_relation, self.bridge_relation]
                and not IsPlausibleEntityPhrase(src_treep)):
            return similarities
        similarities = [Similarity(cost, relation, src_treep, trg_treep)]
    return similarities
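As a quick check of the numbers in the docstring: with a perfect ngram match the per-token cost is 1.1, scaled by the source length plus one. The values below are illustrative:

# Worked example of the cost formula above (illustrative values).
ngram_ratio = 1.0    # perfect (asymmetric) match of source into target
num_src_leaves = 3   # hypothetical source phrase length
cost = (2.1 - ngram_ratio) * (num_src_leaves + 1)
print(cost)  # 4.4, i.e. 1.1 per token over (num_src_leaves + 1) tokens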
def GetSimilarityCost(self, src_words, trg_words, common_indices):
    """
    The cost per token lies between 1.0 and 2.0 (plus self.extra_cost for
    bridges). For every candidate entry index, this method:
    1. Splits words according to the token separators,
    2. Filters out stopwords in the source (e.g. determiners or
       prepositions) and the target (e.g. "fb:", "en", "m"),
    3. Computes the ngram ratio between source and target tokens,
    4. Multiplies (2.0 - ngram_ratio) by the number of source words
       plus one,
    5. Adds self.extra_cost if there are two target URIs.
    """
    num_src_words = len(src_words)
    num_trg_words = len(trg_words)
    src_words = filter_tokens(src_words)
    # Only the URI (the last target word) is split into ngram tokens.
    trg_words = SplitLeavesBy(trg_words[-1:],
                              self.trg_ngram_token_separators)
    trg_words = filter_tokens(trg_words)
    costs = []
    for common_index in common_indices:
        stored_src_words = self.src_trg_cost[common_index][0].split()
        stored_trg_words = self.src_trg_cost[common_index][1].split()
        stored_trg_words = SplitLeavesBy(stored_trg_words,
                                         self.trg_ngram_token_separators)
        stored_src_words = filter_tokens(stored_src_words)
        stored_trg_words = filter_tokens(stored_trg_words)
        src_ngram_ratio = get_ngram_ratio(src_words, stored_src_words)
        trg_ngram_ratio = get_ngram_ratio(trg_words, stored_trg_words)
        # Average the source-side and target-side ngram ratios.
        ngram_ratio = (src_ngram_ratio + trg_ngram_ratio) / 2
        cost = (2.0 - ngram_ratio) * (num_src_words + 1)
        if num_trg_words == 2:
            cost += self.extra_cost
        costs.append(cost)
    # Assumes common_indices is non-empty; min() on an empty list raises.
    best_cost = min(costs)
    return best_cost
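get_ngram_ratio is also not shown. Below is a minimal sketch of the behavior the docstrings above suggest (the fraction of source ngrams that also occur in the target, hence asymmetric and in [0.0, 1.0]); max_n and the set-based matching are assumptions:

def get_ngram_ratio(src_tokens, trg_tokens, max_n=3):
    # Hypothetical sketch: the fraction of source ngrams (up to max_n)
    # that also appear among the target ngrams; asymmetric, in [0.0, 1.0].
    def ngrams(tokens):
        return {tuple(tokens[i:i + n])
                for n in range(1, max_n + 1)
                for i in range(len(tokens) - n + 1)}
    src_ngrams = ngrams(src_tokens)
    if not src_ngrams:
        return 0.0
    return len(src_ngrams & ngrams(trg_tokens)) / len(src_ngrams)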