Ejemplo n.º 1
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Relate two single-node tree patterns through linguistic relations.

     Args:
       tree_pattern1: candidate source pattern; must be a TreePattern.
       tree_pattern2: candidate target pattern; must be a TreePattern.

     Returns:
       A list with one Similarity (scored with self.kScore) per relation
       produced by LinguisticRelationship for the two phrases. The list is
       empty when either argument is not a TreePattern or when either
       pattern does not have exactly one node.
     """
     both_are_patterns = (isinstance(tree_pattern1, TreePattern)
                          and isinstance(tree_pattern2, TreePattern))
     if not both_are_patterns:
         return []
     first_nodes = tree_pattern1.GetNodes()
     second_nodes = tree_pattern2.GetNodes()
     # Gapped phrases are not supported yet: each side must be a single token.
     if not (len(first_nodes) == 1 and len(second_nodes) == 1):
         return []
     first_phrase = '_'.join(first_nodes)
     second_phrase = '_'.join(second_nodes)
     return [
         Similarity(self.kScore, relation, tree_pattern1, tree_pattern2)
         for relation in LinguisticRelationship(first_phrase, second_phrase)
     ]
Ejemplo n.º 2
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Cost two patterns by how much their inner-node multisets differ.

     Every inner node of tree_pattern1 counts +1.0 and every inner node of
     tree_pattern2 counts -1.0; the cost is the sum of the absolute values
     of the resulting per-node balances.

     Returns:
       A one-element list holding a Similarity labelled 'inner_node_diff'.
       The cost is 0.0 when neither pattern has inner nodes.
     """
     source_inner = tree_pattern1.GetInnerNodes()
     target_inner = tree_pattern2.GetInnerNodes()
     # Signed bag of words: positive for source-only, negative for target-only.
     balance = defaultdict(float)
     for nodes, delta in ((source_inner, 1.0), (target_inner, -1.0)):
         for node in nodes:
             balance[node] += delta
     has_no_nodes = (len(source_inner) + len(target_inner)) == 0
     # The explicit 0.0 keeps the cost a float even for an empty bag.
     cost = 0.0 if has_no_nodes else sum(
         abs(count) for count in balance.values())
     return [
         Similarity(cost, 'inner_node_diff', tree_pattern1, tree_pattern2)
     ]
Ejemplo n.º 3
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """Relate two string patterns through linguistic relations.

   The distinct leaves of each pattern are joined with '_' into a phrase, and
   one Similarity (scored with self.kLinguisticVariation) is produced per
   relation that LinguisticRelationship yields for the two phrases.

   Duplicated leaves are collapsed, but the first-occurrence order of the
   leaves is preserved. Joining an unordered set would make the phrase (and
   therefore the relations found) depend on the string hash seed.

   Args:
     tree_pattern1: pattern exposing IsString(), passed to GetLeaves.
     tree_pattern2: pattern exposing IsString(), passed to GetLeaves.

   Returns:
     A list of Similarity objects; empty when either pattern is not a string
     pattern or when no relation is produced.
   """
   if not (tree_pattern1.IsString() and tree_pattern2.IsString()):
     return []

   def unique_in_order(leaves):
     # Order-preserving de-duplication (leaves must be hashable).
     seen = set()
     ordered = []
     for leaf in leaves:
       if leaf not in seen:
         seen.add(leaf)
         ordered.append(leaf)
     return ordered

   phrase1 = '_'.join(unique_in_order(GetLeaves(tree_pattern1)))
   phrase2 = '_'.join(unique_in_order(GetLeaves(tree_pattern2)))
   return [
       Similarity(self.kLinguisticVariation, relation,
                  tree_pattern1, tree_pattern2)
       for relation in LinguisticRelationship(phrase1, phrase2)
   ]
Ejemplo n.º 4
0
 def GetSimilar(self, tree_pattern1):
     """Propose a transliterated target pattern for a single '@' token.

     Args:
       tree_pattern1: source pattern exposing IsString() and GetLeaves().

     Returns:
       A one-element list with a Similarity (self.kProb, self.relation)
       between tree_pattern1 and a newly built target TreePattern, or an
       empty list when the pattern is not a string, does not have exactly
       one leaf, or its leaf does not start with '@'.
     """
     if not tree_pattern1.IsString():
         return []
     leaves = tree_pattern1.GetLeaves()
     if len(leaves) != 1:
         return []
     source_token = leaves[0]
     if not source_token.startswith('@'):
         return []
     # Drop the leading '@', transliterate and title-case the remainder,
     # then swap spaces for the target token separator.
     transliterated = TransliterateJaEn(source_token[1:]).title()
     joined = transliterated.replace(' ', self.trg_token_separator)
     # Put the '@' marker back in front of the transliterated string.
     tree_pattern2 = TreePattern('@{0}'.format(joined), (), [])
     return [
         Similarity(self.kProb, self.relation, tree_pattern1, tree_pattern2)
     ]
Ejemplo n.º 5
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Weight two patterns by the difference of their excluded-node sets.

     The weight adds three terms:
       * min(#nodes only in pattern 1, #nodes only in pattern 2)
         times self.kLinguisticVariationCost,
       * the surplus of distinct excluded nodes in pattern 1
         times self.kDeletionCost,
       * the surplus of distinct excluded nodes in pattern 2
         times self.kInsertionCost.

     Returns:
       A one-element list holding a Similarity labelled 'leaf_similarity'.

     Raises:
       AssertionError: if the two patterns have a different number of
         subpaths (only when assertions are enabled).
     """
     assert len(tree_pattern1.subpaths) == len(tree_pattern2.subpaths), (
         'Number of subpaths from tree_pattern1 and tree_pattern2 differ: {0} vs. {1}'
         .format(tree_pattern1, tree_pattern2))
     excluded1 = set(tree_pattern1.GetExcludedNodes())
     excluded2 = set(tree_pattern2.GetExcludedNodes())
     only_in_1 = excluded1 - excluded2
     only_in_2 = excluded2 - excluded1
     # A positive gap means pattern 1 has more distinct excluded nodes.
     size_gap = len(excluded1) - len(excluded2)
     num_substituted = min(len(only_in_1), len(only_in_2))
     num_deleted = max(0, size_gap)
     num_inserted = max(0, -size_gap)
     weight = (num_substituted * self.kLinguisticVariationCost
               + num_deleted * self.kDeletionCost
               + num_inserted * self.kInsertionCost)
     return [
         Similarity(weight, 'leaf_similarity', tree_pattern1, tree_pattern2)
     ]
Ejemplo n.º 6
0
 def GetSimilarity(self, src_treep, trg_treep):
   """Cost a source phrase against a target URI using an ngram ratio.

   The last target leaf is taken as the URI and split with
   self.trg_token_separators; both token lists go through filter_tokens
   before get_ngram_ratio compares them. Mappings whose ratio is below
   self.ngram_min_ratio are rejected.

   The cost is (2.1 - ngram_ratio) * (number of source leaves + 1), so a
   ratio of 1.0 costs 1.1 per unit and lower ratios cost more; the per-unit
   cost therefore stays above 1.0 for ratios up to 1.0.

   The relation is self.GetURIRole(uri) when the target has a single leaf
   (a falsy role rejects the mapping). Otherwise self.extra_cost is added
   and the relation is self.bridge_relation. Entity and bridge relations
   additionally require IsPlausibleEntityPhrase(src_treep).

   Returns:
     A one-element list with the Similarity, or an empty list when the
     pair is not eligible or is rejected by any of the checks above.
   """
   if not self.IsEligible(src_treep, trg_treep):
     return []
   source_leaves = GetLeaves(src_treep)
   target_leaves = GetLeaves(trg_treep)
   uri = target_leaves[-1]
   source_size = len(source_leaves)
   single_target_leaf = len(target_leaves) == 1
   uri_tokens = SplitLeavesBy([uri], self.trg_token_separators)
   source_tokens = filter_tokens(source_leaves)
   uri_tokens = filter_tokens(uri_tokens)
   ngram_ratio = get_ngram_ratio(source_tokens, uri_tokens)
   # Kept as a negated '>=' so a NaN ratio is rejected exactly as before.
   if not (ngram_ratio >= self.ngram_min_ratio):
     return []
   cost = (2.1 - ngram_ratio) * (source_size + 1)
   if single_target_leaf:
     relation = self.GetURIRole(uri)
     if not relation:
       return []
   else:
     cost += self.extra_cost
     relation = self.bridge_relation
   needs_entity_phrase = relation in (self.entity_relation,
                                      self.bridge_relation)
   if needs_entity_phrase and not IsPlausibleEntityPhrase(src_treep):
     return []
   return [Similarity(cost, relation, src_treep, trg_treep)]
Ejemplo n.º 7
0
    def GetSimilarity(self, tree_pattern1, tree_pattern2):
        """Score a partial dictionary match between source and target leaves.

        Source and target leaves are split into words (target leaves first
        lose any leading '!'), optionally lowercased, and looked up in the
        source and target indices. If at least one index entry is shared by
        a source word and a target word, the cost is delegated to
        self.GetSimilarityCost.

        Design note carried over from the original author (the cost logic
        itself lives in GetSimilarityCost and is not visible here): the cost
        of a partial match is meant to be the worst (max) cost among all
        partially matching entries, so that this cost function is not
        preferred over the exact match implemented in DictionaryCost().

        Returns:
          A one-element list with a Similarity labelled self.relation, or
          an empty list when the pair is not eligible or no index entry is
          shared.
        """
        if not self.IsEligible(tree_pattern1, tree_pattern2):
            return []
        source_leaves = GetLeaves(tree_pattern1)
        target_leaves = [
            leaf.lstrip('!') for leaf in GetLeaves(tree_pattern2)
        ]
        # Break the leaves into words using the per-side token separators.
        src_words = SplitLeavesBy(source_leaves, self.src_token_separators)
        trg_words = SplitLeavesBy(target_leaves, self.trg_token_separators)
        if self.lowercase:
            src_words = [token.lower() for token in src_words]
            trg_words = [token.lower() for token in trg_words]
        # Indices of bilingual phrases containing at least one source word.
        hits_per_src_word = [self.get_src_index(token) for token in src_words]
        src_indices = set(itertools.chain.from_iterable(hits_per_src_word))
        # Likewise for the target words; unknown words contribute nothing.
        hits_per_trg_word = [
            self.trg_index.get(token, []) for token in trg_words
        ]
        trg_indices = set(itertools.chain.from_iterable(hits_per_trg_word))

        shared_indices = src_indices & trg_indices
        if not shared_indices:
            return []

        cost = self.GetSimilarityCost(src_words, trg_words, shared_indices)
        return [Similarity(cost, self.relation, tree_pattern1, tree_pattern2)]
Ejemplo n.º 8
0
 def GetSimilar(self, word):
     """Return a single target-less Similarity for `word`.

     The Similarity is scored with self.kScore and carries None as both its
     relation and its target.
     """
     only_candidate = Similarity(self.kScore, None, word, None)
     return [only_candidate]
Ejemplo n.º 9
0
 def GetSimilar(self, src_treep):
   """Placeholder: generating similar patterns is not supported here.

   A statement that used to follow the raise could never run and has been
   dropped; it would have returned
   [Similarity(self.high_cost, None, src_treep, None)].

   Args:
     src_treep: source tree pattern (unused).

   Raises:
     ValueError: always, with the message 'Not implemented'.
   """
   raise ValueError('Not implemented')
Ejemplo n.º 10
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """Cost of deleting every source leaf and inserting every target leaf.

   Leaves are counted once each (duplicates collapse): the distinct leaves
   of tree_pattern1 are charged self.kDeletionCost and the distinct leaves
   of tree_pattern2 are charged self.kInsertionCost.

   Returns:
     A one-element list holding a Similarity whose relation is None.
   """
   num_deleted = len(set(GetLeaves(tree_pattern1)))
   num_inserted = len(set(GetLeaves(tree_pattern2)))
   deletion_part = num_deleted * self.kDeletionCost
   insertion_part = num_inserted * self.kInsertionCost
   total_weight = deletion_part + insertion_part
   return [Similarity(total_weight, None, tree_pattern1, tree_pattern2)]
Ejemplo n.º 11
0
 def GetSimilar(self, word):
   """Placeholder: generating similar items is not supported here.

   A statement that used to follow the raise could never run and has been
   dropped; it would have returned
   [Similarity(self.kScore, None, word, None)].

   Args:
     word: source item (unused).

   Raises:
     ValueError: always, with the message 'Not implemented'.
   """
   raise ValueError('Not implemented')