Esempio n. 1
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Scores a pattern pair by deleting and inserting distinct leaves.

     When both patterns are strings the cost is ``self.kSubstitutionCost``.
     Otherwise it is the number of distinct leaves of the first pattern
     times ``self.kDeletionCost`` plus the number of distinct leaves of the
     second pattern times ``self.kInsertionCost``.

     Returns:
       A one-element list with a Similarity whose relation is None.
     """
     if tree_pattern1.IsString() and tree_pattern2.IsString():
         cost = self.kSubstitutionCost
     else:
         # Leaves go through a set, so a repeated leaf is counted only once.
         num_deleted = len(set(GetLeaves(tree_pattern1)))
         num_inserted = len(set(GetLeaves(tree_pattern2)))
         cost = num_deleted * self.kDeletionCost \
              + num_inserted * self.kInsertionCost
     return [Similarity(cost, None, tree_pattern1, tree_pattern2)]
Esempio n. 2
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """Scores a pattern pair by substituting, deleting and inserting leaves.

   Two string patterns cost ``self.kSubstitutionCost``. Otherwise as many
   leaves as the smaller pattern has are charged as substitutions, surplus
   leaves of the first pattern as deletions, and surplus leaves of the
   second pattern as insertions.

   Returns:
     A one-element list with a Similarity labelled ``self.relation``.
   """
   if not (tree_pattern1.IsString() and tree_pattern2.IsString()):
     n_src = len(GetLeaves(tree_pattern1))
     n_trg = len(GetLeaves(tree_pattern2))
     n_substituted = min(n_src, n_trg)
     n_deleted = max(0, n_src - n_substituted)
     n_inserted = max(0, n_trg - n_substituted)
     cost = n_substituted * self.kSubstitutionCost \
          + n_deleted * self.kDeletionCost \
          + n_inserted * self.kInsertionCost
   else:
     cost = self.kSubstitutionCost
   return [Similarity(cost, self.relation, tree_pattern1, tree_pattern2)]
  def test_TerminalEqualVarUpper(self):
    """Checks var_ind on an upper-case variable leaf vs. a lower-case one.

    GetSimilarity must return one zero-cost 'var_copy' similarity for the
    pair, and GetSimilar must return a zero-cost 'var_copy' similarity whose
    target is the string pattern 'A-1'.
    """
    src_pattern = TreePattern(tree_or_string('(:index A-1)'), (0,), [])
    trg_pattern = TreePattern(tree_or_string('(:tense e-2)'), (0,), [])

    # Scoring an explicit source/target pair.
    scored = self.var_ind.GetSimilarity(src_pattern, trg_pattern)
    self.assertEqual(1, len(scored))
    self.assertEqual(
      [Similarity(0.0, 'var_copy', src_pattern, trg_pattern)], scored)

    # Generating the target from the source alone.
    generated = self.var_ind.GetSimilar(src_pattern)
    copied_pattern = TreePattern('A-1', (), [])
    self.assertEqual(
      [Similarity(0.0, 'var_copy', src_pattern, copied_pattern)], generated)
  def test_TerminalEqualEntity(self):
    """Checks ent_ind on two '@'-prefixed entity leaves.

    GetSimilarity must return one zero-cost 'entity_copy' similarity for the
    pair, and GetSimilar must return a zero-cost 'entity_copy' similarity
    whose target is the string pattern '@Sugita_Yamamoto'.
    """
    src_pattern = TreePattern(tree_or_string(u'(N @杉田山本)'), (0,), [])
    trg_pattern = TreePattern(
      tree_or_string(u'(N @Sugita_Yamamoto)'), (0,), [])

    # Scoring an explicit source/target pair.
    scored = self.ent_ind.GetSimilarity(src_pattern, trg_pattern)
    self.assertEqual(1, len(scored))
    self.assertEqual(
      [Similarity(0.0, 'entity_copy', src_pattern, trg_pattern)], scored)

    # Generating the target from the source alone.
    generated = self.ent_ind.GetSimilar(src_pattern)
    copied_pattern = TreePattern(u'@Sugita_Yamamoto', (), [])
    self.assertEqual(
      [Similarity(0.0, 'entity_copy', src_pattern, copied_pattern)],
      generated)
Esempio n. 5
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Costs the mismatch in variable types between two patterns.

     Leaves matching ``self.is_var`` are lower-cased and reduced to their
     first character (their type, e.g. 'e' for 'e-3'); variable numbers are
     ignored. Each source variable adds 1.0 to its type's tally and each
     target variable subtracts 1.0. The cost is the sum of the absolute
     tallies, or 0.0 when neither pattern has variables.

     Returns:
       A one-element list with a 'var_diff' Similarity.
     """
     is_variable = self.is_var.match
     source_vars = [leaf.lower() for leaf in tree_pattern1.GetLeaves()
                    if is_variable(leaf)]
     target_vars = [leaf.lower() for leaf in tree_pattern2.GetLeaves()
                    if is_variable(leaf)]
     # Signed bag of variable types: positive means surplus on the source
     # side, negative means surplus on the target side.
     type_balance = defaultdict(float)
     for sign, variables in ((1.0, source_vars), (-1.0, target_vars)):
         for variable in variables:
             type_balance[variable[0]] += sign
     if source_vars or target_vars:
         cost = sum([abs(balance) for balance in type_balance.values()])
     else:
         cost = 0.0
     return [Similarity(cost, 'var_diff', tree_pattern1, tree_pattern2)]
Esempio n. 6
0
  def setUp(self):
    """Creates the shared fixtures: a dummy similarity, four transformations
    and two priority queues of size 2 (one of them reversed)."""
    dummy = Similarity(1.0, 'dummy', None, None)
    self.similarity = dummy

    # Three transformations with distinct source/target paths and subpaths.
    self.t0 = Transformation(
      (0,), (10,), ((0, 0), (0, 1)), ((10, 0), (10, 1)), dummy)
    self.t1 = Transformation(
      (1,), (11,), ((1, 0), (1, 1)), ((11, 0), (11, 1)), dummy)
    self.t2 = Transformation(
      (2,), (12,), ((2, 0), (2, 1)), ((12, 0), (12, 1)), dummy)

    # Same source/target paths as t0, but different subpaths.
    self.t0bis = Transformation(
      (0,), (10,), ((3, 0), (3, 1)), ((13, 0), (13, 1)), dummy)

    self.q_costs = PriorityQueue(2)
    self.q_probs = PriorityQueue(2, reverse=True)
Esempio n. 7
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Scores a pattern pair by substituting, deleting and inserting nodes.

     Two string patterns cost ``self.kSubstitutionCost``. Otherwise distinct
     nodes are counted on each side: the smaller count is charged as
     substitutions, the surplus of the first pattern as deletions, and the
     surplus of the second pattern as insertions.

     Returns:
       A one-element list with a Similarity whose relation is None.
     """
     if tree_pattern1.IsString() and tree_pattern2.IsString():
         weight = self.kSubstitutionCost
     else:
         # Nodes go through a set, so a repeated node is counted only once.
         n_src = len(set(GetNodes(tree_pattern1)))
         n_trg = len(set(GetNodes(tree_pattern2)))
         surplus = n_src - n_trg
         weight = min(n_src, n_trg) * self.kSubstitutionCost \
                + max(0, surplus) * self.kDeletionCost \
                + max(0, -surplus) * self.kInsertionCost
     return [Similarity(weight, None, tree_pattern1, tree_pattern2)]
Esempio n. 8
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Costs the squared difference in inner-node counts of two patterns.

     Returns:
       A one-element list with a 'tree_comp' Similarity.
     """
     num_src_inner = len(tree_pattern1.GetInnerNodes())
     num_trg_inner = len(tree_pattern2.GetInnerNodes())
     cost = abs(num_src_inner - num_trg_inner)**2
     return [Similarity(cost, 'tree_comp', tree_pattern1, tree_pattern2)]
Esempio n. 9
0
 def GetSimilarity(self, src_treep, trg_treep):
   """Assigns a cost that depends on whether the source root is 'NP'.

   Source patterns with variables get no similarity. Otherwise the cost is
   ``self.cost_np`` when the source root equals 'NP' and
   ``self.cost_no_np`` when it does not.

   Returns:
     An empty list, or a one-element list with a Similarity labelled
     ``self.relation``.
   """
   if src_treep.HasVariables():
     return []
   if src_treep.GetRoot() == 'NP':
     cost = self.cost_np
   else:
     cost = self.cost_no_np
   return [Similarity(cost, self.relation, src_treep, trg_treep)]
Esempio n. 10
0
 def GetSimilarity(self, tree1, tree2):
     """Returns a similarity only when both trees have the same text form.

     A string is compared as-is; anything else is compared by its ``repr``.

     Returns:
       A one-element list with a Similarity of cost ``self.kCost`` and
       relation ``self.kDefaultState`` when the text forms are equal,
       otherwise an empty list.
     """
     def as_text(tree):
         return tree if IsString(tree) else repr(tree)

     if as_text(tree1) == as_text(tree2):
         return [Similarity(self.kCost, self.kDefaultState, tree1, tree2)]
     return []
Esempio n. 11
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """Costs the combined size of two patterns, capped at 1.

   One node per pattern is free, so patterns with a single inner node
   (pre-terminals, or transductions consuming one inner node) are not
   penalized. The remaining node counts are added, squared, divided by
   ``self.normalizer`` and capped at 1.

   Returns:
     A one-element list with a 'nodes_difference' Similarity.
   """
   # NOTE(review): if this runs under Python 2 with an integer normalizer,
   # the division below truncates -- confirm the type of self.normalizer.
   extra_nodes_src = max(0, tree_pattern1.GetNumNodes() - 1)
   extra_nodes_trg = max(0, tree_pattern2.GetNumNodes() - 1)
   total_extra_nodes = extra_nodes_src + extra_nodes_trg
   weight = min(1, total_extra_nodes**2 / self.normalizer)
   return [Similarity(weight, 'nodes_difference', tree_pattern1, tree_pattern2)]
Esempio n. 12
0
 def GetSimilarity(self, tree1, tree2):
   """Costs the relative difference in subtree counts of two trees.

   A string counts as zero subtrees. The weight is the absolute difference
   of the two counts divided by the larger one, or 0.0 when both are zero.

   Returns:
     A one-element list with a 'nodes_difference' Similarity.
   """
   size1 = tree1.GetNumSubtrees() if not IsString(tree1) else 0
   size2 = tree2.GetNumSubtrees() if not IsString(tree2) else 0
   if size1 == 0 and size2 == 0:
     # Avoids dividing by zero when both sides are strings or empty.
     weight = 0.0
   else:
     weight = float(abs(size1 - size2)) / max(size1, size2)
   return [Similarity(weight, 'nodes_difference', tree1, tree2)]
Esempio n. 13
0
 def GetSimilarity(self, src_treep, trg_treep):
   """Charges a high cost when a string pattern is on the watched side.

   ``self.side`` selects which pattern is inspected: "source", "target", or
   "both" (either one being a string triggers the penalty). Any other value
   always yields ``self.low_cost``.

   Returns:
     A one-element list with a Similarity labelled ``self.relation``.
   """
   if self.side == "both":
     penalize = src_treep.IsString() or trg_treep.IsString()
   elif self.side == "source":
     penalize = src_treep.IsString()
   elif self.side == "target":
     penalize = trg_treep.IsString()
   else:
     penalize = False
   cost = self.high_cost if penalize else self.low_cost
   return [Similarity(cost, self.relation, src_treep, trg_treep)]
Esempio n. 14
0
 def GetSimilar(self, src_treep):
   """Builds and scores candidate target patterns for a source pattern.

   Ineligible sources yield an empty list. Candidates whose score is None
   are dropped.

   Returns:
     Similarities labelled ``self.relation``, sorted by descending score.
   """
   if not self.IsEligibleSrc(src_treep):
     return []
   scored = []
   for candidate in self.BuildTrgTreePatterns(src_treep):
     score = self.GetScoreSimilarity(src_treep, candidate)
     if score is None:
       continue
     scored.append(Similarity(score, self.relation, src_treep, candidate))
   scored.sort(key=lambda s: s.score, reverse=True)
   return scored
Esempio n. 15
0
 def GetSimilarity(self, tree1, tree2):
     """Costs the difference in variable order between two trees.

     Both trees must have the same number of variables (asserted). With
     fewer than two variables the weight is 0.0. Otherwise Kendall's tau
     between the two variable sequences, in [-1.0, 1.0], is mapped to
     ``(1.0 - tau) / 2``: 0.0 for tau = 1.0 and 1.0 for tau = -1.0.

     Returns:
       A one-element list with an 'order_difference' Similarity, or an
       empty list when the Kendall tau lookup returns None.
     """
     relation = 'order_difference'
     vars1 = self.GetVariables(tree1)
     vars2 = self.GetVariables(tree2)
     assert len(vars1) == len(vars2), \
       'Number of variables differ {0} vs. {1} in trees {2} vs. {3}'.format(
       vars1, vars2, tree1, tree2)
     if not vars1 or len(vars1) < 2:
         # Zero or one variable cannot be out of order.
         return [Similarity(0.0, relation, tree1, tree2)]
     tau = self.KendallTauCached(tuple(vars1), tuple(vars2))
     if tau is None:
         return []
     # Rescale tau from [-1.0, 1.0] to a weight in [0.0, 1.0].
     weight = (1.0 - tau) / 2
     assert weight >= 0, 'Kendall tau value is not positive: {0}'.format(
         weight)
     return [Similarity(weight, relation, tree1, tree2)]
 def MakeSimilar(self, src_treep, src_words, src_indices):
     """Builds one similarity per index into ``self.src_trg_cost``.

     For each index the target text is the second field of the entry, with
     spaces replaced by ``self.trg_token_separator``, wrapped in a string
     TreePattern and scored with ``self.GetScoreSimilar``.

     Returns:
       The similarities with duplicates removed. They pass through a set,
       so their order is not guaranteed.
     """
     collected = []
     for index in src_indices:
         target_text = self.src_trg_cost[index][1].replace(
             ' ', self.trg_token_separator)
         score = self.GetScoreSimilar(src_words, index)
         target_pattern = TreePattern(target_text, (), [])
         collected.append(
             Similarity(score, self.relation, src_treep, target_pattern))
     # Matching is not exact, so the same similarity may occur repeatedly.
     return list(set(collected))
Esempio n. 17
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Costs the sizes of two patterns as a sum of squares.

     The size of a pattern is its number of inner nodes plus its number of
     leaves. The cost is the squared source size plus the squared target
     size.

     Returns:
       A one-element list with a 'tree_size' Similarity.
     """
     src_inner = tree_pattern1.GetInnerNodes()
     trg_inner = tree_pattern2.GetInnerNodes()
     src_size = len(src_inner) + len(GetLeaves(tree_pattern1))
     trg_size = len(trg_inner) + len(GetLeaves(tree_pattern2))
     # Sizes are sums of len() results, so they can never be negative and
     # need no clamping at zero.
     cost = src_size**2 + trg_size**2
     return [Similarity(cost, 'tree_size', tree_pattern1, tree_pattern2)]
Esempio n. 18
0
 def GetSimilar(self, tree_pattern1):
     """Produces similarities from linguistic relationships of a pattern.

     The nodes of the pattern are joined with '_' into a phrase, which is
     passed to ``ObtainLinguisticRelationships``. Each (relation, lemma)
     pair it yields becomes a Similarity of score ``self.kScore`` whose
     target is the lemma itself.

     Returns:
       A list of similarities; empty if the argument is not a TreePattern.
     """
     if not isinstance(tree_pattern1, TreePattern):
         return []
     phrase = '_'.join(tree_pattern1.GetNodes())
     return [
         Similarity(self.kScore, relation, tree_pattern1, lemma)
         for relation, lemma in ObtainLinguisticRelationships(phrase)
     ]
Esempio n. 19
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """
   Computes the similarity *cost* between natural language phrases
   and constituent structures that represent lambda-DCS subtrees.

   Returns an empty list when the pair is not eligible or when no cost
   could be computed (the cost is None); otherwise a one-element list with
   a Similarity labelled ``self.relation``.
   """
   if not self.IsEligible(tree_pattern1, tree_pattern2):
     return []
   cost = self.GetCostSimilarity(tree_pattern1, tree_pattern2)
   if cost is None:
     return []
   return [Similarity(cost, self.relation, tree_pattern1, tree_pattern2)]
Esempio n. 20
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Costs the difference in the number of entity leaves.

     A leaf counts as an entity when it starts with '@'. The cost is the
     absolute difference between the entity counts of the two patterns.

     Returns:
       A one-element list with an 'entity_diff' Similarity.
     """
     num_src_entities = sum(
         1 for leaf in tree_pattern1.GetLeaves() if leaf.startswith('@'))
     num_trg_entities = sum(
         1 for leaf in tree_pattern2.GetLeaves() if leaf.startswith('@'))
     cost = abs(num_src_entities - num_trg_entities)
     return [Similarity(cost, 'entity_diff', tree_pattern1, tree_pattern2)]
Esempio n. 21
0
 def GetSimilar(self, tree_pattern1):
     """Generates zero-cost lexicon targets for the leaves of a pattern.

     The leaves are looked up in the 'entity', 'unary' and 'binary'
     lexicons (in that order) through ``self.GetLexicon``. Every
     (lexical item, type) pair found becomes a Similarity of cost 0.0 whose
     relation is the type and whose target is a string TreePattern holding
     the lexical item.

     Args:
       tree_pattern1: source pattern whose leaves form the phrase.

     Returns:
       A list of similarities. It is empty when the pattern has no leaves
       or more than ``self.max_phrase_length`` leaves, because no target
       pattern can be generated for such a phrase.
     """
     tree1_leaves = GetLeaves(tree_pattern1)
     # The previous code built a 'q0' Similarity here from `tree_pattern2`,
     # a local that is only assigned further down, so this path always
     # raised UnboundLocalError. There is no target to pair with, so report
     # that nothing similar was found.
     if not tree1_leaves or len(tree1_leaves) > self.max_phrase_length:
         return []
     lexicon = (self.GetLexicon(tree1_leaves, 'entity')
                + self.GetLexicon(tree1_leaves, 'unary')
                + self.GetLexicon(tree1_leaves, 'binary'))
     cost = 0.0
     similarities = []
     for lex, lex_type in lexicon:
         trg_pattern = TreePattern(lex, (), [])
         similarities.append(
             Similarity(cost, lex_type, tree_pattern1, trg_pattern))
     return similarities
Esempio n. 22
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Looks up the cost of a phrase pair in ``self.dictionary``.

     Only applies when both patterns are strings. Each phrase is formed by
     joining the pattern's leaves with '_' and lower-casing the result; the
     dictionary is keyed by the (source phrase, target phrase) tuple.

     Returns:
       A one-element list with a 'dictionary' Similarity when the pair is
       in the dictionary, otherwise an empty list.
     """
     if not tree_pattern1.IsString() or not tree_pattern2.IsString():
         return []
     phrase_pair = ('_'.join(GetLeaves(tree_pattern1)).lower(),
                    '_'.join(GetLeaves(tree_pattern2)).lower())
     if phrase_pair not in self.dictionary:
         return []
     return [
         Similarity(self.dictionary[phrase_pair], 'dictionary',
                    tree_pattern1, tree_pattern2)
     ]
Esempio n. 23
0
 def GetSimilarity(self, src_treep, trg_treep):
     """Costs a pattern pair by whether it respects the stored alignment.

     The alignment is looked up in ``self.alignments`` by the string forms
     of the two underlying trees and must exist (asserted). The leaf
     indices of each pattern are projected to the other side, and each
     projection is checked against the other pattern's leaf indices with
     ``IsAlignmentViolated``. The cost is ``self.kCostViolation`` if either
     direction is violated and ``self.kCost`` otherwise.

     Returns:
       A one-element list with a Similarity labelled ``self.relation``.
     """
     tree_pair = (str(src_treep.tree), str(trg_treep.tree))
     alignment = self.alignments.get(tree_pair, None)
     assert alignment is not None
     src_inds = src_treep.GetLeavesIndices()
     trg_inds = trg_treep.GetLeavesIndices()
     projected_to_trg = alignment.get_trg_inds(src_inds)
     projected_to_src = alignment.get_src_inds(trg_inds)
     forward_violated = IsAlignmentViolated(projected_to_trg, trg_inds)
     backward_violated = IsAlignmentViolated(projected_to_src, src_inds)
     cost = self.kCost
     if forward_violated or backward_violated:
         cost = self.kCostViolation
     return [Similarity(cost, self.relation, src_treep, trg_treep)]
Esempio n. 24
0
 def MakeSimilar(self, src_tree_pattern, src_words, src_indices):
     """Builds one similarity per index into ``self.src_trg_cost``.

     For each index the target text is the second field of the entry, with
     spaces replaced by ``self.trg_token_separator``. It is embedded in a
     tree parsed from '(ID [] <target>)' and wrapped in a TreePattern. The
     score comes from ``self.GetSimilarProb``.

     Returns:
       The similarities with duplicates removed. They pass through a set,
       so their order is not guaranteed.
     """
     collected = []
     for index in src_indices:
         target_text = self.src_trg_cost[index][1].replace(
             ' ', self.trg_token_separator)
         prob = self.GetSimilarProb(src_words, index)
         target_tree = Tree.fromstring(u'(ID [] {0})'.format(target_text))
         target_pattern = TreePattern(target_tree, (), [])
         collected.append(
             Similarity(prob, self.relation, src_tree_pattern,
                        target_pattern))
     # Matching is not exact, so the same similarity may occur repeatedly.
     return list(set(collected))
Esempio n. 25
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Gives zero cost to a pair of single-leaf variable patterns.

     Both patterns must be strings with exactly one leaf each, and both
     leaves must match ``self.is_var``.

     Returns:
       A one-element list with a Similarity of cost 0.0 labelled
       ``self.relation`` when all conditions hold, otherwise an empty list.
     """
     is_variable = self.is_var.match
     if not tree_pattern1.IsString() or not tree_pattern2.IsString():
         return []
     src_leaves = tree_pattern1.GetLeaves()
     trg_leaves = tree_pattern2.GetLeaves()
     if not (len(src_leaves) == 1 and len(trg_leaves) == 1):
         return []
     if is_variable(src_leaves[0]) and is_variable(trg_leaves[0]):
         return [Similarity(0.0, self.relation, tree_pattern1, tree_pattern2)]
     return []
Esempio n. 26
0
 def test_NodeSimilar(self):
     """Expects one 'hypernym' similarity between two patterns that each
     have one subpath."""
     tree_pattern1 = TreePattern(
         tree_or_string('(is (italian the) smart)'), (0, ), [(0, 0)])
     tree_pattern2 = TreePattern(
         tree_or_string('(european smart)'), (), [(0, )])
     actual = self.similarity_scorer.GetSimilarity(
         tree_pattern1, tree_pattern2)
     self.assertEqual(1, len(actual))
     expected = [
         Similarity(self.kScore, 'hypernym', tree_pattern1, tree_pattern2)
     ]
     self.assertListEqual(expected, actual)
Esempio n. 27
0
 def test_TerminalToTerminalSimilar(self):
     """Expects one 'hypernym' similarity between the string patterns
     'italian' and 'european'."""
     tree_pattern1 = TreePattern(tree_or_string('italian'), (), [])
     tree_pattern2 = TreePattern(tree_or_string('european'), (), [])
     actual = self.similarity_scorer.GetSimilarity(
         tree_pattern1, tree_pattern2)
     self.assertEqual(1, len(actual))
     expected = [
         Similarity(self.kScore, 'hypernym', tree_pattern1, tree_pattern2)
     ]
     self.assertListEqual(expected, actual)
Esempio n. 28
0
 def test_NodeToLeafSimilar(self):
     """Expects one 'synonym' similarity between the pattern at path (1,)
     of the first tree and the pattern at path (0,) of the second."""
     tree_pattern1 = TreePattern(
         tree_or_string('(is (italian the) smart)'), (1, ), [])
     tree_pattern2 = TreePattern(
         tree_or_string('(french bright)'), (0, ), [])
     actual = self.similarity_scorer.GetSimilarity(
         tree_pattern1, tree_pattern2)
     self.assertEqual(1, len(actual))
     expected = [
         Similarity(self.kScore, 'synonym', tree_pattern1, tree_pattern2)
     ]
     self.assertListEqual(expected, actual)
Esempio n. 29
0
 def GetSimilar(self, src_pattern):
   """Produces linguistic variations of a string pattern.

   The leaves are joined with spaces into a phrase, which is passed to
   ``ObtainLinguisticRelationships``. Each (relation, lemma) pair it yields
   becomes a Similarity of score ``self.kLinguisticVariation`` whose target
   is a string TreePattern holding the lemma.

   Returns:
     A list of similarities; empty if the source pattern is not a string.
   """
   # TODO: To support linguistic similarities between tree patterns, the
   # yield (minus variables on leaves) could be concatenated to produce
   # (possibly gapped) phrases.
   if not src_pattern.IsString():
     return []
   phrase = ' '.join(src_pattern.GetLeaves())
   return [
     Similarity(self.kLinguisticVariation, relation, src_pattern,
                TreePattern(lemma, (), []))
     for relation, lemma in ObtainLinguisticRelationships(phrase)
   ]
Esempio n. 30
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """Gives zero cost to a pair of single-leaf entity patterns.

     Both patterns must be strings with exactly one leaf each, and both
     leaves must start with '@'.

     Returns:
       A one-element list with a Similarity of cost 0.0 labelled
       ``self.relation`` when all conditions hold, otherwise an empty list.
     """
     if not tree_pattern1.IsString() or not tree_pattern2.IsString():
         return []
     src_leaves = tree_pattern1.GetLeaves()
     trg_leaves = tree_pattern2.GetLeaves()
     if not (len(src_leaves) == 1 and len(trg_leaves) == 1):
         return []
     src_leaf = src_leaves[0]
     trg_leaf = trg_leaves[0]
     if src_leaf.startswith('@') and trg_leaf.startswith('@'):
         return [Similarity(0.0, self.relation, tree_pattern1, tree_pattern2)]
     return []