Example #1
0
 def GetScoreSimilarity(self, src_treep, trg_treep):
   """
   Score a (source, target) pair by the reciprocal rank of the target URI.

   The URI is the last target leaf that is neither a variable nor '[]',
   with leading '!' characters stripped. Candidates are obtained from
   self.GetURIs using the non-variable source leaves.

   Returns 1.0 / (position + 1), where position is the 0-based index of
   the URI in the candidate list, or None if the URI is not a candidate.
   """
   src_words = [w for w in GetLeaves(src_treep) if not IsVariable(w)]
   trg_words = []
   for w in GetLeaves(trg_treep):
     if IsVariable(w) or w == '[]':
       continue
     trg_words.append(w)
   uri = trg_words[-1].lstrip('!')
   candidates = self.GetURIs(src_words, filterq=self.filterq, k=self.kgen)
   try:
     rank = candidates.index(uri) + 1
   except ValueError:
     # The URI was not retrieved for these source words.
     return None
   return 1.0 / rank
Example #2
0
 def IsEligible(self, src_treep, trg_treep):
   """
   Decide whether a (source, target) pair can be handled.

   The source must have no variables and at most
   self.max_src_phrase_length non-variable leaves. When a target is given
   (not None), it must have at most self.max_trg_phrase_length
   non-variable leaves.
   """
   if src_treep.HasVariables():
     return False
   num_src = len([w for w in GetLeaves(src_treep) if not IsVariable(w)])
   if not num_src <= self.max_src_phrase_length:
     return False
   if trg_treep is None:
     # Nothing to check on the target side.
     return True
   num_trg = len([w for w in GetLeaves(trg_treep) if not IsVariable(w)])
   if num_trg > self.max_trg_phrase_length:
     return False
   return True
Example #3
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """
     Return a single 'tree_size' Similarity whose cost grows with the size
     of both tree patterns.

     The size of a pattern is its number of inner nodes plus its number of
     leaves; the cost is the sum of the squared sizes.
     """
     sizes = []
     # Gather inner nodes of both patterns first, then leaves, then sizes.
     inner_nodes = [tp.GetInnerNodes() for tp in (tree_pattern1, tree_pattern2)]
     leaves = [GetLeaves(tp) for tp in (tree_pattern1, tree_pattern2)]
     for nodes, tree_leaves in zip(inner_nodes, leaves):
         sizes.append(len(nodes) + len(tree_leaves))
     # Sizes are sums of len() results, hence never negative.
     cost = sizes[0]**2 + sizes[1]**2
     return [Similarity(cost, 'tree_size', tree_pattern1, tree_pattern2)]
Example #4
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """
     Look up the pair of phrases spelled by two string patterns in
     self.dictionary.

     Each phrase is the lowercased, underscore-joined sequence of leaves.
     Returns a one-element list with a 'dictionary' Similarity whose cost
     is the dictionary entry, or an empty list when either pattern is not
     a string or the pair is not in the dictionary.
     """
     both_strings = tree_pattern1.IsString() and tree_pattern2.IsString()
     if not both_strings:
         return []
     key = (
         '_'.join(GetLeaves(tree_pattern1)).lower(),
         '_'.join(GetLeaves(tree_pattern2)).lower(),
     )
     if key not in self.dictionary:
         return []
     return [
         Similarity(self.dictionary[key], 'dictionary', tree_pattern1,
                    tree_pattern2)
     ]
Example #5
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
     """
     Return a single unlabelled Similarity between two tree patterns.

     If both patterns are strings the cost is self.kSubstitutionCost.
     Otherwise the cost is the number of distinct leaves of the first
     pattern times self.kDeletionCost plus the number of distinct leaves
     of the second pattern times self.kInsertionCost.
     """
     if tree_pattern1.IsString() and tree_pattern2.IsString():
         weight = self.kSubstitutionCost
     else:
         # Repeated leaves are counted once (sets).
         num_deleted = len(set(GetLeaves(tree_pattern1)))
         num_inserted = len(set(GetLeaves(tree_pattern2)))
         weight = (num_deleted * self.kDeletionCost
                   + num_inserted * self.kInsertionCost)
     return [Similarity(weight, None, tree_pattern1, tree_pattern2)]
Example #6
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """
   Return a single Similarity, labelled self.relation, between two
   tree patterns.

   If both patterns are strings the cost is self.kSubstitutionCost.
   Otherwise leaves are paired up as substitutions; surplus leaves of the
   first pattern count as deletions and surplus leaves of the second
   pattern as insertions, each weighted by its own cost.
   """
   if tree_pattern1.IsString() and tree_pattern2.IsString():
     cost = self.kSubstitutionCost
   else:
     n1 = len(GetLeaves(tree_pattern1))
     n2 = len(GetLeaves(tree_pattern2))
     n_subs = min(n1, n2)
     # n_subs never exceeds n1 or n2, so neither difference is negative.
     n_dels = n1 - n_subs
     n_ins = n2 - n_subs
     cost = (n_subs * self.kSubstitutionCost
             + n_dels * self.kDeletionCost
             + n_ins * self.kInsertionCost)
   return [Similarity(cost, self.relation, tree_pattern1, tree_pattern2)]
Example #7
0
 def GetCostSimilarity(self, src_treep, trg_treep):
   """
   Compute a cost for mapping the source pattern to the target URI.

   The URI is the last target leaf that is neither a variable nor '[]',
   with leading '!' characters stripped. Candidates are obtained from
   self.GetURIs using the non-variable source leaves.

   The base cost is 1 - 1 / (position + 1), where position is the 0-based
   index of the URI in the candidate list. It is multiplied by the total
   number of retained source and target leaves, and self.extra_cost is
   added when more than one target leaf was retained. Returns None if the
   URI is not among the candidates.
   """
   src_words = [w for w in GetLeaves(src_treep) if not IsVariable(w)]
   trg_words = []
   for w in GetLeaves(trg_treep):
     if IsVariable(w) or w == '[]':
       continue
     trg_words.append(w)
   uri = trg_words[-1].lstrip('!')
   candidates = self.GetURIs(src_words, filterq=self.filterq, k=self.krecog)
   try:
     position = candidates.index(uri)
   except ValueError:
     # The URI was not retrieved for these source words.
     return None
   cost = 1.0 - 1.0 / (position + 1)
   cost *= len(src_words) + len(trg_words)
   if len(trg_words) > 1:
     cost += self.extra_cost
   return cost
 def get_trg_words_from_treep(self, trg_treep):
     """
     Join the non-variable leaves of the target pattern with spaces and
     return the result of self.get_target_words on that string.
     """
     words = []
     for leaf in GetLeaves(trg_treep):
         if not IsVariable(leaf):
             words.append(leaf)
     return self.get_target_words(' '.join(words))
 def get_src_words_from_treep(self, src_treep):
     """
     Join the non-variable leaves of the source pattern with spaces and
     return the result of self.get_index_words on that string (with None
     as its second argument).
     """
     words = []
     for leaf in GetLeaves(src_treep):
         if not IsVariable(leaf):
             words.append(leaf)
     return self.get_index_words(' '.join(words), None)
Example #10
0
 def BuildTrgTreePatterns(self, src_treep):
   """
   Build one target tree pattern of the form (ID [] <uri>) for every URI
   candidate that self.GetURIs returns for the source leaves.
   """
   uri_candidates = self.GetURIs(GetLeaves(src_treep), k=self.kgen)
   # All patterns share the same (empty) path and subpaths objects.
   path, subpaths = (), []
   trg_treeps = []
   for uri in uri_candidates:
     tree = tree_or_string(u'(ID [] {0})'.format(uri))
     trg_treeps.append(TreePattern(tree, path, subpaths))
   return trg_treeps
Example #11
0
 def BuildTrgTreePatterns(self, src_treep):
   """
   Build one target tree pattern per URI candidate that self.GetURIs
   returns for the source leaves; each pattern wraps the bare URI.
   """
   uri_candidates = self.GetURIs(GetLeaves(src_treep), k=self.kgen)
   # All patterns share the same (empty) path and subpaths objects.
   path, subpaths = (), []
   trg_treeps = []
   for uri in uri_candidates:
     trg_treeps.append(TreePattern(tree_or_string(uri), path, subpaths))
   return trg_treeps
Example #12
0
 def IsEligible(self, src_treep, trg_treep):
   """
   Decide whether a (source, target) pair can be handled.

   The source must pass self.IsEligibleSrc. When a target is given (not
   None), it must have exactly self.trg_phrase_length non-variable leaves
   and at most one variable.
   """
   if not self.IsEligibleSrc(src_treep):
     return False
   if trg_treep is None:
     # Nothing to check on the target side.
     return True
   trg_words = [w for w in GetLeaves(trg_treep) if not IsVariable(w)]
   num_trg_vars = trg_treep.GetNumVariables()
   wrong_length = len(trg_words) != self.trg_phrase_length
   if wrong_length or num_trg_vars > 1:
     return False
   return True
 def IsEligibleSrc(self, src_treep):
     """
     Decide whether a source pattern can be handled: it must have no
     variables, and filter_tokens must leave between one and
     self.max_src_phrase_length of its leaves.
     """
     if src_treep.HasVariables():
         return False
     tokens = filter_tokens(GetLeaves(src_treep))
     if not tokens:
         # Every leaf was filtered out (or there were none).
         return False
     too_long = not len(tokens) <= self.max_src_phrase_length
     return False if too_long else True
 def IsEligible_(self, src_treep, trg_treep):
     """
     Decide whether a (source, target) pair can be handled.

     The source tree pattern must not contain any variable (hence, no
     variables in the target tree pattern either) and must have at most
     self.max_src_phrase_length leaves. When a target is given (not None),
     it must have exactly self.trg_phrase_length non-variable leaves.
     """
     if src_treep.HasVariables():
         return False
     num_src = len(GetLeaves(src_treep))
     if not num_src <= self.max_src_phrase_length:
         return False
     if trg_treep is None:
         # Nothing to check on the target side.
         return True
     num_trg = len([w for w in GetLeaves(trg_treep) if not IsVariable(w)])
     if not num_trg == self.trg_phrase_length:
         return False
     return True
Example #15
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """
   Return one Similarity per linguistic relationship found between the
   phrases spelled by two string tree patterns.

   Each phrase is the underscore-joined sequence of the pattern's leaves,
   in leaf order. Every relation returned by LinguisticRelationship for
   the two phrases yields a Similarity with cost self.kLinguisticVariation.
   Returns an empty list if either pattern is not a string or if no
   relationship is found.
   """
   if not (tree_pattern1.IsString() and tree_pattern2.IsString()):
     return []
   # Join the leaves as an ordered list. Passing them through set() first
   # would drop repeated words and give an arbitrary word order, so the same
   # pattern could produce different phrases.
   phrase1 = '_'.join(GetLeaves(tree_pattern1))
   phrase2 = '_'.join(GetLeaves(tree_pattern2))
   return [
     Similarity(self.kLinguisticVariation, relation,
                tree_pattern1, tree_pattern2)
     for relation in LinguisticRelationship(phrase1, phrase2)]
Example #16
0
def FilterOutRulesWithCVT(rules):
    """
    Return the rules whose right-hand side mentions no CVT.

    A rule is dropped when any leaf of its rhs that is neither a variable
    nor an operator is found in `cvts` after stripping leading '!'
    characters. The relative order of the kept rules is preserved.
    """
    def mentions_cvt(rule):
        return any(
            leaf.lstrip('!') in cvts
            for leaf in GetLeaves(rule.rhs)
            if not (IsVariable(leaf) or IsOperator(leaf)))

    return [rule for rule in rules if not mentions_cvt(rule)]
Example #17
0
 def GetSimilarity(self, src_treep, trg_treep):
   """
   Score how well the source phrase matches the target URI by n-gram ratio.

   The URI is the last target leaf. When the target has a single leaf,
   the relation is whatever self.GetURIRole returns for the URI (no
   similarity is produced if it returns nothing); otherwise the relation
   is self.bridge_relation and self.extra_cost is added.

   Assuming the best possible cost to be 1.0 for the transformation of each
   source or target token, this cost function cannot give costs below that.
   In case the ngram ratio is 1.0 (perfect match of source into target; note
   it is asymmetric), then the cost to transform each token will be
   (2.1 - ngram_ratio) = 1.1. Lower ngram ratios will give higher costs.
   The per-token cost is multiplied by the number of source leaves plus one.
   The minimum ngram ratio that we consider for a mapping to be eligible
   is self.ngram_min_ratio.

   Entity and bridge relations additionally require the source pattern to
   pass IsPlausibleEntityPhrase. Returns a list with zero or one Similarity.
   """
   if not self.IsEligible(src_treep, trg_treep):
     return []
   all_src_leaves = GetLeaves(src_treep)
   all_trg_leaves = GetLeaves(trg_treep)
   uri = all_trg_leaves[-1]
   # Leaf counts are taken before any splitting or filtering.
   src_count = len(all_src_leaves)
   trg_count = len(all_trg_leaves)
   uri_tokens = SplitLeavesBy([uri], self.trg_token_separators)
   src_tokens = filter_tokens(all_src_leaves)
   trg_tokens = filter_tokens(uri_tokens)
   ngram_ratio = get_ngram_ratio(src_tokens, trg_tokens)
   if not ngram_ratio >= self.ngram_min_ratio:
     return []
   cost = (2.1 - ngram_ratio) * (src_count + 1)
   if trg_count == 1:
     relation = self.GetURIRole(uri)
     if not relation:
       return []
   else:
     cost += self.extra_cost
     relation = self.bridge_relation
   entity_like = relation in [self.entity_relation, self.bridge_relation]
   if entity_like and not IsPlausibleEntityPhrase(src_treep):
     return []
   return [Similarity(cost, relation, src_treep, trg_treep)]
Example #18
0
    def GetSimilarity(self, tree_pattern1, tree_pattern2):
        """
        Score a partial dictionary match between source and target words.

        The cost associated to a partial match when checking the source and
        the target sides is equivalent to the worst cost (max cost) among
        all entries where there is a partial match. The rationale is that
        this cost function should not have preference over the exact match
        implemented in DictionaryCost().

        Returns a list with one Similarity labelled self.relation, or an
        empty list if the pair is not eligible or no bilingual phrase index
        is shared by a source word and a target word.
        """
        if not self.IsEligible(tree_pattern1, tree_pattern2):
            return []
        src_leaves = GetLeaves(tree_pattern1)
        trg_leaves = [leaf.lstrip('!') for leaf in GetLeaves(tree_pattern2)]
        # Break leaves into words on the configured token separators.
        src_words = SplitLeavesBy(src_leaves, self.src_token_separators)
        trg_words = SplitLeavesBy(trg_leaves, self.trg_token_separators)
        if self.lowercase:
            src_words = [w.lower() for w in src_words]
            trg_words = [w.lower() for w in trg_words]
        # Indices of bilingual phrases in which at least one source word
        # appears, and likewise for target words.
        src_indices = {
            idx for w in src_words for idx in self.get_src_index(w)}
        trg_indices = {
            idx for w in trg_words for idx in self.trg_index.get(w, [])}

        shared = src_indices & trg_indices
        if not shared:
            return []

        cost = self.GetSimilarityCost(src_words, trg_words, shared)
        return [Similarity(cost, self.relation, tree_pattern1, tree_pattern2)]
Example #19
0
 def BuildTrgTreePatterns(self, src_treep):
   """
   Build target tree patterns for every URI candidate of the source leaves.

   Each URI returned by self.GetURIs contributes two candidates: the URI
   itself and the URI prefixed with '!'. If the source pattern has
   variables, each candidate is wrapped as (ID <uri> ?x0|); otherwise the
   bare candidate is used.
   """
   direct_uris = self.GetURIs(
     GetLeaves(src_treep), filterq=self.filterq, k=self.kgen)
   # Keep each URI immediately followed by its '!'-prefixed variant.
   uri_candidates = [
     candidate for uri in direct_uris for candidate in (uri, '!' + uri)]
   # All patterns share the same (empty) path and subpaths objects.
   path, subpaths = (), []
   if src_treep.HasVariables():
     expressions = [u'(ID {0} ?x0|)'.format(uri) for uri in uri_candidates]
   else:
     expressions = uri_candidates
   return [TreePattern(tree_or_string(expression), path, subpaths)
           for expression in expressions]
Example #20
0
    def GetSimilar(self, src_tree_pattern):
        """
        Return up to self.n_best similarities for the source pattern,
        ordered by decreasing score.

        The source leaves are split into words (lowercased when
        self.lowercase is set); the indices of all bilingual phrases
        containing at least one of those words are passed to
        self.MakeSimilar. Returns an empty list if the source pattern does
        not pass self.IsEligibleSrc.
        """
        if not self.IsEligibleSrc(src_tree_pattern):
            return []
        src_words = SplitLeavesBy(
            GetLeaves(src_tree_pattern), self.src_token_separators)
        if self.lowercase:
            src_words = [w.lower() for w in src_words]
        # Indices of bilingual phrases in which at least one source word
        # appears.
        src_indices = {
            idx for w in src_words for idx in self.get_src_index(w)}

        candidates = self.MakeSimilar(src_tree_pattern, src_words,
                                      src_indices)
        ranked = sorted(candidates, key=lambda s: s.score, reverse=True)
        return ranked[:self.n_best]
Example #21
0
 def GetSimilar(self, tree_pattern1):
     """
     Return a list of Similarity objects for the given tree pattern.

     When the pattern has between one and self.max_phrase_length leaves,
     the entity, unary and binary lexicons are queried through
     self.GetLexicon, and every (lex, lex_type) entry yields a zero-cost
     Similarity towards a new TreePattern built from lex.
     """
     tree1_leaves = GetLeaves(tree_pattern1)
     num_tree1_leaves = len(tree1_leaves)
     if len(tree1_leaves) > self.max_phrase_length or not tree1_leaves:
         # Phrase is empty or too long: the cost is the number of leaves.
         cost = num_tree1_leaves
         # NOTE(review): tree_pattern2 is only assigned in the else branch
         # below, so this branch raises UnboundLocalError when reached.
         # The intended target pattern is not evident from this code --
         # TODO confirm and fix.
         similarities = [
             Similarity(cost, 'q0', tree_pattern1, tree_pattern2)
         ]
     else:
         entities = self.GetLexicon(tree1_leaves, 'entity')
         unary_predicates = self.GetLexicon(tree1_leaves, 'unary')
         binary_predicates = self.GetLexicon(tree1_leaves, 'binary')
         # Entities first, then unary predicates, then binary predicates.
         lexicon = entities + unary_predicates + binary_predicates
         similarities = []
         cost = 0.0
         for lex, lex_type in lexicon:
             path, subpaths = (), []
             tree_pattern2 = TreePattern(lex, path, subpaths)
             similarity = Similarity(cost, lex_type, tree_pattern1,
                                     tree_pattern2)
             similarities.append(similarity)
     return similarities
Example #22
0
def GetOutputVocabulary(rules):
    """
    Return the set of leaves found in the right-hand sides of the rules,
    excluding leaves that start with '?x'.
    """
    vocabulary = set()
    for rule in rules:
        for leaf in GetLeaves(rule.rhs):
            if leaf.startswith('?x'):
                continue
            vocabulary.add(leaf)
    return vocabulary
Example #23
0
 def GetSimilarity(self, tree_pattern1, tree_pattern2):
   """
   Return a single unlabelled Similarity whose cost is the number of
   distinct leaves of the first pattern times self.kDeletionCost plus the
   number of distinct leaves of the second pattern times
   self.kInsertionCost.
   """
   # Repeated leaves are counted once (sets).
   num_deleted = len(set(GetLeaves(tree_pattern1)))
   num_inserted = len(set(GetLeaves(tree_pattern2)))
   weight = (num_deleted * self.kDeletionCost
             + num_inserted * self.kInsertionCost)
   return [Similarity(weight, None, tree_pattern1, tree_pattern2)]