def MakeDeletingRule(self):
    """
    Replace fully lexicalized level-1 LHS branches by deleting variables.

    If the LHS does not produce any leaf but RHS does, such rule can be
    considered as a leaf-deleting rule. It is not clear when lexicalized
    branches should be replaced by a deleting variable (it depends on the
    application). Here we replace fully lexicalized branches at level 1 by
    a deleting variable, only when the RHS does not contain any leaf that
    is not a variable.

    The rule is modified in place (self.lhs and self.lhs_vars_to_paths)
    and self is returned in every case.
    """
    # A string LHS has no branches to replace.
    if IsString(self.lhs):
        return self
    # Any non-variable leaf in the RHS disqualifies the rule.
    if IsString(self.rhs):
        if not IsVariable(self.rhs):
            return self
    else:
        rhs_leaves = self.rhs.leaves()
        if any(not IsVariable(leaf) for leaf in rhs_leaves):
            return self
    # Indices of the level-1 LHS branches that already hold a variable.
    lhs_paths_prefix_1 = set(
        p[0] for p in self.lhs_vars_to_paths.values())
    if not lhs_paths_prefix_1:
        return self
    # Fresh variables are numbered with a plain counter so that there is
    # no upper bound on how many branches can be replaced.
    num_fresh = 0
    for i, branch in enumerate(self.lhs):
        if i in lhs_paths_prefix_1:
            continue
        fresh_var = '?xx%d|' % num_fresh
        num_fresh += 1
        if not IsString(branch):
            # Keep the label of the replaced branch as the variable type.
            fresh_var += get_top(branch)
        self.lhs[i] = fresh_var
        self.lhs_vars_to_paths[fresh_var] = (i,)
    return self
def get_statements_from_ldcsc(ldcsc, var_counter=0):
    """
    Recursively collect the statements encoded by a constituent tree.

    Non-Tree inputs and trees labelled 'DATE' or 'NUMBER' contribute no
    statements. When the first child is a string that is not an operator,
    it is used as predicate (with '!' stripped) of one Statement linking
    the variable '?x<var_counter>' to the second child; a leading '!' on
    the predicate swaps subject and object. The counter is then bumped
    and the remaining children are processed recursively.
    """
    if not isinstance(ldcsc, Tree):
        return []
    if get_top(ldcsc) in ['DATE', 'NUMBER']:
        return []
    collected = []
    head = ldcsc[0]
    head_is_str = IsString(head)
    if head_is_str and not is_operator(head):
        this_var = '?x' + str(var_counter)
        relation = head.strip('!')
        argument = ldcsc[1]
        if IsString(argument):
            target = argument
        else:
            arg_top = get_top(argument)
            if arg_top == 'DATE':
                target = '?d0'
                collected.extend(get_statements_from_date(argument, '?d0'))
            elif arg_top == 'NUMBER':
                target = get_number_from_constituent(argument)
            else:
                # The argument is a subtree: refer to the variable that
                # the recursive call below will introduce for it.
                target = '?x' + str(var_counter + 1)
        if head.startswith('!'):
            triple = Statement(target, relation, this_var)
        else:
            triple = Statement(this_var, relation, target)
        collected.append(triple)
        var_counter += 1
    # A string head is a predicate/operator, not a subtree to descend into.
    first_child = 1 if head_is_str else 0
    for child in ldcsc[first_child:]:
        collected.extend(get_statements_from_ldcsc(child, var_counter))
    return collected
def GetURIsFromRules(rules):
    """
    Return the set of non-variable RHS symbols used by the given rules.

    A string RHS contributes itself; a tree RHS contributes each of its
    leaves. Variables (as decided by IsVariable) are left out.
    """
    found = set()
    for rule in rules:
        rhs = rule.rhs
        if IsString(rhs):
            if not IsVariable(rhs):
                found.add(rhs)
        else:
            found.update(
                leaf for leaf in rhs.leaves() if not IsVariable(leaf))
    return found
def GetSimilarity(self, tree1, tree2):
    """
    Return a single Similarity when both inputs have the same string form.

    Strings are compared as they are, anything else through repr(). The
    result is an empty list when the two forms differ.
    """
    form1 = tree1 if IsString(tree1) else repr(tree1)
    form2 = tree2 if IsString(tree2) else repr(tree2)
    if form1 == form2:
        return [Similarity(self.kCost, self.kDefaultState, tree1, tree2)]
    return []
def __repr__(self):
    """
    Multi-line description of the rule: state, lhs, rhs, newstates, weight.

    String sides are utf-8 encoded; tree sides are rendered with repr().
    """
    if IsString(self.lhs):
        lhs_text = self.lhs.encode('utf-8')
    else:
        lhs_text = repr(self.lhs)
    if IsString(self.rhs):
        rhs_text = self.rhs.encode('utf-8')
    else:
        rhs_text = repr(self.rhs)
    template = ("<rule.\n state: {0}\n lhs: {1}\n rhs: {2}\n"
                " newstates: {3}\n weight: {4}>")
    return template.format(
        self.state, lhs_text, rhs_text, self.newstates, self.weight)
def GetSimilarity(self, tree1, tree2):
    """
    Score two inputs by the relative difference of their subtree counts.

    A string counts as zero subtrees. The weight is
    |n1 - n2| / max(n1, n2), or 0.0 when both counts are zero. It is
    returned as a one-element list holding a 'nodes_difference' Similarity.
    """
    size1 = 0 if IsString(tree1) else tree1.GetNumSubtrees()
    size2 = 0 if IsString(tree2) else tree2.GetNumSubtrees()
    if size1 == 0 and size2 == 0:
        # Avoid dividing by zero when neither side has subtrees.
        cost = 0.0
    else:
        cost = float(abs(size1 - size2)) / max(size1, size2)
    return [Similarity(cost, 'nodes_difference', tree1, tree2)]
def StringifyWithoutWeight(self):
    """
    Return the rule description without its weight, caching the result.

    The text is built on the first call and stored in self.stringified;
    later calls return the stored value.
    """
    if self.stringified:
        return self.stringified
    if IsString(self.lhs):
        lhs_text = self.lhs.encode('utf-8')
    else:
        lhs_text = repr(self.lhs)
    if IsString(self.rhs):
        rhs_text = self.rhs.encode('utf-8')
    else:
        rhs_text = repr(self.rhs)
    template = ("<rule.\n state: {0}\n lhs: {1}\n rhs: {2}\n"
                " newstates: {3}>")
    self.stringified = template.format(
        self.state, lhs_text, rhs_text, self.newstates)
    return self.stringified
def GetVariables(self, tree):
    """
    Return the variables found in tree, according to its type.

    TreePattern: delegated to self.MakeVariablesFromTreePattern.
    NLTKTree: variable names from variables_to_paths, cut at the first '|'.
    String: [tree] when it starts with '?x', otherwise [].
    Anything else: None.
    """
    if isinstance(tree, TreePattern):
        return self.MakeVariablesFromTreePattern(tree)
    if isinstance(tree, NLTKTree):
        return [var.split('|')[0] for var, _ in variables_to_paths(tree)]
    if IsString(tree):
        return [tree] if tree.startswith('?x') else []
    return None
def insert_cvt_if_needed(self, tree):
    """
    Wrap tree in an '(ID !<cvt> ...)' node when its predicate has a CVT.

    The CVT is looked up with self.get_cvt_cached on the main predicate
    of the tree. When the lookup returns nothing, tree is returned as is.
    A 'COUNT' tree gets the wrapper inserted below the COUNT node; a tree
    whose first child is itself a tree has its children spliced into the
    wrapper; everything else is wrapped whole.
    """
    predicate = get_main_predicate_from_tree(tree)
    cvt = self.get_cvt_cached(predicate)
    if not cvt:
        return tree
    if not IsString(tree):
        if tree.label() == u'COUNT':
            return tree_or_string(
                '(COUNT (ID !{0} {1}))'.format(cvt, tree[0]))
        if not IsString(tree[0]):
            children = ' '.join(str(child) for child in tree)
            return tree_or_string('(ID !{0} {1})'.format(cvt, children))
    return tree_or_string('(ID !{0} {1})'.format(cvt, tree))
def _GetURIField(uri, field):
    """
    Retrieves information of URIs or words according to the index of
    Freebase.

    Supported fields, checked in this order:
      'numFound'      number of documents found for uri (all documents
                      when uri is '<total>'); 0 on ValueError.
      'uri_type|ARG'  result of GetURIType(uri, ARG).
      'role', 'text'  results of _GetURIRole / _GetURIText.
      anything else   result of _GetFieldFromURI, or None on ValueError.
    For every field after 'uri_type', a non-string uri yields None.

    NOTE(review): the original note says URIs prefixed with "!" are
    stripped to remove that operator; no stripping happens in this
    function, so presumably the helpers do it -- confirm.
    """
    if field == 'numFound':
        try:
            assert _IsConnectionAlive()
            words = [] if uri == '<total>' else [uri]
            return _GetNumDocsFound(words)
        except ValueError:
            return 0
    if field.startswith('uri_type'):
        type_arg = field.split('|')[1]
        return GetURIType(uri, type_arg)
    if not IsString(uri):
        return None
    if field == 'role':
        return _GetURIRole(uri)
    if field == 'text':
        return _GetURIText(uri)
    try:
        return _GetFieldFromURI(uri, field)
    except ValueError:
        return None
def DecodeInputTree(wrtg, nbest, lambda_dcs_str_list):
    """
    Decode the n-best trees of wrtg and keep those with valid answers.

    lambda_dcs_str_list is an output parameter, where we store the valid
    output trees (string representations of lambda-DCS trees). An output
    parameter is used in order to retrieve partial lists in case of
    timeouts.

    Relies on the module-level names cvt_inserter, query_manager and
    invalid_results. Returns None.
    """
    for candidate, weight in wrtg.GenerateNBestTreesMax(nbest):
        if cvt_inserter:
            candidate = cvt_inserter.insert_cvt_if_needed(candidate)
        if IsString(candidate):
            constituent_str = candidate
        else:
            constituent_str = candidate.pprint(margin=10000)
        answers = QueryLambdaDCSC(constituent_str, query_manager)
        logging.info('\nConstituent: {0}\nWeight: {1}'.format(
            constituent_str, weight))
        # Skip trees with no results or with results known to be invalid.
        if answers is None or answers in invalid_results:
            continue
        lambda_dcs_str = ConvertConstituent2DCS(constituent_str)
        logging.info('Found. Weight: {0}\tTransduction: {1}'.format(
            weight, lambda_dcs_str))
        logging.info(u'Answer: {0}'.format(answers))
        lambda_dcs_str_list.append(str(lambda_dcs_str))
    return
def GetBestValidDerivations(
        wrtg, cvt_inserter, nbest=1000, nvalid=100, query_manager=None):
    """
    Collect the first derivations of wrtg with their constituent trees.

    Derivations are taken in the order produced by
    wrtg.ObtainDerivationsFromNT() (after clearing the caches of wrtg).
    At most nbest derivations are explored, and collection stops as soon
    as nvalid have been gathered.

    Returns a list of (derivation, constituent) tuples, where constituent
    is the target projection of the derivation. Returns an empty list
    when no derivation was explored.

    NOTE(review): cvt_inserter and query_manager are accepted for
    interface compatibility but are not used here; no CVT insertion or
    query-based filtering happens in this function.
    """
    # This variable contains the result as a list of tuples.
    valid_derivations = []
    wrtg.ClearCaches()
    derivations = wrtg.ObtainDerivationsFromNT()
    for i, derivation in enumerate(derivations):
        if i >= nbest:
            break
        constituent, _ = TargetProjectionFromDerivation(derivation)
        valid_derivations.append((derivation, constituent))
        # Checked after appending so that at least one derivation is kept
        # and no extra derivation is pulled from the generator.
        if len(valid_derivations) >= nvalid:
            break
    return valid_derivations
def fromldcsc(ldcsc, var_prefs=None):
    """
    Build a Query from a constituent tree.

    var_prefs is a list with the prefixes of the other query variable
    instantiations that we also want to retrieve. E.g. if ['p', 'r'], then
    SELECT DISTINCT ?x0 , ?p0, ?p1, ?r0, ?r1 WHERE { ...

    Returns None when ldcsc is a plain string that neither starts with
    '(' nor ends with ')', or when no statements could be extracted.
    Raises ValueError for any other non-Tree input.
    """
    if not isinstance(ldcsc, Tree):
        if (IsString(ldcsc) and not ldcsc.startswith('(')
                and not ldcsc.endswith(')')):
            return None
        raise ValueError(
            'This method expects a Tree instance. Got type {0} for instance {1}'
            .format(type(ldcsc), ldcsc))
    try:
        statements = get_statements_from_ldcsc(ldcsc)
    except Exception:
        # Best effort: a malformed tree yields no query. Only Exception is
        # caught so that KeyboardInterrupt and SystemExit still propagate.
        logging.warning('Failed to get statements from l-dcsc: {0}'.format(
            str(ldcsc)))
        statements = []
    if not statements:
        return None
    operator = ldcsc[0] if is_operator(ldcsc[0]) else ""
    query_vars = get_query_vars(statements, var_prefs)
    query_str = build_query_str(statements, '?x0', query_vars, operator)
    query = Query(query_str)
    query.query_vars = query_vars
    query.ldcsc = ldcsc
    return query
def BuildTiburonRHS(tree, newstates, path=(), quote_tokens=True):
    """
    Render a rule RHS in Tiburon syntax.

    1. Quote terminals,
    2. Rename variables and remove their types: ?x0|NP becomes x0.
    3. Change bracketing (NP (DT the) (NN house)) would change into
       NP(DT(the) NN(house))
    4. Apply states to variables. (NP (DT ?x0|) ?x1|NN) and
       {(0,0): 'q1', (1,) : 'q2'} would change into NP(DT(q1.x0) q2.x1)

    path is the position of tree within the whole RHS; it is used to look
    up the state of a variable in newstates.
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        rendered_children = [
            BuildTiburonRHS(
                child, newstates, path + (i,), quote_tokens=quote_tokens)
            for i, child in enumerate(tree)
        ]
        return label + '(' + ' '.join(rendered_children) + ')'
    if not IsVariable(tree):
        return ConvertTokenToTiburon(tree, quote=quote_tokens)
    assert tree.startswith('?')
    assert '|' in tree
    assert path in newstates, 'path {0} not in {1}'.format(path, newstates)
    # Drop the leading '?' and everything from the '|' type separator on.
    return newstates[path] + '.' + tree[1:tree.index('|')]
def is_operator(op):
    """Tell whether op is one of the operators "COUNT", "MAX" or "MIN".

    Tree instances are never operators; any other input must be a string.
    """
    if isinstance(op, Tree):
        return False
    assert IsString(op)
    return op in ("COUNT", "MAX", "MIN")
def constituent2dcs(tree):
    '''Convert a constituent structure into a list of DCS tree fragments.

    Non-Tree inputs are returned wrapped in a list. 'NUMBER' and 'DATE'
    nodes become 'number' / 'date' trees; an 'ID' node with two children
    becomes a tree rooted at its predicate; a node with more than two
    children is read as a conjunction ('and') of its arguments. Any other
    tree is returned unchanged, wrapped in a list.
    '''
    if not isinstance(tree, Tree):
        return [tree]
    top = get_top(tree)
    if top == 'NUMBER':
        assert len(tree) == 2
        return [Tree('number', tree[:])]
    if top == 'DATE':
        # tree contains a list with only one element, which is the date
        # joined with underscores. We re-establish the list.
        if IsString(tree[0]):
            date_info = tree[0].split('_')
            try:
                # Explicit loop: the split is only kept when every part
                # is an integer, regardless of whether map() is lazy.
                for part in date_info:
                    int(part)
            except ValueError:
                date_info = [tree[0]]
            return [Tree('date', date_info)]
        return [Tree(get_top(tree[0]),
                     flatten([constituent2dcs(child) for child in tree[1:]]))]
    if top == 'ID' and len(tree) == 2:
        # The first child is the predicate. The rest are the arguments.
        predicate = get_top(tree[0])
        if predicate == 'COUNT':
            predicate = predicate.lower()
        return [Tree(predicate,
                     flatten([constituent2dcs(child) for child in tree[1:]]))]
    if len(tree) > 2:
        # A length greater than 2 is the only signal we have for "and".
        conjunction = Tree(
            'and', flatten([constituent2dcs(child) for child in tree[1:]]))
        return [Tree(get_top(tree[0]), [conjunction])]
    return [tree]
def UnconvertAllPOSFromTiburon(tree):
    """
    Relabel every non-leaf node of tree with UnconvertPOSFromTiburon.

    The tree is modified in place and also returned.
    """
    leaf_positions = set(tree.treepositions('leaves'))
    inner_positions = set(tree.treepositions()) - leaf_positions
    for position in inner_positions:
        node = tree[position]
        assert not IsString(node)
        node.set_label(UnconvertPOSFromTiburon(get_top(node)))
    return tree
def GetLeavePositions(tree):
    """
    Return the positions of the leaves that do not start with '?x'.

    For a string input the only candidate position is the root, ().
    """
    if IsString(tree):
        return [] if tree.startswith(u'?x') else [()]
    return [
        position for position in tree.treepositions('leaves')
        if not tree[position].startswith(u'?x')
    ]
def QueryLambdaDCSC(ldcsc_str, query_manager=None):
    """
    Run the query encoded by a constituent string and return its answers.

    ldcsc_str must be a string; it is parsed with tree_or_string and
    turned into a Query. query_manager defaults to query_manager_global.
    Returns the first field of every result row, or an empty list when no
    query could be built.
    """
    assert IsString(ldcsc_str)
    manager = query_manager_global if query_manager is None else query_manager
    query = Query.fromldcsc(tree_or_string(ldcsc_str))
    if query is None:
        return []
    return [row[0] for row in query.get_results(manager)]
def rule_meets_conds(self, rule, conds):
    """
    Check a rule against a list of textual conditions.

    A condition starting with 'lhs:' applies to rule.lhs, any other one
    to rule.rhs. A condition ending in 'is_var' requires the side to be a
    variable, one ending in 'is_str' requires it to be a string; other
    conditions are ignored. No conditions means the rule passes.
    """
    if not conds:
        return True
    for cond in conds:
        side = rule.lhs if cond.startswith('lhs:') else rule.rhs
        wants_var = cond.endswith('is_var')
        wants_str = cond.endswith('is_str')
        if wants_var and not IsVariable(side):
            return False
        if wants_str and not IsString(side):
            return False
    return True
def GetNewstatesFromRHSInTiburon(rhs_str):
    """
    Given a string representation of a RHS in Tiburon format, return a
    dictionary path -> state for every Tiburon state variable in it.

    The state is the part of the variable before its first '.'. A RHS
    that is a single state variable is stored under the root path ().
    """
    rhs_nltk = tree_or_string(TiburonToStanford(rhs_str))
    if IsString(rhs_nltk):
        if IsTiburonStateVariable(rhs_nltk):
            return {(): rhs_nltk[:rhs_nltk.index('.')]}
        return {}
    newstates = {}
    for path in rhs_nltk.treepositions('leaves'):
        leaf = rhs_nltk[path]
        if IsTiburonStateVariable(leaf):
            newstates[path] = leaf[:leaf.index('.')]
    return newstates
def get_entity_label(entity):
    """
    Return the label of an entity as given by 'fb:type.object.name'.

    Entities containing a space are returned untouched without querying.
    The entity itself is returned when the query gives no results; when
    it gives several, a warning is logged and the first one is returned.
    """
    assert IsString(entity)
    labels = None
    if ' ' not in entity:
        labels = QueryLambdaDCSC(
            u'(ID !fb:type.object.name <{0}>)'.format(entity))
    if not labels:
        return entity
    if len(labels) > 1:
        logging.warning(
            u'More than one label results for entity {0} = {1}'.format(
                entity, ', '.join(labels)))
    return labels[0]
def ConvertConstituent2DCS(constituent_tree):
    """
    Wrapper for constituent2dcs that also accepts a tree string.

    A string input is first parsed with tree_or_string. The conversion
    must yield exactly one fragment, which is re-parsed from its string
    form with tree_or_string and returned.
    """
    if IsString(constituent_tree):
        parsed = tree_or_string(constituent_tree)
    else:
        parsed = constituent_tree
    fragments = constituent2dcs(parsed)
    assert isinstance(fragments, list) and len(fragments) == 1
    return tree_or_string(str(fragments[0]))
def LoadAlignments(alignment_fname):
    """
    Load a filename with the following structure:
      src_tree
      trg_tree
      alignment
      ...
      src_tree
      trg_tree
      alignment
    into a dictionary indexed by a tuple (src_tree_str, trg_tree_str),
    whose values are Alignment objects.

    The file is read as utf-8 and its number of lines must be a multiple
    of 3.
    """
    with codecs.open(alignment_fname, 'r', 'utf-8') as fin:
        lines = fin.readlines()
    assert len(lines) % 3 == 0, \
        'Lines in {0} are not a multiple of 3.'.format(alignment_fname)
    alignments = {}
    for line_no, raw_line in enumerate(lines):
        text = raw_line.strip()
        slot = line_no % 3
        if slot == 0:
            src_tree_str = text
            src_tree = tree_or_string(text)
            src_leaves = [src_tree] if IsString(src_tree) \
                else src_tree.leaves()
        elif slot == 1:
            trg_tree_str = text
            trg_tree = tree_or_string(text)
            trg_leaves = [trg_tree] if IsString(trg_tree) \
                else trg_tree.leaves()
        else:
            # Third line of the triple: the alignment between the leaves
            # of the two trees read just before.
            alignments[(src_tree_str, trg_tree_str)] = Alignment(
                text, src_leaves, trg_leaves)
    return alignments
def get_statements_from_date(ldcsc, var):
    """
    Build the two FILTER clauses that restrict var to the year of a DATE.

    ldcsc must be a 'DATE' node with a single child. The year is the
    integer before the first '_' of that child. Returns an empty list
    when the child is not a string or the year is not an integer.
    """
    assert get_top(ldcsc) == 'DATE' and len(ldcsc) == 1
    raw_date = ldcsc[0]
    if not IsString(raw_date):
        return []
    try:
        year = int(raw_date.split('_')[0])
    except ValueError:
        return []
    lower_bound = \
        'FILTER (xsd:dateTime({0}) >= xsd:dateTime("{1}"^^xsd:datetime)) .'
    upper_bound = \
        'FILTER (xsd:dateTime({0}) < xsd:dateTime("{1}"^^xsd:datetime)) .'
    return [lower_bound.format(var, year), upper_bound.format(var, year + 1)]
def get_main_predicate_from_tree(tree):
    """
    Given a constituent representation of a sparql query, return the main
    predicate (as in lambda-DCS), which is the left-most leaf. If the
    left-most leaf is the "COUNT" operator (in any letter case), the leaf
    immediately on its right is returned instead. A string input is its
    own predicate.
    """
    if IsString(tree):
        return tree
    leaves = tree.leaves()
    assert leaves
    leftmost = leaves[0]
    return leaves[1] if leftmost.lower() == 'count' else leftmost
def GetTreePattern(tree, subpaths):
    """
    Converts a rule LHS or RHS into a TreePattern.

    The tree attribute of the TreePattern is simply the LHS or RHS tree.
    The path to the root of the TreePattern is (), because we do not have
    the real information on at what level this rule was originally
    extracted (or is being applied). The subpaths of the TreePattern are
    the relative paths of the variables, in sorted order. For a string,
    subpaths is ignored: a variable gets [()] and any other string [].
    """
    root = ()
    if not IsString(tree):
        return TreePattern(tree, root, sorted(subpaths))
    variable_paths = [()] if IsVariable(tree) else []
    return TreePattern(tree, root, variable_paths)
def get_query_vars(statements, prefixes):
    """
    Return the sorted, de-duplicated variables that start with a prefix.

    Variables are taken from the subj, rel and obj attributes of every
    non-string statement. prefixes may be None, which selects nothing.
    """
    wanted = () if prefixes is None else tuple(prefixes)
    candidates = []
    for statement in statements:
        if IsString(statement):
            continue
        terms = (statement.subj, statement.rel, statement.obj)
        candidates.extend(term for term in terms if is_var(term))
    # str.startswith accepts a tuple and is False for an empty one.
    return sorted(set(v for v in candidates if v.startswith(wanted)))
def BuildTiburonLHS(tree, quote_tokens=True):
    """
    Render a rule LHS in Tiburon syntax.

    1. Quote terminals,
    2. Rename variables ?x0|NP -> x0:NP
    3. Change bracketing (NP (DT the) (NN house)) -> NP(DT(the) NN(house))
    """
    if not IsString(tree):
        label = ConvertPOSToTiburon(get_top(tree))
        rendered_children = [
            BuildTiburonLHS(child, quote_tokens=quote_tokens)
            for child in tree
        ]
        return label + '(' + ' '.join(rendered_children) + ')'
    if IsVariable(tree):
        return ConvertVarToTiburon(tree)
    return ConvertTokenToTiburon(tree, quote=quote_tokens)
def get_alignment(pair_tt, entities_lex, predicates_lex):
    """
    Obtains the alignment between leaves of source and target tree.

    @pair_tt is a tuple (src_tree, trg_tree); both items are parsed with
    tree_or_string, and an item that parses to a plain string is treated
    as a single leaf.
    @entities_lex and @predicates_lex are lexicons that are passed
    through to align().

    Returns a list with one entry per target leaf, as produced by align()
    and post-processed by fix_unaligned().
    """
    assert len(pair_tt) == 2
    src_tree, trg_tree = [tree_or_string(item) for item in pair_tt]
    src_leaves = [src_tree] if IsString(src_tree) else src_tree.leaves()
    trg_leaves = [trg_tree] if IsString(trg_tree) else trg_tree.leaves()
    # This is a list of lists. Each list will have source word indices.
    alignments = [
        align(trg_leaf, src_leaves, entities_lex, predicates_lex)
        for trg_leaf in trg_leaves
    ]
    return fix_unaligned(alignments, trg_leaves)