def addNonTerminal(nodes,node):
    """Record the child sequence of ``node`` on its non-terminal entry.

    The non-terminal is named ``Cat[_Rule][_ClType]`` from the attributes of
    ``node``.  If ``whichNode`` does not find that name in ``nodes`` a new
    ``NonTerminal`` is appended first.  The labels of the direct children are
    then passed to that entry's ``addPath``.

    Arguments:
    nodes -- list of grammar nodes; may be appended to.
    node -- XML element whose attributes and direct children are read.
    """
    feature_keys = ('Person', 'Tense', 'Voice', 'Mood',
                    'Case', 'Number', 'Gender', 'Degree')

    def label(element):
        # Cat, followed by _Rule and _ClType only when those attributes exist.
        text = str(element.get('Cat'))
        for attr in ('Rule', 'ClType'):
            if element.get(attr) is not None:
                text += '_' + str(element.get(attr))
        return text

    name = label(node)
    nodeNumber = whichNode(name, nodes)
    if nodeNumber == -1:
        nodes.append(NonTerminal(name))
        nodeNumber = len(nodes) - 1

    children = []
    # Iterate the element directly: Element.getchildren() no longer exists
    # in Python 3.9+, and direct iteration yields the same children.
    for child in node:
        if 'Unicode' in child.keys():
            # Children carrying a 'Unicode' attribute are written as
            # '{Cat_features}'.
            features = FeatStruct()
            for key in child.keys():
                if key in feature_keys:
                    features = features.unify(
                        FeatStruct('[' + str(key) + '=\'' + str(child.get(key)) + '\']'))
            children.append('{' + str(child.get('Cat')) + '_' + stringFeatures(features) + '}')
        else:
            children.append(label(child))

    nodes[nodeNumber].addPath(children, nodes)
def get_instance_featstructs(root):
    """Return one FeatStruct per instance, each carrying its event.

    Every instance from ``get_instances(root)`` gets an ``'event'`` entry: a
    FeatStruct built from the first event whose ``eid`` equals the instance's
    ``eventID``.  The event structure holds ``class``, ``stem`` and ``text``;
    if that attempt raises ``KeyError`` it is rebuilt with ``class`` and
    ``text`` only.  The returned structures contain every instance attribute
    except ``eventID``.
    """
    instances = get_instances(root)
    events = get_events(root)

    def describe(event, attr_names):
        payload = {attr: event.attrib[attr] for attr in attr_names}
        payload['text'] = event.text
        return FeatStruct(payload)

    def first_matching(instance, attr_names):
        return next(
            describe(event, attr_names)
            for event in events
            if event.get('eid') == instance.get('eventID'))

    for instance in instances:
        try:
            instance.set('event', first_matching(instance, ['class', 'stem']))
        except KeyError:
            # Fall back to the reduced attribute set.
            instance.set('event', first_matching(instance, ['class']))

    result = []
    for instance in instances:
        kept = set(instance.keys()) - set(['eventID'])
        result.append(FeatStruct({attr: instance.attrib[attr] for attr in kept}))
    return result
def smartUnify(*featstructs):
    '''Unifies two or more feature structures based on what they
    have in common.  For example, [person=1,number=2] and
    [case=3,number=2] will return [number=2], not
    [person=1,number=2,case=3] as featStruct.unify() would do.

    Returns None if the shared features cannot be unified.  Called with no
    arguments, returns an empty FeatStruct.

    Arguments:
    *featstructs -- Any number of feature structures to unify.
    '''
    if not featstructs:
        return FeatStruct()

    # Feature names present in every structure.
    shared = set(featstructs[0])
    for struct in featstructs[1:]:
        shared.intersection_update(struct)

    # Unify the shared features from each structure into the result.
    result = FeatStruct()
    for struct in featstructs:
        for name in struct:
            if name in shared:
                result = result.unify(
                    FeatStruct('[' + str(name) + '=' + str(struct[name]) + ']'))
                if result is None:
                    # unify() returned None: give up.
                    return None
    return result
def debug_test_contain():
    """Print what ``test_contain`` returns for two small sample structures.

    ``fs100`` has the single key ``__or_wzq``; ``fs101`` has that key plus
    ``__or_123``.
    """
    fs100 = FeatStruct()
    fs100['__or_wzq'] = 'wzq'
    fs101 = FeatStruct()
    fs101['__or_123'] = '123'
    fs101['__or_wzq'] = 'wzq'
    # Call form of print works on both Python 2 and Python 3.
    print(test_contain(fs100, fs101))
def addTerminal(nodes,node):
    """Append a ``Terminal`` for ``node`` unless ``whichNode`` already finds it.

    The lookup name is ``'{Cat_features}'``; the new Terminal is created from
    the text between the opening brace and the first underscore of that name,
    together with the collected feature structure.
    """
    morph_keys = ['Person', 'Tense', 'Voice', 'Mood',
                  'Case', 'Number', 'Gender', 'Degree']
    features = FeatStruct()
    for key in node.keys():
        if key not in morph_keys:
            continue
        spec = ''.join(['[', str(key), '=\'', str(node.get(key)), '\']'])
        features = features.unify(FeatStruct(spec))

    name = '{' + str(node.get('Cat')) + '_' + stringFeatures(features) + '}'
    if whichNode(name, nodes) != -1:
        return
    cut = name.find('_')
    nodes.append(Terminal(name[1:cut], features))
def match_feature(feature, regexp, operation=0):
    """
    match_feature(feature,regexp,operation=0) -> FeatStruct

    Filter a feature structure by testing its feature names (keys) against
    a regular expression.  The structure should use the XTAG form, with a
    '__value__' entry at the last level of indexing.

    feature: The feature structure to filter
    regexp: A regular expression accepted by the re module; it is searched
    in each feature name
    operation: 0 for positive filtering (entries whose name matches are
    retained), 1 for negative filtering (entries whose name does not match
    are retained).

    For a non-leaf entry: a match under operation 0 keeps the whole
    sub-structure; a non-match under operation 1 keeps the recursively
    filtered sub-structure (an empty FeatStruct if nothing survives); a
    match under operation 1 drops the entry; anything else keeps the
    recursively filtered sub-structure only if it is not None.

    Returns None when no entry was retained at this level.
    """
    new_feature = FeatStruct()
    count = 0
    for i in feature.keys():
        val = feature[i]
        # 'in' replaces dict.has_key(), which does not exist in Python 3.
        is_leaf = '__value__' in val
        matched = re.search(regexp, i) is not None
        if is_leaf:
            if operation == 0 and matched:
                new_feature[i] = val
                count += 1
            elif operation == 1 and not matched:
                new_feature[i] = val
                count += 1
        else:
            if operation == 0 and matched:
                new_feature[i] = val
                count += 1
            elif operation == 1 and not matched:
                ret = match_feature(val, regexp, operation)
                if ret is not None:
                    new_feature[i] = ret
                else:
                    new_feature[i] = FeatStruct()
                count += 1
            elif operation == 1 and matched:
                pass
            else:
                ret = match_feature(val, regexp, operation)
                if ret is not None:
                    new_feature[i] = ret
                    count += 1
    if count == 0:
        return None
    return new_feature
def topic_features(self, article):
    """Return a ``{word: True}`` feature dict for ``article``.

    The words come from ``self.get_word_occurence(article)``; each one
    becomes a key mapped to True.
    """
    # The previous version also built and froze two FeatStruct objects that
    # were never used in the result; that work has been dropped.
    word_occurence = self.get_word_occurence(article)
    return {word: True for word in word_occurence}
def make_fs(lhs,rhs):
    # Build a nested feature structure from a path of feature names.
    # e.g. lhs = ['a','b','c','d'] and rhs = 'wzq' gives
    # [a = [b = [c = [d = [__value__ = 'wzq']]]]]
    result = FeatStruct()
    if len(lhs) != 1:
        # More path left: nest the structure built from the remainder.
        result[lhs[0]] = make_fs(lhs[1:], rhs)
        return result
    # Last path element: wrap the value under '__value__'.
    leaf = FeatStruct()
    leaf['__value__'] = rhs
    result[lhs[0]] = leaf
    return result
def nested_frozen_fs(dictionary):
    """Return a frozen FeatStruct built from ``dictionary``.

    A FeatStruct argument is frozen in place and returned.  Otherwise a new
    FeatStruct is filled from ``dictionary.items()``: set values become
    frozensets, dict values are converted recursively, other values are
    stored unchanged.  The new structure is frozen before it is returned.
    """
    if isinstance(dictionary, FeatStruct):
        dictionary.freeze()
        return dictionary

    frozen = FeatStruct()
    for key, value in dictionary.items():
        if isinstance(value, set):
            value = frozenset(value)
        elif isinstance(value, dict):
            value = nested_frozen_fs(value)
        frozen[key] = value
    frozen.freeze()
    return frozen
def make_rhs_using_or(rhs):
    """
    :param rhs: The right hand side string, which may contain '/'-separated
        alternatives (the 'or' relationship)
    :type rhs: str
    :return: A feature structure with one entry per alternative
    :rtype: FeatStruct

    Each alternative is stored as its own value, under the key that
    make_leaf_str() produces for it.  For example, rhs = 'a/b/c' yields
    three entries whose values are 'a', 'b' and 'c'.
    """
    new_fs = FeatStruct()
    # split() returns a one-element list when there is no '/'.
    for alternative in rhs.split('/'):
        new_fs[make_leaf_str(alternative)] = alternative
    return new_fs
def fourth_pass(xtag_trees):
    """
    fourth_pass() -> list

    For every entry, build a feature dictionary from the (lhs, rhs) pairs in
    entry[1] whose rhs contains no ':' and store it in entry[4].  Each such
    rhs is wrapped in a FeatStruct under '__value__' and added through
    add_two_feature().  The same list object is returned.
    """
    for entry in xtag_trees:
        collected = {}
        for pair in entry[1]:
            lhs = pair[0]
            rhs = pair[1]
            colon = lhs.find(':')
            if rhs.find(':') != -1:
                # rhs containing ':' is not handled in this pass.
                continue
            node_id = lhs[:colon]
            blank = lhs.find(' ')
            leaf = FeatStruct()
            leaf["__value__"] = rhs
            if blank == -1:
                path = (lhs[colon + 2:-1],)
            else:
                path = (lhs[colon + 2:blank], lhs[blank + 1:-1])
            add_two_feature(collected, node_id, leaf, *path)
        entry[4] = collected
    return xtag_trees
def _naive_unify(fstruct1:FeatStruct, fstruct2:FeatStruct):
    """Merge ``fstruct2`` into a shallow copy of ``fstruct1``.

    Two mappings: features present in both are merged recursively, features
    only in ``fstruct2`` are copied over.  Two sequences: the values are
    concatenated (not unified pairwise) and Variable items are dropped; the
    result is a tuple.  Any other combination returns None.
    """
    merged = copy.copy(fstruct1)

    if not (_is_mapping(fstruct1) and _is_mapping(fstruct2)):
        if not (_is_sequence(fstruct1) and _is_sequence(fstruct2)):
            return None
        # Concatenate, then strip variables.
        merged += fstruct2
        return tuple(item for item in merged if not isinstance(item, Variable))

    # Sorting is not required for correctness; it keeps the processing
    # order deterministic.
    for fname, fval2 in sorted(fstruct2.items()):
        if fname in fstruct1:
            merged[fname] = _naive_unify(fstruct1[fname], fval2)
        else:
            merged[fname] = fval2
    return merged
def remove_or_tag(feature):
    """
    :param feature: The feature structure to strip of its '__or_' wrappers
    :type feature: FeatStruct
    :return: A new feature structure with the '__or_' level removed
    :rtype: FeatStruct

    Every entry for which test_leaf() returns True is replaced by the
    '/'-joined values of that leaf; every other entry is processed
    recursively.

    e.g. for fs = [apple = [__or_a = 'a', __or_b = 'b', __or_c = 'c']]
    remove_or_tag(fs) returns [apple = 'a/b/c']
    """
    new_feature = FeatStruct()
    for key in feature.keys():
        entry = feature[key]
        # list() is needed because Python 3 key views cannot be indexed.
        entry_keys = list(entry.keys())
        if test_leaf(entry) == True:
            str_or_removed = entry[entry_keys[0]]
            if len(entry_keys) > 1:
                # A single value is kept as-is; several are joined with '/'.
                str_or_removed = '/'.join(
                    [str_or_removed] + [entry[k] for k in entry_keys[1:]])
            new_feature[key] = str_or_removed
        else:
            new_feature[key] = remove_or_tag(entry)
    return new_feature
def find_probability(list1,list2,targword):
    """Pick a sense file by match count, print it, then try a LinearSVC run.

    For each index ``i`` of ``list1`` a score is appended to ``prob_list``;
    the index of the highest score selects the file
    ``<targword>/Senses00<index>.txt`` (read as UTF-16), whose content is
    printed.  Afterwards a LinearSVC classifier is trained on ``list1`` and
    its accuracy on ``list2`` is printed.

    Nothing is returned.
    """
    # NOTE(review): the original source had lost its indentation; the nesting
    # of the `prob` lines below (once per `i`) is reconstructed - confirm.
    c=0
    prob_list=[]
    sense_word_list=[]
    for i in range(len(list1)):
        for j in range(len(list1[i])):
            for k in range(len(list1[i][j])):
                #print(type(list1[i][j][k]),type(list2[0][0][k]))
                # Only list2[0][0] is ever compared against.
                if(list1[i][j][k]==list2[0][0][k]):
                    c+=1
        # `c` is never reset, so the count accumulates across values of `i`.
        prob=c/(5*len(list1)*len(list1[i]))
        prob_list.append(prob)
    max_prob=max(prob_list)
    # index() returns the first position holding the maximum.
    sensetag=prob_list.index(max_prob)
    with codecs.open(targword + '/Senses00' + str(sensetag) + '.txt', encoding='utf-16') as f:
        sense_word_list.append(f.read())
    print("\n")
    print(sense_word_list)
    #print(sense_word_list[sense_word_list.index('&'):sense_word_list.index('!') ])
    try:
        LinearSVC_classifier = SklearnClassifier(LinearSVC())
        LinearSVC_classifier.train(list1)
        print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, list2)) * 100)
    except:
        # NOTE(review): bare except hides every failure of the block above,
        # and `value` is not defined in the visible source - confirm this
        # fallback can run at all.
        #classifier=nltk.NaiveBayesClassifier.train(FeatStruct('["a","b" , "c"]'))
        print("Classifier accuracy percent :", "{0:.3f}".format(value((FeatStruct('[1,2,3]')))))
def remove_value_tag(feature):
    """Return a copy of ``feature`` with every '__value__' wrapper removed.

    An entry that contains a '__value__' key is replaced by that value; any
    other entry is processed recursively.
    """
    new_feature = FeatStruct()
    for key in feature.keys():
        child = feature[key]
        # 'in' replaces dict.has_key(), which does not exist in Python 3.
        if '__value__' in child:
            new_feature[key] = child['__value__']
        else:
            new_feature[key] = remove_value_tag(child)
    return new_feature
def test_contain(fs1, fs2):
    """
    Compare the key sets of two leaf feature structures.

    :param fs1: The first feature structure to test
    :type fs1: FeatStruct
    :param fs2: The second feature structure to test
    :type fs2: FeatStruct
    :return: 0 if both have the same keys
             1 if the keys of fs1 are a proper subset of those of fs2
             -1 if the keys of fs2 are a proper subset of those of fs1
             FeatStruct (shared keys, values from fs1) on partial overlap
             None if no key is shared
    :rtype: integer/FeatStruct/None
    :raises ValueError: if test_leaf() returns False for either argument

    Only the keys are compared, never the values.

    NOTE(review): earlier documentation said two empty structures compare
    as equal; the code returns None in that case because no key is shared.
    """
    if test_leaf(fs1) == False or test_leaf(fs2) == False:
        raise ValueError('Two arguments must be leaf nodes.')

    keys_1 = fs1.keys()
    keys_2 = fs2.keys()
    shared_1 = [k for k in keys_1 if k in keys_2]
    shared_2 = [k for k in keys_2 if k in keys_1]

    if len(shared_1) == 0 and len(shared_2) == 0:
        return None

    fs1_all_shared = len(keys_1) == len(shared_1)
    fs2_all_shared = len(keys_2) == len(shared_2)
    if fs1_all_shared and fs2_all_shared:
        return 0   # Same key set
    if fs1_all_shared:
        return 1   # fs1 is contained in fs2
    if fs2_all_shared:
        return -1  # fs2 is contained in fs1

    # Partial overlap: return the shared part, taken from fs1.
    overlap = FeatStruct()
    for k in shared_1:
        overlap[k] = fs1[k]
    return overlap
def add_two_feature(features, l_id, rhs, l_feature1, l_feature2=None):
    """Store ``rhs`` under a one- or two-level feature path of ``l_id``.

    With ``l_feature2`` omitted, sets ``features[l_id][l_feature1] = rhs``.
    Otherwise sets ``features[l_id][l_feature1][l_feature2] = rhs``.  Missing
    intermediate levels are created as empty FeatStructs; existing ones are
    reused.  ``features`` is modified in place and nothing is returned.
    """
    # 'in' replaces dict.has_key(), which does not exist in Python 3.
    if l_id not in features:
        features[l_id] = FeatStruct()
    node = features[l_id]

    if l_feature2 is None:
        node[l_feature1] = rhs
        return

    if l_feature1 not in node:
        node[l_feature1] = FeatStruct()
    node[l_feature1][l_feature2] = rhs
    return
def fifth_pass(xtag_trees):
    """
    fifth_pass() -> list

    Continue building the feature dictionary stored in entry[4] by
    fourth_pass().  Only (lhs, rhs) pairs whose rhs contains ':' are
    handled here: the rhs names another feature (id and feature name), and
    the lhs path is made to refer to that same object.  If the referenced
    feature does not exist yet it is created as FeatStruct(__value__='').

    The referenced feature name is printed for each handled pair.  The
    same list object is returned.
    """
    for xtag_entry in xtag_trees:
        features = xtag_entry[4]
        for feature_entry in xtag_entry[1]:
            lhs = feature_entry[0]
            rhs = feature_entry[1]
            l_separator = lhs.find(':')
            r_separator = rhs.find(':')
            if r_separator == -1:
                continue

            l_id = lhs[:l_separator]
            r_id = rhs[:r_separator]
            r_feature = rhs[r_separator + 2:-1]
            # Call form of print works on both Python 2 and Python 3.
            print(r_feature)
            l_space = lhs.find(' ')

            # Make sure features[r_id][r_feature] exists.
            if r_id not in features:
                features[r_id] = FeatStruct()
                features[r_id][r_feature] = FeatStruct(__value__='')
            elif r_feature not in features[r_id]:
                features[r_id][r_feature] = FeatStruct(__value__='')

            if l_space == -1:
                l_feature = lhs[l_separator + 2:-1]
                add_two_feature(features, l_id, features[r_id][r_feature],
                                l_feature)
            else:
                l_feature1 = lhs[l_separator + 2:l_space]
                l_feature2 = lhs[l_space + 1:-1]
                add_two_feature(features, l_id, features[r_id][r_feature],
                                l_feature1, l_feature2)
    return xtag_trees
def create_fact(
    pred: FeatStruct,
    arg_0: Union[List[FeatStruct], FeatStruct],
    arg_1: Union[List[FeatStruct], FeatStruct]
) -> FeatStruct:
    """
    Construct the feature structure that represents a fact.

    Each argument that is not already a list is wrapped in a one-element
    list, and both lists are converted with format_list().

    ex: know(Gael, [Bas, Justine]), return a feature structure of the type :
    [pred=[sem=know], arg0 = [head=Gael, tail=na], arg1 = [head=Bas, tail=[head=Justine, tail=na]]]
    """
    if not isinstance(arg_0, list):
        arg_0 = [arg_0]
    if not isinstance(arg_1, list):
        arg_1 = [arg_1]
    first = format_list(arg_0)
    second = format_list(arg_1)
    return FeatStruct(arg0=first, arg1=second, pred=pred)
def parse_feature_in_catalog(s):
    # This function parses the string in the catalog file, i.e. english.gram
    # with the option 'start-feature', into a FeatStruct. We MUST write
    # separate parsers for different strings from different files, since
    # these features are represented in different forms.
    """
    :param s: The string representing the start feature in the catalog file
    :type s: str
    :return: A feature structure which is the start feature
    :rtype: FeatStruct

    Given the string, this function will return a feature structure parsed
    from that string. The input should be encoded like this:

    <mode> = ind/imp <comp> = nil <wh> = <invlink> <punct term> = per/qmark/excl
    <punct struct> = nil

    All tokens shall be separated by a single space; no comma, period or
    semicolon is used.

    This parser is designed specially for the string from the catalog (i.e.
    english.gram) file. Since there are multiple ways to represent the FS in
    the xtag grammar, we need multiple parsers.
    """
    # token_list is a list of tuples, the elements of which are the LHS and
    # RHS of a feature structure definition, i.e. [('mode','ind/imp'),('comp','nil')]
    token_list = []
    while True:
        equal_sign = s.find('=')
        if equal_sign == -1:
            break
        # Find the text between '=' and the next '<', which is the RHS unless
        # the "<LHS> = <RHS>" form is used. In that form the stripped text
        # between '=' and '<' is an empty string.
        angle_bracket = s.find('<', equal_sign)
        if angle_bracket == -1:
            rhs = s[equal_sign + 1:].strip()
        else:
            rhs = s[equal_sign + 1:angle_bracket].strip()
            if rhs == '':
                # The RHS itself starts with '<': take everything up to the
                # '<' after that one instead.
                angle_bracket = s.find('<', angle_bracket + 1)
                if angle_bracket == -1:
                    rhs = s[equal_sign + 1:].strip()
                else:
                    rhs = s[equal_sign + 1:angle_bracket].strip()
        # Strip the first and last character (the '<' and '>') from the LHS.
        lhs = s[:equal_sign].strip()[1:-1]
        token_list.append((lhs, rhs))
        # When no further '<' was found, angle_bracket is -1 and this keeps
        # only the last character of s.
        s = s[angle_bracket:]

    new_fs = FeatStruct()
    for token in token_list:
        # The LHS is split on whitespace into a path, e.g. 'punct term'
        # becomes ['punct', 'term'].
        add_new_fs(new_fs, token[0].split(), token[1])
    return new_fs
def add_new_fs(fs, lhs, rhs, ref=0):
    """
    :param fs: The feature structure that is extended in place
    :type fs: FeatStruct
    :param lhs: The path of the new node
    :type lhs: list(str)
    :param rhs: The value of the node
    :type rhs: str / Any object
    :param ref: 0 to convert rhs with make_rhs_using_or(), 1 to attach rhs
        unchanged
    :type ref: 0 / 1
    :raises ValueError: if ref is neither 0 nor 1 (checked when the last
        path element is reached)

    Adds the node described by lhs and rhs to fs.  Path elements that
    already exist in fs are reused rather than replaced, so siblings that
    are already present are kept.  Missing intermediate levels are created
    as empty FeatStructs.

    For example, with ref == 0
        lhs = ['a','b','c','d','e']
        rhs = 'wzq'
    produces the path a -> b -> c -> d -> e, whose value is
    make_rhs_using_or('wzq').
    """
    if len(lhs) == 1:
        # ref means reference.  With ref == 0 rhs is processed as a string;
        # with ref == 1 rhs is attached as-is.
        if ref == 0:
            fs[lhs[0]] = make_rhs_using_or(rhs)
        elif ref == 1:
            fs[lhs[0]] = rhs
        else:
            raise ValueError('Undefined ref value %d' % (ref))
    else:
        # 'in' replaces dict.has_key(), which does not exist in Python 3.
        if lhs[0] not in fs:
            fs[lhs[0]] = FeatStruct()
        add_new_fs(fs[lhs[0]], lhs[1:], rhs, ref)
    return
def make_fs(lhs, rhs, ref=0):
    """
    :param lhs: The path on the left hand side
    :type lhs: list(str)
    :param rhs: The right hand side: a string, or any object when ref == 1
    :type rhs: str / object
    :param ref: 0 to convert rhs with make_rhs_using_or(), 1 to store rhs
        untouched
    :type ref: 0 / 1
    :return: A newly constructed feature structure
    :rtype: FeatStruct
    :raises ValueError: if ref is neither 0 nor 1

    Builds a feature structure containing exactly the path given in lhs,
    with the (possibly converted) rhs at its end.

    lhs = ['a','b','c','d']
    rhs = 'wzq'
    ->FeatStruct = [a = [b = [c = [d = [__or_wzq = 'wzq']]]]]
    """
    new_fs = FeatStruct()
    if len(lhs) != 1:
        # Recurse on the rest of the path.
        new_fs[lhs[0]] = make_fs(lhs[1:], rhs, ref)
        return new_fs

    if ref == 0:
        # Not a reference: rhs is a string and gets processed.
        value = make_rhs_using_or(rhs)
    elif ref == 1:
        value = rhs
    else:
        raise ValueError('Undefined ref value %d' % (ref))
    new_fs[lhs[0]] = value
    return new_fs
def debug():
    """Build a small sample structure and print it before and after filtering.

    The sample has a 'nested' entry holding four '__value__' leaves and a
    'single' '__value__' leaf.  It is printed, followed by an empty line and
    the result of remove_value_tag(match_feature(f, 'i', 1)).
    """
    a = FeatStruct()
    b = FeatStruct()
    c = FeatStruct()
    d = FeatStruct()
    a['__value__'] = 'OKWANGZiqi'
    b['__value__'] = 'WANGYunpeng'
    c['__value__'] = "WWA!!!"
    d['__value__'] = 'WZQ(*&YTG'

    e = FeatStruct()
    e['first'] = a
    e['second'] = b
    e['third'] = c
    e['fourth'] = d

    f = FeatStruct()
    f['nested'] = e
    g = FeatStruct()
    g['__value__'] = "WAAAAAAAAH!"
    f['single'] = g

    # Call form of print works on both Python 2 and Python 3.
    print(f)
    print('')
    print(remove_value_tag(match_feature(f, 'i', 1)))
def get_timex_featstructs(root):
    """Return a FeatStruct built from the attributes of each timex.

    The timexes are whatever ``get_timexs(root)`` yields; each one's
    ``attrib`` mapping becomes one FeatStruct, in the same order.
    """
    structs = []
    for timex in get_timexs(root):
        structs.append(FeatStruct(timex.attrib))
    return structs
def __hash__(self):
    """Freeze this object, then return ``FeatStruct.__hash__`` of it."""
    # freeze() runs on every hash request, before the hash is computed.
    self.freeze()
    digest = FeatStruct.__hash__(self)
    return digest
path_1 = get_all_path(fs1) for i in path_1: item_2 = get_element_by_path(fs2, i) if item_2 == None: item_1 = get_element_by_path(fs1, i) add_new_fs(new_fs, i, item_1, 1) if tree1 != None: correct_other_nodes(correction_list, tree1) if tree2 != None: correct_other_nodes(correction_list, tree2) return new_fs fs1 = FeatStruct() fs2 = FeatStruct() fs3 = FeatStruct() fs4 = FeatStruct() fs4['more'] = fs3 fs2['__or_a'] = 'a' fs2['__or_wzq'] = 'wzq' fs2['__or_qwe'] = 'qwe' fs1['apple'] = fs2 fs1['orange'] = fs4 fs3['__or_zxcv'] = 'zxcv' fs3['__or_4567'] = '4567' debug_start_feature = parse_feature_in_catalog( '<mode> = ind/imp <comp> = nil <wh> = <invlink> <punct term> = per/qmark/excl <punct struct> = nil' ) empty_feature = FeatStruct()
class Terminal: def __init__(self,name,value,features): self.name = name self.value = value self.features = features nodes = [] tree = ET.parse('65.xml') root = tree.getroot() for sentence in root.iter('Sentence'): for tree in sentence.iter('Tree'): for node in tree.iter('Node'): if 'Unicode' in node.keys(): features = FeatStruct() for key in node.keys(): if key in ['Person','Tense','Voice','Mood','Case','Number','Gender','Degree']: features = features.unify(FeatStruct('['+str(key)+'=\''+str(node.get(key))+'\']')) nodes.append(Terminal(node.get('Cat'),node.get('Unicode'),features)) else: name = str(node.get('Cat')) + ('_'+ str(node.get('Rule')) if node.get('Rule')!=None else '') + ('_' + str(node.get('ClType')) if node.get('ClType')!=None else '') children = [] for child in node.getchildren(): children.append(child.get('Cat') + ('_'+ str(child.get('Rule')) if child.get('Rule')!=None else '') + ('_' + str(child.get('ClType')) if child.get('ClType')!=None else '')) nodes.append(NonTerminal(name,children)) nodes.append('------') def printCFG(nodes): s = '' for node in nodes:
def special_unify(fs1, fs2, tree1=None, tree2=None):
    """
    :param fs1: One of the feature structures you want to unify
    :type fs1: FeatStruct
    :param fs2: Another feature structure
    :type fs2: FeatStruct
    :param tree1: The tree in which inter-node references should be restored
    :type tree1: TAGTree
    :param tree2: Another tree, optional.
    :type tree2: TAGTree
    :return: The unified feature structure, or None if test_contain()
        reports no intersection for a path present in both inputs
    :rtype: FeatStruct / None

    This function will do a unify just like what the normal unify() does, but
    in addition to a normal unification we also have the following features:

    1. Disjunction is considered, e.g.
    [x = a/b/c] and [x = b/c/d] should yield [x = b/c];
    [x = a/b/c] and [x = a/b] should yield [x = a/b]

    2. The result is a new feature structure, but the leaf nodes are not new;
    the result refers to the leaf nodes in fs1 and fs2

    3. When the path and the value are both the same, a new node is made,
    and the references in the trees given by tree1 and tree2 are then fixed
    through correct_other_nodes() to point to the new node.
    """
    new_fs = FeatStruct()
    # Tuples of (new_entry, item_1, item_2) recorded for equal leaves.
    correction_list = []
    path_2 = get_all_path(fs2)
    # To save time, no path_1
    for i in path_2:
        item_1 = get_element_by_path(fs1, i)
        if item_1 == None:
            # Path only exists in fs2: attach fs2's node.
            item_2 = get_element_by_path(fs2, i)
            add_new_fs(new_fs, i, item_2, 1)  # ref == 1, we only do reference!!
        else:
            item_2 = get_element_by_path(fs2, i)
            tc = test_contain(
                item_1, item_2)  # Single entry is the same as multiple entry
            if tc == 1:  # item_1 is a subset of item_2, we always use the smaller one
                add_new_fs(new_fs, i, item_1, 1)
            elif tc == -1:
                add_new_fs(new_fs, i, item_2, 1)
            elif tc == 0:  # Two entries are the same, we create a new one
                corr_check = search_correction(correction_list, item_1)
                if corr_check == None:
                    new_entry = copy.deepcopy(item_1)
                    # This tuple is used to correct the reference in tree(s).
                    # Enumerating all paths, check whether the id of the value is
                    # equal to either item_1 or item_2; if it is then change the
                    # reference to new_entry
                    correction_tuple = (new_entry, item_1, item_2)
                    correction_list.append(correction_tuple)
                else:
                    # The return value is the new entry already stored
                    new_entry = corr_check
                # Add new reference (new entry or existing entry)
                add_new_fs(new_fs, i, new_entry, 1)
            elif tc == None:
                return None  # Conflict
            # Partial intersection: the return value is a new FeatStruct that
            # only contains the intersection. We do not need to correct this,
            # since it is brand-new
            else:
                add_new_fs(new_fs, i, tc, 1)
                #if i[0] == 'comp': print tc
    # We do not need to check when item_2 != None, because we have already
    # done it in the first loop. In other words, we have processed the overlapping
    # paths, and what is left is to add those in fs1 but not in fs2 into the
    # new feature structure
    path_1 = get_all_path(fs1)
    for i in path_1:
        item_2 = get_element_by_path(fs2, i)
        if item_2 == None:
            item_1 = get_element_by_path(fs1, i)
            add_new_fs(new_fs, i, item_1, 1)

    # Apply the recorded corrections to whichever trees were supplied.
    if tree1 != None:
        correct_other_nodes(correction_list, tree1)
    if tree2 != None:
        correct_other_nodes(correction_list, tree2)

    return new_fs
def featStruct(gapUp, semUp, varUp, arriveFlag=False, destVpFlag=False,
               sourceVpFlag=False, busNameNpFlag=False, destNpFlag=False,
               departFlag=False, departVpFlag=False):
    """Unify the supplied gap/sem/var values with a variable-filled template.

    The template is ``[gap=?gap, sem=[query=[vp=..., np=..., wh=...]],
    var=?a]``.  The flags choose the shape of the ``vp`` and ``np`` parts:

    * vp: arrive only when ``arriveFlag`` is set and neither ``destVpFlag``
      nor ``sourceVpFlag`` is; depart + source when both ``departFlag`` and
      ``departVpFlag`` are set; otherwise depart + source + arrive + dest.
    * np: a ``dest`` structure when ``destNpFlag`` is set and
      ``busNameNpFlag`` is not; otherwise a ``the`` (bus name) structure.

    Returns ``FeatStruct(gap=gapUp, sem=semUp, var=varUp).unify(template)``.
    """
    def depart_part():
        return FeatStruct(
            d=Variable('?d'),
            f=Variable('?fDep'),
            t=FeatStruct(t_var=Variable('?t_var_dep'),
                         time_var=Variable('?timeDepart')))

    def source_part():
        return FeatStruct(
            bus=Variable('?h'),
            sourceName=FeatStruct(f=Variable('?h'),
                                  name=Variable('?nameSource')))

    gap = Variable('?gap')

    # Verb phrase.
    if arriveFlag and not destVpFlag and not sourceVpFlag:
        arrive_only = FeatStruct(
            a=Variable('?a'),
            f=Variable('?f'),
            t=FeatStruct(t_var=Variable('?t'), time_var=Variable('?time')))
        vp = FeatStruct(arrive=arrive_only)
    elif departFlag and departVpFlag:
        vp = FeatStruct(depart=depart_part(), source=source_part())
    else:
        arrive_full = FeatStruct(
            a=Variable('?a'),
            f=Variable('?fArr'),
            t=FeatStruct(t_var=Variable('?t_var_arr'),
                         time_var=Variable('?timeArrive')))
        dest_full = FeatStruct(
            destName=FeatStruct(
                f=Variable('?f'),
                name=FeatStruct(h=Variable('?hDest'),
                                name=Variable('?nameDest'))))
        vp = FeatStruct(depart=depart_part(), source=source_part(),
                        arrive=arrive_full, dest=dest_full)

    # Noun phrase.
    if destNpFlag and not busNameNpFlag:
        destination = FeatStruct(
            f=Variable('?f'),
            name=FeatStruct(h=Variable('?h'), name=Variable('?name')))
        np = FeatStruct(dest=FeatStruct(bus=Variable('?f'), dest=destination))
    else:
        bus_name = FeatStruct(h=Variable('?h_BusName'),
                              name=Variable('?busName'))
        np = FeatStruct(the=FeatStruct(bus=Variable('?b'), busname=bus_name))

    wh = FeatStruct(
        whType=FeatStruct(f=Variable('?f'), type=Variable('?type')))

    sem = FeatStruct(query=FeatStruct(vp=vp, np=np, wh=wh))
    var = Variable('?a')
    para = FeatStruct(gap=gap, sem=sem, var=var)
    paraUpdate = FeatStruct(gap=gapUp, sem=semUp, var=varUp)
    return paraUpdate.unify(para)
def get_trigram_featstruct(trigram_tuple):
    """Return ``[x=..., y=..., z=...]`` from the first three items of the input."""
    first = trigram_tuple[0]
    second = trigram_tuple[1]
    third = trigram_tuple[2]
    return FeatStruct(x=first, y=second, z=third)
def analyze_template(s):
    """Parse template text into two dictionaries of feature structures.

    Blank lines and lines starting with ';' are skipped, and anything after
    a ';' is dropped.  Every remaining line must start with '@' or '#' and
    end with '!':

    * ``@name f1, f2, ...!`` defines ``name``.  Each item is either
      ``@other`` (an already defined name, unified in) or ``<path> = value``,
      where value may itself be ``@other``.
    * ``#word_pos type:<path> = value, ...!`` defines an entry keyed by
      ``word_pos`` whose features are grouped by node type.

    The return value is a tuple.  The first element is a dictionary using
    identifiers from morph.flat, whose entries are feature structures with
    the proper values set.  The second element is a dictionary using keys
    from syntax-coded.flat, holding one feature structure per key.

    Raises TypeError for a line that is not terminated by '!', a left hand
    side not wrapped in <>, or a line of unknown kind; raises KeyError when
    a '#' key is defined twice.
    """
    lines = s.splitlines()
    feature_list = {}
    feature_list2 = {}
    for l in lines:
        l = l.strip()
        if l == '' or l[0] == ';':
            continue
        index = l.find(';')
        if index != -1:
            # Drop the trailing comment.
            l = l[:index]
            l = l.strip()

        if l[0] == '@':
            if l[-1] != '!':
                raise TypeError("Each line should be terminated with a '!'")
            l = l[1:-1]
            # We only split the name from the rest of the line.
            temp = l.split(None,1)
            name = temp[0]
            l = l[len(name):].strip()
            features = l.split(',')
            fs = FeatStruct()
            for f in features:
                f = f.strip()
                index = f.find('=')
                # 'in' replaces dict.has_key(), gone in Python 3.
                if f[0] == '@' and f[1:] in feature_list:
                    # unify() does not change in-place
                    fs = fs.unify(feature_list[f[1:]])
                elif index != -1:
                    lhs = f[:index].strip()
                    rhs = f[index + 1:].strip()
                    if rhs[0] == '@':
                        # rhs can also be a reference
                        rhs = feature_list[rhs[1:]]
                    if lhs[0] != '<' or lhs[-1] != '>':
                        raise TypeError('The left hand side of a feature structure must be wrapped with <>')
                    lhs = lhs[1:-1]
                    path = lhs.split()
                    fs = fs.unify(make_fs(path,rhs))
            feature_list[name] = fs
        elif l[0] == '#':
            if l[-1] != '!':
                raise TypeError('Invalid input line, must be terminated by "!"')
            l = l[1:-1]
            # Split once on whitespace: key first, feature list second.
            tokens = l.split(None,1)
            word_pos = tokens[0].strip()
            features = tokens[1].split(',')
            new_fs = FeatStruct()
            for fs in features:
                tokens = fs.split(':',1)
                node_type = tokens[0].strip()
                tokens = tokens[1].split('=',1)
                lhs = tokens[0].strip()[1:-1]  # Remove <>
                rhs = tokens[1].strip()
                lhs = lhs.split()
                if node_type in new_fs:
                    new_fs[node_type] = new_fs[node_type].unify(make_fs(lhs,rhs))
                else:
                    new_fs[node_type] = make_fs(lhs,rhs)
            if word_pos in feature_list2:
                raise KeyError('Duplicate defitinion detected: %s.' % (word_pos))
            feature_list2[word_pos] = new_fs
        else:
            raise TypeError('Cannot recognize line: %s.' % (l))
    return (feature_list,feature_list2)
pred: FeatStruct, arg_0: Union[List[FeatStruct], FeatStruct], arg_1: Union[List[FeatStruct], FeatStruct] ) -> FeatStruct: """ Construct feature structure than represent fact. ex: know(Gael, [Bas, Justine]), return a feature structure of the type : [pred=[sem=know], arg0 = [head=Gael, tail=na], arg1 = [head=Bas, tail=[head=Justine, tail=na]]] """ arg_0 = arg_0 if isinstance(arg_0, list) else [arg_0] arg_1 = arg_1 if isinstance(arg_1, list) else [arg_1] return FeatStruct(arg0=format_list(arg_0), arg1=format_list(arg_1), pred=pred) # STATIC LIST OF ALL PREDICATES know = FeatStruct(sem="know") know_r = FeatStruct(sem="know_r") member = FeatStruct(sem="member") # INPUT DATA Gael = FeatStruct(proper="Gael", is_proper=True, explicit=True, is_noun=False, gender="male", form="singular") Bas = FeatStruct(proper="Bas", is_proper=True, explicit=True, is_noun=False, gender="male", form="singular") Justine = FeatStruct(proper="Justine", is_proper=True, explicit=True, is_noun=False, gender="female", form="singular") club = FeatStruct( proper="na", is_proper=False, is_noun=True, explicit=True, noun="tennis_club", gender="neuter", form="singular" ) # Dynamic rules that are specific to this input data # for instance "ProperName[proper=Bas] -> "Bas" dynamic_productions = [entity_specific_rule(entity) for entity in [Gael, Bas, Justine, club]]
def mainLogic(doc):
    """Build the semantic feature structure for a parsed question.

    The question type is chosen first: if ``doc`` contains a 'det' token with
    text 'nào' the query is 'WHICH1' (gap 'f2'), otherwise it is 'HOWLONG1'
    (gap 'r2'). Depending on the gap, the function looks for arrival and
    departure verbs, city names and bus names in ``doc`` and records what it
    found in a set of boolean flags. It then assembles the verb-phrase,
    noun-phrase and wh feature structures, passes them to ``featStruct``
    together with the flags, prints the result and returns it.

    Args:
        doc: the parsed question, as accepted by ``subtree_matcher``,
            ``searchChild`` and ``checkHead`` (presumably a dependency-parsed
            document -- confirm against those helpers).

    Returns:
        Whatever ``featStruct(gap, sem, var, ...)`` returns.
    """
    departFlag = False
    departVpFlag = False
    arriveFlag = False
    sourceVpFlag = False
    destVpFlag = False
    busNameNpFlag = False
    destNpFlag = False
    t = ''
    a = ''
    time = ''
    nameDepart = ''
    nameArrive = ''
    bVar = ''
    h_BusName = ''
    busName = ''
    cityTokenText = ['Hồ_Chí_Minh', 'Hà_Nội', 'Huế', 'Đà_nẵng', 'Đà_Nẵng']
    busTokenText = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8']
    cityTokenDep = ['compound', 'nmod', 'obl']

    if subtree_matcher(doc, 'det', text='nào') != []:
        f, typeWh = 'f2', 'WHICH1'
    else:
        f, typeWh = 'h1', 'HOWLONG1'
    # WHICH1 questions use the wh variable as gap; HOWLONG1 asks for runtime.
    gap = f if typeWh == 'WHICH1' else 'r2'

    if gap == 'f2':
        if (subtree_matcher(doc, 'case', text='đến') != []
                or subtree_matcher(doc, 'ccomp', text='đến') != []):
            arriveFlag = True
            a = 'a3'
            # Best effort: first child of ROOT whose text contains 'HR'.
            try:
                time = [
                    i.text for i in searchChild(doc, 'ROOT') if 'HR' in i.text
                ][0]
            except Exception:
                time = ''
            t = 't2' if time != '' else '?t'

            if subtree_matcher(doc, 'ROOT', text='đi') != []:
                departFlag = True
                for cT in cityTokenText:
                    for cD in cityTokenDep:
                        temp = subtree_matcher(doc, cD, cT)
                        tempHead = checkHead(doc, temp)
                        try:
                            tempChild = [i.text for i in searchChild(doc, cD)]
                        except Exception:
                            tempChild = ''
                        if temp == []:
                            continue
                        if tempHead != 'đi':
                            destVpFlag = True
                            nameArrive = temp
                        elif 'từ' in tempChild:
                            sourceVpFlag = True
                            nameDepart = temp
                        elif 'đến' in tempChild:
                            destVpFlag = True
                            nameArrive = temp
            else:
                # Take the first city match and stop both loops.
                for cT in cityTokenText:
                    for cD in cityTokenDep:
                        temp = subtree_matcher(doc, cD, cT)
                        if temp != []:
                            destNpFlag = True
                            nameArrive = temp
                            break
                    else:
                        # Inner loop finished without a match: next city.
                        continue
                    break
        elif subtree_matcher(doc, 'ROOT', text='xuất_phát') != []:
            departFlag = True
            for cT in cityTokenText:
                for cD in cityTokenDep:
                    temp = subtree_matcher(doc, cD, cT)
                    tempHead = checkHead(doc, temp)
                    if (temp != []) and (tempHead != 'xuất_phát'):
                        departVpFlag = True
                        nameDepart = temp
    elif gap == 'r2':
        if subtree_matcher(doc, 'ROOT', text='đến') != []:
            arriveFlag = True
            a = 'a3'
            time = '?time'
            t = '?t'
            objMatches = subtree_matcher(doc, 'obj')
            nameArrive = objMatches[0] if len(objMatches) == 1 else objMatches
            if isinstance(nameArrive, list):
                # Several (or no) objects: keep the first one that is a city.
                for obj in nameArrive:
                    if obj in cityTokenText:
                        nameArrive = obj
                        break
            if nameArrive != '':
                destVpFlag = True
        # NOTE(review): the nesting of this check was reconstructed from
        # whitespace-collapsed code; confirm it belongs at the 'r2' level.
        if subtree_matcher(doc, 'case', text='từ') != []:
            nameDepart = checkHead(doc, 'từ')
            if nameDepart != '':
                # The original had a bare `sourceVpFlag` expression here,
                # which did nothing; the flag is meant to be raised.
                sourceVpFlag = True

    # Look for a bus name among the 'obj' tokens, then the 'compound' tokens.
    listObj = subtree_matcher(doc, 'obj')
    listCompound = subtree_matcher(doc, 'compound')
    for sub in listObj:
        if sub in busTokenText:
            busName = sub
    if busName == '':
        for sub in listCompound:
            if sub in busTokenText:
                busName = sub
    if busName != '':
        busNameNpFlag = True
        bVar = 'f2'
        h_BusName = 'h3'

    # Each nested FeatStruct is built separately on purpose: sharing one
    # object between features would introduce reentrancy.
    if arriveFlag and not destVpFlag and not sourceVpFlag:
        vp = FeatStruct(
            arrive=FeatStruct(a=a, f=f, t=FeatStruct(t_var=t, time_var=time)))
    elif departFlag and departVpFlag:
        vp = FeatStruct(
            depart=FeatStruct(d='d3',
                              f='f1',
                              t=FeatStruct(t_var=t, time_var=time)),
            source=FeatStruct(bus='h3',
                              sourceName=FeatStruct(f=Variable('?h'),
                                                    name=nameDepart)))
    else:
        vp = FeatStruct(
            depart=FeatStruct(d='d3',
                              f='f1',
                              t=FeatStruct(t_var=t, time_var=time)),
            source=FeatStruct(bus='h4',
                              sourceName=FeatStruct(f=Variable('?h'),
                                                    name=nameDepart)),
            arrive=FeatStruct(a='a3',
                              f='f2',
                              t=FeatStruct(t_var=t, time_var=time)),
            dest=FeatStruct(destName=FeatStruct(
                f=Variable('?f'), name=FeatStruct(h='h6', name=nameArrive))))

    if destNpFlag and not busNameNpFlag:
        np = FeatStruct(dest=FeatStruct(
            bus=Variable('?f'),
            dest=FeatStruct(f=Variable('?f'),
                            name=FeatStruct(h='h3', name=nameArrive))))
    else:
        np = FeatStruct(the=FeatStruct(
            bus=bVar, busname=FeatStruct(h=h_BusName, name=busName)))

    wh = FeatStruct(whType=FeatStruct(f=f, type=typeWh))
    sem = FeatStruct(query=FeatStruct(vp=vp, np=np, wh=wh))
    var = Variable('?a')
    result = featStruct(gap,
                        sem,
                        var,
                        arriveFlag=arriveFlag,
                        destVpFlag=destVpFlag,
                        sourceVpFlag=sourceVpFlag,
                        busNameNpFlag=busNameNpFlag,
                        destNpFlag=destNpFlag,
                        departFlag=departFlag,
                        departVpFlag=departVpFlag)
    print(result)
    return result
def _applyConstraints(self, parent, child):
    """Apply compositional constraints between a parent and a child state.

    The semantic head of ``child`` (its 'h' constraints) is unified with the
    matching slot of ``parent``'s semantic head. The slot is 'h<i>', where
    <i> is ``parent.dotIdx``. On success the parent's head, semantic body
    and string are updated to include the child.

    The grammar reuses a small set of variable names, so equal names in the
    two states do not imply the same entity. Variables are therefore renamed
    before unification, and the same renaming is applied to the child's
    semantic body afterwards.

    Returns:
        A two-element list ``[ok, parent]``. ``ok`` is False when the two
        states are incompatible, in which case ``parent`` is returned
        unmodified.
    """
    # A terminal child is a lexical state with no semantic information.
    if LWFG.is_terminal(child.prod.lhs()):
        return [True, parent]

    # ---- Unify the semantic heads -------------------------------------
    # Embedded feature structures are only unified when their feature ids
    # match, so the child's 'h' constraints are re-keyed under the id the
    # parent uses for this right-hand-side position.
    slot = 'h' + str(parent.dotIdx)
    childHead = FS()  # fresh structure, avoids aliasing child.head
    childHead[slot] = child.head['h']
    parentHead = parent.head

    # Step 0: body variables shared by parent and child get a new name.
    bodyRenames = {}
    usedVars = []
    if parent.body:
        parentBodyVars = parent.body.variables()
        childBodyVars = child.body.variables()
        for name in childBodyVars:
            if name not in parentBodyVars:
                usedVars.append(Variable(name))
                continue
            fresh = self._newVarName(name)
            while fresh in parentBodyVars + childBodyVars:
                fresh = self._newVarName(fresh)
            usedVars.append(Variable(fresh))
            bodyRenames[Variable(name)] = Variable(fresh)
        usedVars.extend(Variable(name) for name in parentBodyVars)
    else:
        usedVars.extend(Variable(name) for name in child.body.variables())

    # Step 1: variables of the parent's head are in use as well.
    usedVars += list(parentHead.variables())

    # Step 2: rename the child's head variables away from all used ones.
    headRenames = {}
    childHead = childHead.rename_variables(used_vars=usedVars,
                                           new_vars=headRenames)

    # The parent's features for this slot must all be present in the child.
    childFeats = set(childHead[slot].keys())
    parentFeats = set(parentHead[slot].keys())
    if not (parentFeats <= childFeats):
        return [False, parent]

    bindings = {}
    parentHead = parentHead.unify(childHead, bindings)
    if not parentHead:
        # Unification failed: the states are incompatible.
        return [False, parent]
    parent.head = parentHead

    # ---- Update the semantic bodies -----------------------------------
    # Rebuilt from its string form so child.body itself is left untouched.
    childBody = LWFG.OntoSeR(str(child.body))

    # A body variable must keep its name when unification linked it to the
    # parent: if a renamed head variable was bound to a variable scheduled
    # for renaming in step 0, that step-0 rename is cancelled.
    for renamed in headRenames.values():
        if renamed in bindings and bindings[renamed] in bodyRenames:
            bodyRenames.pop(bindings[renamed])

    childBody.substituteBindings(headRenames)
    childBody.substituteBindings(bodyRenames)

    if parent.body:
        parentBody = LWFG.OntoSeR(str(parent.body) + ',' + str(childBody))
    else:
        parentBody = childBody
    parentBody.substituteBindings(bindings)
    parent.body = parentBody

    # ---- Update the string --------------------------------------------
    parent.string = (parent.string + ' ' + child.string
                     if parent.string else child.string)

    return [True, parent]