# Register one parsed word tree: index the word under its collapsed tag, keep
# the tree, and assign an integer code to every not-yet-seen subtree string.
#
# NOTE(review): the original statements were collapsed onto a single line, so
# the nesting below is reconstructed from the statement order and the inline
# comment. In particular, confirm (a) that `Forest.append` and the subtree
# loop belong under the `if word in Vec` guard, and (b) at which level the
# trailing prints were meant to run.
word = ''.join(tree.leaves())
if word in Vec:  # word in the scope of our vocabulary/corpus, go on (ignore others)
    tag_collapsed = Vec[word]
    # Group words by their tag; create the set on first sight of the tag.
    Tag2Word.setdefault(tag_collapsed, set()).add(word)
    Forest.append(tree)
    for subtree in tree.subtrees():
        string = ''.join(subtree.leaves())
        if string not in Str2Code:
            # Str2Code and Code2Str are filled together so each is the
            # inverse of the other; type_count is the next unused code.
            Str2Code[string] = type_count
            Code2Str[type_count] = string
            type_count += 1
print('\nCurrent type count is:', type_count)
print('while current Vec size is:', len(Vec))
print('tmp test', type_count, len(Str2Code))
# NOTE(review): whitespace-collapsed fragment. Only the first assignment (joining
# the leaves of `tree` into one string) is live code; every statement after the
# first `#` is currently part of a comment, and the text ends on a dangling
# `else:` whose body is not visible here. Restore the line breaks and
# indentation from the original source before relying on this logic.
string=''.join(tree.leaves()) #If the word has occurred in the corpus... if string in Vec: Forest.append(tree) tag_set=Vec[string] S.append(tree) for s in tree.subtrees(): Symbols2.add(''.join(s.leaves())) while S: current_tree=S.pop() string=''.join(current_tree.leaves()) Symbols.add(string) # propagate the tagset to current tree, note: one useless update for the root node if string in Vec: Vec[string].update(tag_set) else: