# 1st pass: collect all strings that can be yielded from nodes of the trees
# to update Str2Code.
#
# Each entry of Annotation is turned into an ImmutableParentedTree and its
# leaves are joined into a single string (`word`).  When `word` is a key of
# Vec, it is added to the set stored in Tag2Word under Vec[word] and the tree
# is appended to Forest; entries whose word is not in Vec are ignored.
#
# NOTE(review): the original indentation was lost.  Placing Forest.append and
# the subtree loop under `if word in Vec` (rather than under the former
# `else` branch or at loop level) is inferred from the "ignore others"
# comment -- confirm against the original file.
iter_count = 0
_n_annotations = len(Annotation)
# Report progress about every tenth of the data.  The step is clamped to 1 so
# that fewer than ten annotations no longer trigger a modulo-by-zero.
_progress_step = max(1, _n_annotations // 10)
print('First pass to collect all strings... ')
for iter_count, sent in enumerate(Annotation, start=1):
    if iter_count % _progress_step == 0:
        # Scale the ratio by 100 so the number matches the '%' in the message.
        print(100 * iter_count / _n_annotations, '% having been processed ...')
    tree = ImmutableParentedTree(sent)
    word = ''.join(tree.leaves())
    if word in Vec:
        # Word in the scope of our vocabulary/corpus, go on (ignore others).
        tag_collapsed = Vec[word]
        # Create the set for this tag on first sight, then record the word.
        Tag2Word.setdefault(tag_collapsed, set()).add(word)
        Forest.append(tree)
        for subtree in tree.subtrees():
            # NOTE(review): the body of this loop is not present in the
            # visible source; the placeholder keeps the file parseable.
            # Restore the original body here.
            pass
# Build the tree for the current sentence and, when its joined leaf string is
# a key of Vec, store the tree in Forest and record the leaf string of every
# subtree in Symbols2.
#
# NOTE(review): `sent` is not bound anywhere in the visible source --
# presumably this runs inside a loop over sentences; confirm.  The original
# indentation was lost, so keeping every statement after the membership test
# under the `if` is an inference -- confirm against the original file.
S = []  # stack for tree node visit
tree = ImmutableParentedTree(sent)  # NLTK Tree data structure for the sentence
string = "".join(tree.leaves())
# Only handle words that have occurred in the corpus.
if string in Vec:
    Forest.append(tree)
    tag_set = Vec[string]
    S.append(tree)
    # Collect the unique strings (joined leaves) associated with the subtrees.
    Symbols2.update("".join(s.leaves()) for s in tree.subtrees())