# 1st pass: build a tree from every annotation entry, join its leaves into a
# single string, and record the entries whose string is present in Vec.
#   Forest   -- the retained trees, in Annotation order
#   Tag2Word -- Vec[word] (the "collapsed" tag) -> set of words carrying it
# (The original note says this pass is "to update Str2Code"; that name is not
# defined in this section of the file.)
Forest = []
Tag2Word = {}

iter_count = 0  # stays 0 when Annotation is empty
print('First pass to collect all strings... ')

# Report progress about every 10% of the input. The step is clamped to 1 so
# that an Annotation with fewer than ten entries does not trigger a
# modulo-by-zero.
progress_step = max(1, len(Annotation) // 10)

for iter_count, sent in enumerate(Annotation, start=1):
    if iter_count % progress_step == 0:
        # Scale the fraction to an actual percentage before printing it next
        # to the '%' sign.
        print(round(100 * iter_count / len(Annotation)),
              '% having been processed ...')

    tree = ImmutableParentedTree(sent)
    word = ''.join(tree.leaves())

    # Only words in the scope of our vocabulary/corpus are kept; all other
    # annotation entries are ignored.
    if word in Vec:
        tag_collapsed = Vec[word]
        Tag2Word.setdefault(tag_collapsed, set()).add(word)
        # NOTE(review): the original indentation was lost; the tree is assumed
        # to be kept only for in-vocabulary words ("ignore others") -- confirm.
        Forest.append(tree)
not_covered=0 #those annotation, the words of which have not occurred in our corpus Symbols=set() Symbols2=set() OOV=set() # Forest=[] # list of parsed trees, an output of this program # for sent in Annotation: S=[] # S is the stack for tree node visit tree=ImmutableParentedTree(sent) #using NLTK.Tree data structure, representing the tree # #count unique strings(leaves) associated with subtrees # #print(tree) #print(tree.leaves()) string=''.join(tree.leaves()) #If the word has occurred in the corpus...