# Trees whose concatenated leaves form a word found in Vec; filled by the loop below.
Forest=[]
# Maps each value stored in Vec (called the "collapsed tag" here) to the set
# of words that carry it.
Tag2Word={}

# 1st pass: collect the strings yielded by the leaves of every annotated tree,
#           group the in-vocabulary ones by tag in Tag2Word, and keep their
#           trees in Forest.

iter_count=0
print('First pass to collect all strings... ')

_n_annotations=len(Annotation)
# Report progress roughly every 10% of the input.  The step is clamped to at
# least 1 so that fewer than 10 annotations cannot cause a modulo-by-zero.
_progress_step=max(1, _n_annotations//10)

for iter_count, sent in enumerate(Annotation, 1):

  if iter_count%_progress_step==0:
    # Scale the fraction by 100 so the printed number matches the '%' label.
    print(round(100*iter_count/_n_annotations, 1),'% having been processed ...')

  tree=ImmutableParentedTree(sent)

  # The word is the concatenation of all leaves of the tree.
  word=''.join(tree.leaves())

  if word not in Vec:
    # Outside the scope of our vocabulary/corpus: ignore this annotation.
    continue

  tag_collapsed=Vec[word]
  Tag2Word.setdefault(tag_collapsed, set()).add(word)

  Forest.append(tree)
# ---- Code example #2 ----
# Counter for annotations whose words have not occurred in our corpus.
not_covered = 0

# Three independent, initially empty accumulator sets.
# NOTE(review): OOV presumably stands for "out of vocabulary" -- confirm
# against the code that fills it.
Symbols, Symbols2, OOV = set(), set(), set()

# Output of this program: the list of parsed trees.
Forest = list()
            

for sent in Annotation:

  S=[] # S is the stack for tree node visit
  
  tree=ImmutableParentedTree(sent)  #using NLTK.Tree data structure, representing the tree


  #
  #count unique strings(leaves) associated with subtrees
  #



  #print(tree)
  #print(tree.leaves())

  string=''.join(tree.leaves())

  #If the word has occurred in the corpus...