def _merge_subscript(tree_str):
  """Return tree_str with the '_' before a trailing one-character subscript removed.

  The string is split on whitespace; in every token whose second-to-last
  character is '_', that '_' is dropped (e.g. the token '(N_a' becomes
  '(Na').  Tokens are re-joined with single spaces, so any run of
  whitespace in the input collapses to one space.

  Per the original author's note, this is done because the Stanford parser
  removes the subtag when it sees '_'.
  """
  return ' '.join(
    tok[:-2] + tok[-1] if len(tok) > 1 and tok[-2] == '_' else tok
    for tok in tree_str.split()
  )


# One list of tree strings per annotation variant:
Corpus1 = []  # from the tree as-is, subscripts merged into their tags
Corpus2 = []  # from remove_all_subscript(tree)
Corpus3 = []  # from remove_crl_subscript(tree), remaining subscripts merged

print('\nGenerating three versions of word-structure annotation...')

count = 0  # not used by the loop below; kept because later code may read it

for word, index in Word2treeID.items():
  tree = NewForest[index]

  # NOTE(review): this relies on pprint(margin=...) *returning* the string.
  # If `tree` is an nltk.Tree under NLTK 3, pprint() prints and returns None
  # and pformat() is the string-returning call -- confirm the tree type.
  # The large margin presumably keeps each tree on a single line.
  Corpus1.append(_merge_subscript(tree.pprint(margin=10000)))
  # Output of remove_all_subscript is stored unchanged (the original comment
  # states all subscripts are already gone, so there is nothing to merge).
  Corpus2.append(remove_all_subscript(tree).pprint(margin=10000))
  Corpus3.append(_merge_subscript(remove_crl_subscript(tree).pprint(margin=10000)))

print('\ndone!')
# Example no. 2
# 0
  
  if count%int(len(lines)/10)==0:
    print(int(count/len(lines)*100),'% finished...')
  count +=1
  
  mini_tree_seq1=[]
  mini_tree_seq2=[]
  mini_tree_seq3=[]


  for word in sent:
    index=Word2treeID[word]
    tree=NewForest[index]

    mini_tree_seq1.append(tree.pprint(margin=10000))
    mini_tree_seq2.append(remove_all_subscript(tree).pprint(margin=10000))
    mini_tree_seq3.append(remove_crl_subscript(tree).pprint(margin=10000))

    #mini_tree_seq1.append(keep_all_subscript(tree, Encoder1).pprint(margin=10000))
    #mini_tree_seq2.append(remove_all_subscript(tree,Encoder2).pprint(margin=10000))
    #mini_tree_seq3.append(remove_crl_subscript(tree, Encoder3).pprint(margin=10000))


  Corpus1.append(mini_tree_seq1)
  Corpus2.append(mini_tree_seq2)
  Corpus3.append(mini_tree_seq3)

print('\ndone!')