def remove_all_subscript(tree):
    """Return a deep copy of ``tree`` with every node label reduced to its tag.

    Each subtree label of the copy is passed through ``decompose_tag`` and
    replaced by the first element of the returned pair; the second element
    (the subscript) is discarded. The tree passed in is not modified.
    """
    stripped = tree.copy(deep=True)
    for node in stripped.subtrees():
        base_tag, _ = decompose_tag(node.node)
        node.node = base_tag
    return stripped
def remove_crl_subscript(tree):
    """Return a deep copy of ``tree`` with 'l'/'r'/'c' subscripts stripped.

    Each subtree label of the copy is split with ``decompose_tag`` into a
    (tag, subscript) pair. When the subscript is 'l', 'r' or 'c' the label
    is replaced by the bare tag; any other label is left as it is. The tree
    passed in is not modified.
    """
    new_tree = tree.copy(deep=True)
    for subtree in new_tree.subtrees():
        tag, subscript = decompose_tag(subtree.node)
        # History: 'u' was added to this set on Oct 4 and reverted on Oct 5.
        if subscript in {'l', 'r', 'c'}:
            subtree.node = tag
    return new_tree
# Read the annotation file (one bracketed tree per line) and record, for each
# (word, tag) pair, the tree string it came from in word_pos2tree_str
# (which, like path_annotation, is defined outside this section).
print('\n\nprocessing annotation from ', path_annotation, '...  \nprograss:')
# The context manager closes the file even if reading raises.
with codecs.open(path_annotation, 'rU', 'utf-8') as f:
    lines = f.readlines()

Production = []

count = 0
# Lines per 10% progress step; this is 0 when the file has fewer than 10 lines.
total_nth = len(lines) // 10
for line in lines:
    # Guard on total_nth: a zero step would raise ZeroDivisionError in the
    # modulo below, so short files simply skip progress reporting.
    if total_nth and count % total_nth == 0:
        print(count/total_nth*10, '% finished')
    count += 1

    tree_str = line.strip()
    tree = Tree(tree_str)
    tag, subscript = decompose_tag(tree.node)
    word = ''.join(tree.leaves())

    word_pos2tree_str[(word, tag)] = tree_str


print('done!')

#
# gen single-char annotation from the corpus
#

print('\n\ngenerating rules for single-char words from corpus')

#---> one needs to run 2a_gen_tag_set_for_word_type.py to gen word2newtag.pickle before using it
path_word2newtag = '../working_data/word2newtag.pickle'
Exemple #5
0
# Command line: argv[1] is the annotation file, argv[2] (optional) the rule file.
if len(sys.argv) > 1:
    # Must be spelled ``path_to_annotation``: that is the name read below
    # when building the default rule path and when opening the file.
    path_to_annotation = sys.argv[1]
    if len(sys.argv) > 2:
        path_to_rule = sys.argv[2]
    else:
        # Default: 'rules.zpar' in the directory of the resolved annotation file.
        path_to_rule = os.path.join(
            os.path.dirname(os.path.realpath(path_to_annotation)),
            'rules.zpar',
        )

def remove_p(d_str):
    """Return two spaces for '(' or ')'; return any other value unchanged."""
    if d_str in {'(', ')'}:
        return '  '
    return d_str
    
# Subscripts that map to rule value 'l'; every other handled subscript maps to 'r'.
l_set = {'l', 'c'}
r_set = {'r'}
rules = {}  # full tag (with subscript) -> 'l' or 'r'

print('\ncollecting non-termianls...')
# The context manager ensures the annotation file is closed after reading.
with codecs.open(path_to_annotation, 'rU', 'utf-8') as annotation_file:
    lines = annotation_file.readlines()

# Blank out parentheses, split each line on whitespace, and keep the tokens
# longer than two characters that end in '_' followed by an ASCII letter.
non_terminals = {
    d_string
    for line in lines
    for d_string in ''.join([remove_p(char) for char in line]).split()
    if len(d_string) > 2
    and d_string[-2] == '_'
    and d_string[-1] in string.ascii_letters
}

print('\nconstructing rules...')
for full_tag in non_terminals:
    main_tag, subscript = decompose_tag(full_tag)
    # only dealing with l/r/c tag, not b/i tag
    if subscript in {'l', 'c', 'r'}:
        rules[full_tag] = 'l' if subscript in l_set else 'r'

print('\nwriting rules to file', path_to_rule)
# One rule per line: the tag, a tab, a colon, then the rule value.
with codecs.open(path_to_rule, 'w', 'utf-8') as f:
    for tag, direction in rules.items():
        f.write(tag + '\t:' + direction + '\n')
Exemple #6
0
  if count%int(len(Forest)/10)==0:
      print('progress------->',str(count/len(Forest)*100)[:2], '% finished')


  new_tree=ParentedTree(tree.pprint())


  

  for subtree in new_tree.subtrees():  #update current tree

    string=''.join(subtree.leaves())

    if  string in Vec:  #leaves/string in the record

      tag, subscript= decompose_tag(subtree.node)

      tag_vec_str=set2str(Vec[string]) #get the tag-set of the node according to the leaves and convert it to str

      subtree.node=tag_vec_str+'_'+subscript  #update the node with the new_tag


  NewForest.append(new_tree)
  

  for subtree in new_tree.subtrees(lambda x: len(x)>1 and ''.join(x.leaves()) in Vec ):  # extraction known production rules

    string=''.join(subtree.leaves())


    left_child=subtree[0]