def _next_escaped_symbol(symbols):
    """Pull the next item from *symbols* and escape its brackets.

    '(' , ')' , '{' and '}' are rewritten to '-LRB-', '-RRB-', '-LCB-' and
    '-RCB-' respectively.  StopIteration propagates when *symbols* is
    exhausted.
    """
    raw = next(symbols)
    return (raw.replace('(', '-LRB-').replace(')', '-RRB-')
               .replace('{', '-LCB-').replace('}', '-RCB-'))


def getbasenps(pospath, pennpath):
    """Print the bracketed spans of every sentence read from *pospath*.

    Symbols come from reader.readsymbols(pennpath) and are consumed in
    lockstep with the tokens of each sentence yielded by
    posio.posread(pospath).  For each token, symbols are read until one is
    found whose text before its last '/' equals the token's first field.
    Every symbol skipped on the way is echoed and interpreted as markup:

    * '[' opens a new span at the current token index;
    * ']' closes the currently open span (if any) at the current token index.

    A span that is still open when the sentence ends is closed at the last
    token index.  The resulting list of [start, end] pairs is printed once per
    sentence.  Returns None.

    NOTE(review): the spans are presumably base noun phrases (going by the
    names used here) -- confirm.
    NOTE(review): a ']' seen while looking for token k records k, i.e. the
    index just past the span, whereas a span closed at sentence end records
    the last token's own index.  Confirm whether that difference is intended.
    NOTE(review): the statement nesting was reconstructed from a
    whitespace-collapsed source; verify it against the original layout.
    """
    symbols = reader.readsymbols(pennpath)
    n = 9  # counter echoed in the diagnostic output; bumped once per sentence
    for possent in posio.posread(pospath):
        base_nps = []
        for index, token in enumerate(possent):
            symbol = _next_escaped_symbol(symbols)
            announced = False
            # For a symbol without '/', rfind() returns -1 and the slice just
            # drops the last character, so '[' and ']' compare as ''.
            while symbol[:symbol.rfind('/')] != token[0]:
                if not announced:
                    # First mismatch for this token: show the sentence and
                    # the counter before listing the skipped symbols.
                    posio.posprint(possent)
                    print(n)
                    announced = True
                print(symbol)
                if symbol == '[':
                    # A new span may only start once the previous one is closed.
                    assert not base_nps or base_nps[-1][1] is not None
                    base_nps.append([index, None])
                elif symbol == ']' and base_nps:
                    # base_nps is empty when the ']' belongs to a span that
                    # was already closed at the end of the previous sentence.
                    assert base_nps[-1][1] is None
                    base_nps[-1][1] = index
                symbol = _next_escaped_symbol(symbols)
        if base_nps and base_nps[-1][1] is None:
            # The closing ']' has not been read yet (it follows the last
            # token), so close the span here.
            base_nps[-1][1] = index
        print(base_nps)
        n += 1
def randompos(path, cutoff):
    """Print each sentence of *path* with a random subset of tags overwritten.

    Sentences are read with posio.posread(path).  For every token one
    random.random() draw is made; when the draw is below *cutoff* the token's
    second field is replaced by the literal '_NONE-'.  Each sentence is then
    passed to posio.posprint.  Returns None.
    """
    for sentence in posio.posread(path):
        # Unpacking into two names keeps the requirement that every token
        # has exactly two fields.
        for position, (_word, _tag) in enumerate(sentence):
            if random.random() < cutoff:
                sentence[position][1] = '_NONE-'
        posio.posprint(sentence)
def batch_penn_to_pos(base, sections):
    """Convert every '.pos' file of the requested section directories.

    *sections* is either a single number ('7') or two numbers joined by '-'
    ('2-22').  In the two-number form the second number is excluded, because
    the section list is built with range(first, second).
    NOTE(review): confirm that the exclusive upper bound is intended.

    Each section is looked up as the two-digit, zero-padded sub-directory of
    *base*.  Every directory entry must end in '.pos'; each sentence yielded
    by penn2pos() for that file is passed to posio.posprint.

    Raises AssertionError when *sections* has more than two '-'-separated
    parts or when a directory entry does not end in '.pos'.
    """
    bounds = sections.split('-')
    assert len(bounds) in (1, 2)
    if len(bounds) == 2:
        first = int(bounds[0])
        stop = int(bounds[1])
        section_numbers = range(first, stop)
    else:
        section_numbers = [int(bounds[0])]

    for number in section_numbers:
        section_dir = os.path.join(base, '%02d' % (number,))
        for entry in os.listdir(section_dir):
            assert entry.endswith('.pos')
            for sentence in penn2pos(os.path.join(section_dir, entry)):
                posio.posprint(sentence)
import sys
import posio
import replace

# Number of leading sentences whose words are run through the replacer.
N = 5000

# Usage: <script> POS_FILE REPLACE_SPEC
# Sentences are read and written with '_' passed as the second argument to
# posio.posread / posio.posprint.
# NOTE(review): nesting reconstructed from a whitespace-collapsed source; this
# assumes every sentence is printed and only the first N are rewritten.
f = sys.argv[1]
r = replace.CReplace(sys.argv[2])
for sentence_no, sent in enumerate(posio.posread(f, '_')):
    if sentence_no < N:
        for token in sent:
            token[0] = r.replace(token[0])
    posio.posprint(sent, '_')
import sys
import posio
import replace

# Number of leading sentences whose words are run through the replacer.
N = 5000

# Usage: <script> POS_FILE REPLACE_SPEC
# Sentences are read and written with '_' passed as the second argument to
# posio.posread / posio.posprint.
# NOTE(review): nesting reconstructed from a whitespace-collapsed source; this
# assumes every sentence is printed and only the first N are rewritten.
f = sys.argv[1]
r = replace.CReplace(sys.argv[2])
for position, sent in enumerate(posio.posread(f, '_')):
    rewrite = position < N
    if rewrite:
        for entry in sent:
            entry[0] = r.replace(entry[0])
    posio.posprint(sent, '_')