def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed)

    Reads the gzipped parse trees for the corpus named by
    ``ldc_desc['name']`` under ``<workspace>/trees``, pipes each document
    through the external ``args.ExtractGrid`` command, and writes that
    command's stdout to a gzipped doctext file under ``<workspace>/grids``.

    Returns the elapsed wall-clock time in seconds.
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            with gzip.open(input_path + '.gz', 'rb') as fi:
                with gzip.open(output_path + '.gz', 'wb') as fo:
                    for lines, attrs in iterdoctext(fi):
                        logging.debug('document %s', attrs['id'])
                        # one external grid-extractor process per document
                        cmd_line = args.ExtractGrid
                        cmd_args = shlex.split(cmd_line)
                        proc = subprocess.Popen(cmd_args, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
                        # NOTE(review): stderr is not piped, so stderrdata is
                        # always None here; only stdout is captured.
                        (stdoutdata, stderrdata) = proc.communicate(
                            '{0}\n'.format('\n'.join(lines)))
                        writedoctext(fo, stdoutdata.split('\n'), id=attrs['id'])
        logging.info('done: %s', output_path)
    except:
        # re-raise everything as a plain Exception carrying the formatted
        # traceback text (keeps the original traceback readable when the
        # call happens inside a worker process)
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
    #print >> sys.stderr, ptb_str
    #print ' '.join(Tree(ptb_str).leaves())
    #print
    return time() - t0
def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed)

    Feeds every document of the corpus through the external
    ``args.ExtractGrid`` command and stores its stdout as the grid.
    Returns the elapsed wall-clock time in seconds.
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            # single with-statement handling both gzip streams
            with gzip.open(input_path + '.gz', 'rb') as tree_stream, \
                    gzip.open(output_path + '.gz', 'wb') as grid_stream:
                for doc_lines, doc_attrs in iterdoctext(tree_stream):
                    logging.debug('document %s', doc_attrs['id'])
                    # spawn one extractor process per document
                    extractor = subprocess.Popen(
                        shlex.split(args.ExtractGrid),
                        stdin=subprocess.PIPE,
                        stdout=subprocess.PIPE)
                    stdoutdata, stderrdata = extractor.communicate(
                        '{0}\n'.format('\n'.join(doc_lines)))
                    writedoctext(grid_stream, stdoutdata.split('\n'), id=doc_attrs['id'])
        logging.info('done: %s', output_path)
    except:
        # wrap any failure with its formatted traceback before re-raising
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
    return time() - t0
def main(args): """ Extract entities and construct grid """ try: #for ipath in enumerate(ipaths): #with gzip.open(input_path, 'rb') as fi: #with gzip.open(input_path+'_grid' + '.gz', 'wb') as fo: with open(args.directory, 'rb' ) as fi, \ open(args.directory+'_grid', 'w') as fo: text_idx = 0 grids = [] for lines, attrs in iterdoctext(fi): logging.debug('document %s', attrs['id']) print ' extract '+str(len(lines))+' lines' print >> fo, "# docid=" + attrs['id'] print >> fo, "# id=" + text_idx entities, sent_num = extract_grids(lines) print entities grid = construct_grid(entities, sent_num) grids.append(grid) print grid output_grid(grid, fo) #writedoctext(fo, grid , id=attrs['id']) text_idx+=1 logging.info('done: %s', args.directory) except: raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
def extract_grids(fi): """ Identify entities from ptb trees for document. store in dictionary for grid construction. """ idx = 0 entities = defaultdict(lambda : defaultdict(dict)) #print 'fi='+fi for lines, attrs in iterdoctext(fi): logging.debug('document %s', attrs['docid']) print ' extract '+str(len(lines))+' lines' #for line in lines: entities, idx = (convert_tree(line, entities) for line in lines) return entities, idx
def extract_grids(fi):
    """
    Identify entities from ptb trees for document.
    store in dictionary for grid construction.

    Returns the (entities, idx) pair left by the last processed document.
    """
    idx = 0
    # nested mapping: populated by convert_tree below
    entities = defaultdict(lambda: defaultdict(dict))
    #print 'fi='+fi
    for lines, attrs in iterdoctext(fi):
        logging.debug('document %s', attrs['docid'])
        print ' extract ' + str(len(lines)) + ' lines'
        #for line in lines:
        # NOTE(review): this unpacks a *generator expression* into two names,
        # which only succeeds when `lines` has exactly two elements; the
        # commented-out loop above suggests an explicit per-line loop was
        # intended — confirm against convert_tree's return value.
        entities, idx = (convert_tree(line, entities) for line in lines)
    return entities, idx
def main(args):
    """
    Extract documents and shuffle sentences within each document.

    Reads doctext documents from ``args.directory`` and writes a
    ``<directory>.shuffled`` file with each document's sentences in
    random order (attributes preserved).
    """
    try:
        # FIX: open the input inside ``with`` so the handle is closed on
        # all paths (the original leaked it).
        with open(args.directory, 'r') as fi, \
                open('{0}.shuffled'.format(args.directory), 'w') as fo:
            for lines, attributes in iterdoctext(fi):
                # in-place shuffle of the document's sentence list
                random.shuffle(lines)
                logging.debug('shuffled: %s', lines)
                writedoctext(fo, lines, **attributes)
        logging.info('done: %s', args.directory)
    except:
        # re-raise with the formatted traceback embedded in the message
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
def main(args):
    """Shuffle the sentences of every document in a doctext corpus.

    Output goes to ``<args.directory>.shuffled``; document attributes
    are carried over unchanged.
    """
    try:
        input_stream = open(args.directory, 'r')
        shuffled_path = '{0}.shuffled'.format(args.directory)
        with open(shuffled_path, 'w') as output_stream:
            for sentences, attributes in iterdoctext(input_stream):
                # shuffle in place, then emit the document
                random.shuffle(sentences)
                logging.debug('shuffled: %s', sentences)
                writedoctext(output_stream, sentences, **attributes)
        logging.info('done: %s', args.directory)
    except:
        # wrap any failure with its formatted traceback before re-raising
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
def main(args):
    """Convert each document's trees into d-sequence pattern lines."""
    for trees, attrs in iterdoctext(args.input):
        # one space-joined pattern line per tree, produced lazily
        pattern_lines = (
            ' '.join(dseqs(tree,
                           depth=args.depth,
                           no_punc=not args.punc,
                           lexicalised=args.lexicalised,
                           child_phrase=args.child,
                           backoff=['*']))
            for tree in trees)
        writedoctext(args.output, pattern_lines, **attrs)
def main(args):
    """Extract d-sequences for every document in a doctext stream.

    For each tree of each document, dseqs produces a list of patterns
    controlled by the depth / punctuation / lexicalisation flags; the
    patterns are space-joined into one output line per tree.
    """
    # reads in documents
    for trees, attrs in iterdoctext(args.input):
        # generator of d-sequences (one pattern list per tree, lazy)
        sequences = (dseqs(tree,
                           depth=args.depth,
                           no_punc=not args.punc,
                           lexicalised=args.lexicalised,
                           child_phrase=args.child,
                           backoff=['*']) for tree in trees)
        # writes d-sequences
        writedoctext(args.output, (' '.join(patterns) for patterns in sequences), **attrs)
def main(args):
    """Extract documents and output each document to separate file.

    Document ``i`` of ``args.directory`` is written verbatim (one
    sentence per line, attributes dropped) to ``<directory>.<i>``.
    """
    try:
        # FIX: open the input inside ``with`` so the handle is closed on
        # all paths (the original leaked it).
        with open(args.directory, 'r') as fi:
            idx = 0
            for lines, attributes in iterdoctext(fi):
                with open('{0}.{1}'.format(args.directory, idx), 'w') as fo:
                    logging.debug('done: %s', args.directory)
                    logging.debug('doc: %s', lines)
                    for line in lines:
                        print >> fo, line
                idx += 1
        logging.info('done: %s', args.directory)
    except:
        # re-raise with the formatted traceback embedded in the message
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
def main(args):
    """Split a doctext corpus: write every document to its own numbered file."""
    try:
        corpus = open(args.directory, 'r')
        for doc_number, (doc_lines, attributes) in enumerate(iterdoctext(corpus)):
            part_path = '{0}.{1}'.format(args.directory, doc_number)
            with open(part_path, 'w') as part:
                logging.debug('done: %s', args.directory)
                logging.debug('doc: %s', doc_lines)
                # one sentence per output line; attributes are dropped
                for doc_line in doc_lines:
                    print >> part, doc_line
        logging.info('done: %s', args.directory)
    except:
        # wrap any failure with its formatted traceback before re-raising
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
def main(args):
    """
    Converts doctext to good SGML

    Arguments
    ---------
    argparse's args (uses ``args.input`` and ``args.output``)
    """
    from discourse.doctext import iterdoctext
    from discourse.docsgml import MakeSGMLDocs

    sgmler = MakeSGMLDocs()
    # FIX: plain loop instead of a list comprehension evaluated purely for
    # its side effects (the throwaway result list was never used); also
    # dropped the unused local ``import sys``.
    for content, attrs in iterdoctext(args.input):
        sgmler.add_doc(content, **attrs)
    sgmler.write(args.output)
def main(args): logging.basicConfig(level=logging.INFO, format='%(levelname)s %(message)s') # reads docs from input docs = list(iterdoctext(args.input)) # distributes the jobs pool = Pool(args.jobs) logging.info('Distributing %d jobs to %d workers', len(docs), args.jobs) result = pool.map(partial(wrap_parse, args=args), docs) # stores the output times = [] for (content, attrs), (trees, dt) in itertools.izip(docs, result): writedoctext(args.output, trees, **attrs) times.append(dt) # dumps a summary print >> sys.stderr, tabulate(enumerate(times), headers=['doc', 'time'], tablefmt='pipe')
def main(args):
    """Parse a doctext corpus with a pool of workers and time each document."""
    logging.basicConfig(level=logging.INFO, format='%(levelname)s %(message)s')
    # materialise the corpus up front so results can be paired with inputs
    documents = list(iterdoctext(args.input))
    workers = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(documents), args.jobs)
    parses = workers.map(partial(wrap_parse, args=args), documents)
    # write parses back in input order, collecting per-document durations
    durations = []
    for (content, attrs), (trees, dt) in itertools.izip(documents, parses):
        writedoctext(args.output, trees, **attrs)
        durations.append(dt)
    # timing summary to stderr
    print >> sys.stderr, tabulate(enumerate(durations), headers=['doc', 'time'], tablefmt='pipe')
                         for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done), '{0}/{1}*'.format(output_dir, corpus))
    # outputs not yet produced for this corpus
    missing = todo - done
    return todo, done, missing


def wrap_dseqs((i, ipath, opath), depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.

    Note: the (index, input-path, output-path) job triple is taken via
    Python 2 tuple-parameter unpacking.
    """
    try:
        logging.info('(%d) %s ', i, ipath)
        fi = smart_open(ipath, 'r')
        fo = smart_open(opath, 'w')
        for trees, attrs in iterdoctext(fi):
            # one space-joined d-sequence line per tree
            sequences = [' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees]
            writedoctext(fo, sequences, **attrs)
    except:
        # re-raise with a formatted traceback so it survives Pool.map
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))


def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """
    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
def read_grids(istream, str2int):
    """Load entity grids from a doctext stream.

    Every line of a document is a sequence of role labels; ``str2int``
    maps each label to an integer code.  Returns one 2-D integer numpy
    array per document.
    """
    grids = []
    for lines, attrs in iterdoctext(istream):
        encoded_rows = [[str2int[role] for role in line] for line in lines]
        grids.append(np.array(encoded_rows, int))
    return grids
def read_alignments(istream):
    """Read alignment matrices from a doctext stream.

    Returns one 2-D integer numpy array per document; every element of
    every line must be convertible to int.
    """
    # FIX: `[alignment for alignment in line]` was an identity copy;
    # list(line) expresses the same conversion directly.  Stale
    # commented-out code removed.
    return [np.array([list(line) for line in lines], int)
            for lines, attrs in iterdoctext(istream)]
def read_grids(istream, str2int):
    """Read entity grids from a doctext stream.

    Each document's lines are rows of role labels; ``str2int`` maps each
    role label to an integer code.  Returns one 2-D integer numpy array
    per document.
    """
    return [
        np.array([[str2int[role] for role in line] for line in lines], int)
        for lines, attrs in iterdoctext(istream)
    ]
    # inputs present for this corpus vs. outputs already produced
    todo = frozenset(os.path.basename(path)
                     for path in glob('{0}/{1}*'.format(input_dir, corpus)))
    logging.info('%d files matching %s', len(todo), '{0}/{1}*'.format(input_dir, corpus))
    done = frozenset(os.path.basename(path)
                     for path in glob('{0}/{1}*'.format(output_dir, corpus)))
    logging.info('%d files matching %s', len(done), '{0}/{1}*'.format(output_dir, corpus))
    missing = todo - done
    return todo, done, missing


def wrap_dseqs((i, ipath, opath), depth, **kwargs):
    """
    Wrap a call to dseqs. To be used with Pool.map.

    Note: the (index, input-path, output-path) job triple is taken via
    Python 2 tuple-parameter unpacking.
    """
    try:
        logging.info('(%d) %s ', i, ipath)
        fi = smart_open(ipath, 'r')
        fo = smart_open(opath, 'w')
        for trees, attrs in iterdoctext(fi):
            # one space-joined d-sequence line per tree
            sequences = [' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees]
            writedoctext(fo, sequences, **attrs)
    except:
        # re-raise with a formatted traceback so it survives Pool.map
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))


def extract_dseqs(corpus, args, namespace, **kwargs):
    """
    Extracts dsequences for a certain corpus
    """
    logging.info('Extracting d-sequences for: %s', corpus)
    input_dir = namespace.trees
    output_dir = namespace.dseqs
    todo, done, missing = file_check(corpus, input_dir, output_dir)
def read_alignments(istream):
    """Read alignment matrices from a doctext stream.

    Returns one 2-D integer numpy array per document; every element of
    every line must be convertible to int.
    """
    #return [np.array([[str2int[role] for role in line] for line in lines], int) for lines, attrs in iterdoctext(istream)]
    # NOTE(review): the inner comprehension is an identity copy of `line`
    return [np.array([[ alignment for alignment in line] for line in lines], int) for lines, attrs in iterdoctext(istream)]