def badsgml2text(ldc_name, args):
    """Convert a loosely SGML-formatted file into a gzipped doctext file.

    Reads ``<workspace>/bsgml_trees/<ldc_name>``, collects the non-empty
    lines found between each ``<doc id="...">`` line and the following
    ``</doc>`` line, and writes every such document to
    ``<workspace>/trees/<ldc_name>.gz`` through ``writedoctext``.

    :param ldc_name: file name, used for both the input and the output path
    :param args: namespace providing ``workspace``
    :returns: list with the number of lines of each document written
    :raises Exception: any failure is re-raised as a plain ``Exception``
        whose message is the formatted traceback (presumably so the
        traceback survives when this runs in a worker process -- confirm
        against the caller)
    """
    path = '{0}/bsgml_trees/{1}'.format(args.workspace, ldc_name)
    logging.info('Processing %s', path)
    n_lines = []
    doc_re = re.compile('<doc id="(.+)">')
    try:
        with open(path, 'r') as fi:
            with gzip.open('{0}/trees/{1}.gz'.format(args.workspace, ldc_name), 'wb') as fo:
                doc_id, doc_lines = None, None
                # stream the input instead of loading the whole file at once
                for raw_line in fi:
                    line = raw_line.rstrip('\n')
                    m = doc_re.match(line)
                    if m is not None:
                        # starts a doc (an unterminated previous doc is dropped)
                        doc_id = m.group(1)
                        doc_lines = []
                    elif line == '</doc>':
                        if doc_lines is None:
                            # closing tag without an opening one: nothing to
                            # write, and len(None) would crash below
                            logging.warning('Ignoring </doc> without a matching <doc> in %s', path)
                            continue
                        # ends the doc: write it out and reset the state
                        n_lines.append(len(doc_lines))
                        writedoctext(fo, doc_lines, id=doc_id)
                        doc_lines = None
                        doc_id = None
                    elif line and doc_lines is not None:
                        # non-empty line inside an open doc
                        doc_lines.append(line)
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately left untouched
        raise Exception(traceback.format_exc())
    return n_lines
def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed).

    Reads the documents stored in ``<workspace>/trees/<name>.gz``, pipes the
    lines of each document to the external command given by
    ``args.ExtractGrid`` and writes whatever the command prints to
    ``<workspace>/grids/<name>.gz``, keeping the document id.

    :param ldc_desc: mapping whose ``'name'`` entry identifies the corpus file
    :param args: namespace providing ``workspace``, ``dry_run`` and
        ``ExtractGrid`` (a command line, split with ``shlex``)
    :returns: elapsed wall-clock time in seconds
    :raises Exception: any failure is re-raised as a plain ``Exception``
        whose message is the formatted traceback
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            # the command does not change between documents: split it once
            cmd_args = shlex.split(args.ExtractGrid)
            with gzip.open(input_path + '.gz', 'rb') as fi:
                with gzip.open(output_path + '.gz', 'wb') as fo:
                    for lines, attrs in iterdoctext(fi):
                        logging.debug('document %s', attrs['id'])
                        proc = subprocess.Popen(cmd_args,
                                                stdin=subprocess.PIPE,
                                                stdout=subprocess.PIPE)
                        # stderr is not piped, so only stdout carries data
                        stdoutdata, _ = proc.communicate(
                            '{0}\n'.format('\n'.join(lines)))
                        if proc.returncode != 0:
                            # do not abort the corpus, but make the failure visible
                            logging.warning('%s exited with status %d on document %s',
                                            args.ExtractGrid, proc.returncode, attrs['id'])
                        writedoctext(fo, stdoutdata.split('\n'), id=attrs['id'])
        logging.info('done: %s', output_path)
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately left untouched
        raise Exception(traceback.format_exc())
    return time() - t0
def grids_from_text(ldc_desc, args):
    """Extract grids for documents in a corpus (already parsed).

    Reads the documents stored in ``<workspace>/trees/<name>.gz``, pipes the
    lines of each document to the external command given by
    ``args.ExtractGrid`` and writes whatever the command prints to
    ``<workspace>/grids/<name>.gz``, keeping the document id.

    :param ldc_desc: mapping whose ``'name'`` entry identifies the corpus file
    :param args: namespace providing ``workspace``, ``dry_run`` and
        ``ExtractGrid`` (a command line, split with ``shlex``)
    :returns: elapsed wall-clock time in seconds
    :raises Exception: any failure is re-raised as a plain ``Exception``
        whose message is the formatted traceback
    """
    t0 = time()
    try:
        input_path = '{0}/trees/{1}'.format(args.workspace, ldc_desc['name'])
        output_path = '{0}/grids/{1}'.format(args.workspace, ldc_desc['name'])
        logging.info('processing: %s', input_path)
        if not args.dry_run:
            # the command does not change between documents: split it once
            cmd_args = shlex.split(args.ExtractGrid)
            with gzip.open(input_path + '.gz', 'rb') as fi:
                with gzip.open(output_path + '.gz', 'wb') as fo:
                    for lines, attrs in iterdoctext(fi):
                        logging.debug('document %s', attrs['id'])
                        proc = subprocess.Popen(cmd_args,
                                                stdin=subprocess.PIPE,
                                                stdout=subprocess.PIPE)
                        # stderr is not piped, so only stdout carries data
                        stdoutdata, _ = proc.communicate(
                            '{0}\n'.format('\n'.join(lines)))
                        if proc.returncode != 0:
                            # do not abort the corpus, but make the failure visible
                            logging.warning('%s exited with status %d on document %s',
                                            args.ExtractGrid, proc.returncode, attrs['id'])
                        writedoctext(fo, stdoutdata.split('\n'), id=attrs['id'])
        logging.info('done: %s', output_path)
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately left untouched
        raise Exception(traceback.format_exc())
    return time() - t0
def main(args):
    """Extract documents and shuffle sentences within each document.

    Reads documents from the file at ``args.directory`` (opened as a regular
    file, despite the attribute name), shuffles the lines of each document
    in place with ``random.shuffle`` and writes the result, with the
    original document attributes, to ``<args.directory>.shuffled``.

    :param args: namespace providing ``directory``
    :raises Exception: any failure is re-raised as a plain ``Exception``
        whose message is the formatted traceback
    """
    try:
        # both handles are managed so the input is closed even on failure
        with open(args.directory, 'r') as fi:
            with open('{0}.shuffled'.format(args.directory), 'w') as fo:
                for lines, attributes in iterdoctext(fi):
                    random.shuffle(lines)
                    logging.debug('shuffled: %s', lines)
                    writedoctext(fo, lines, **attributes)
        logging.info('done: %s', args.directory)
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately left untouched
        raise Exception(traceback.format_exc())
def main(args):
    """Turn every document read from ``args.input`` into d-sequences.

    Each tree of a document is passed to ``dseqs`` (configured from
    ``args.depth``, ``args.punc``, ``args.lexicalised`` and ``args.child``);
    the patterns returned for a tree are joined with single spaces into one
    output line.  The lines are written to ``args.output`` together with the
    attributes of the source document.
    """
    for doc_trees, doc_attrs in iterdoctext(args.input):
        # lazy pipeline: one space-joined line of patterns per tree
        output_lines = (
            ' '.join(dseqs(t,
                           depth=args.depth,
                           no_punc=not args.punc,
                           lexicalised=args.lexicalised,
                           child_phrase=args.child,
                           backoff=['*']))
            for t in doc_trees
        )
        writedoctext(args.output, output_lines, **doc_attrs)
def extract_and_save_txt(sgml_gz, args):
    """Extract documents from a gzipped SGML file and return their ids.

    The whole input is handed to ``TextFromSGML``; every document with
    non-empty text is written, one line per text line, to
    ``<workspace>/raw/<stem>.gz`` where ``stem`` is ``get_ldc_name(sgml_gz)``.

    :param sgml_gz: path to the gzipped SGML file
    :param args: namespace providing ``workspace``
    :returns: list with the ids of the documents written
    :raises Exception: any failure is re-raised as a plain ``Exception``
        whose message is the formatted traceback
    """
    try:
        logging.info('Processing %s', sgml_gz)
        stem = get_ldc_name(sgml_gz)
        ids = []
        with gzip.open(sgml_gz, 'rb') as fi:
            with gzip.open('{0}/raw/{1}.gz'.format(args.workspace, stem), 'wb') as fo:
                parser = TextFromSGML(fi.read(), text_under='text', root='sgml')
                for doc in parser.iterdocs():
                    # documents without text are skipped altogether
                    if doc['text']:
                        ids.append(doc['id'])
                        writedoctext(fo, doc['text'].split('\n'), id=doc['id'])
        logging.info('%s contains %d documents', stem, len(ids))
        return ids
    except Exception:
        # KeyboardInterrupt/SystemExit are deliberately left untouched
        raise Exception(traceback.format_exc())
def main(args):
    """Process the documents of ``args.input`` in parallel.

    All documents are read up front, ``wrap_parse`` is mapped over them with
    a pool of ``args.jobs`` worker processes, the resulting trees are written
    to ``args.output`` with the attributes of the source document, and a
    per-document table of the reported times is printed to stderr.

    :param args: namespace providing ``input``, ``output`` and ``jobs``
        (and whatever ``wrap_parse`` reads from it)
    """
    logging.basicConfig(level=logging.INFO, format='%(levelname)s %(message)s')

    # reads docs from input
    docs = list(iterdoctext(args.input))

    # distributes the jobs
    pool = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(docs), args.jobs)
    try:
        result = pool.map(partial(wrap_parse, args=args), docs)
    except BaseException:
        # do not leave workers busy with work nobody will collect
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        # always reap the worker processes
        pool.join()

    # stores the output
    times = []
    for (content, attrs), (trees, dt) in itertools.izip(docs, result):
        writedoctext(args.output, trees, **attrs)
        times.append(dt)

    # dumps a summary
    print >> sys.stderr, tabulate(enumerate(times), headers=['doc', 'time'], tablefmt='pipe')
return todo, done, missing def wrap_dseqs((i, ipath, opath), depth, **kwargs): """ Wrap a call to dseqs. To be used with Pool.map. """ try: logging.info('(%d) %s ', i, ipath) fi = smart_open(ipath, 'r') fo = smart_open(opath, 'w') for trees, attrs in iterdoctext(fi): sequences = [ ' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees ] writedoctext(fo, sequences, **attrs) except: raise Exception(''.join(traceback.format_exception(*sys.exc_info()))) def extract_dseqs(corpus, args, namespace, **kwargs): """ Extracts dsequences for a certain corpus """ logging.info('Extracting d-sequences for: %s', corpus) input_dir = namespace.trees output_dir = namespace.dseqs todo, done, missing = file_check(corpus, input_dir, output_dir) if not missing:
done = frozenset(os.path.basename(path) for path in glob('{0}/{1}*'.format(output_dir, corpus))) logging.info('%d files matching %s', len(done), '{0}/{1}*'.format(output_dir, corpus)) missing = todo - done return todo, done, missing def wrap_dseqs((i, ipath, opath), depth, **kwargs): """ Wrap a call to dseqs. To be used with Pool.map. """ try: logging.info('(%d) %s ', i, ipath) fi = smart_open(ipath, 'r') fo = smart_open(opath, 'w') for trees, attrs in iterdoctext(fi): sequences = [' '.join(dseqs(tree, depth=depth, **kwargs)) for tree in trees] writedoctext(fo, sequences, **attrs) except: raise Exception(''.join(traceback.format_exception(*sys.exc_info()))) def extract_dseqs(corpus, args, namespace, **kwargs): """ Extracts dsequences for a certain corpus """ logging.info('Extracting d-sequences for: %s', corpus) input_dir = namespace.trees output_dir = namespace.dseqs todo, done, missing = file_check(corpus, input_dir, output_dir) if not missing: logging.info('all d-sequences of depth %d are there, nothing to be done', args.depth)