def main(args):
    """Run ``parse_and_save`` over every corpus listed on stdin and print a summary.

    Paths are read from standard input, one per line.  Lines starting with
    ``#`` and blank lines are ignored.  Each path is mapped to a corpus name
    with ``get_ldc_name`` and the corresponding ``<workspace>/raw/<name>.gz``
    file must already exist.

    :param args: parsed command-line options; ``args.workspace`` and
        ``args.jobs`` are read here and ``args`` is forwarded to every worker.
    :raises Exception: if the raw gzip of any listed corpus is missing.
    """
    files = []
    for line in sys.stdin:
        if line.startswith('#'):
            continue
        path = line.strip()
        # a blank line would otherwise become an empty corpus name and
        # fail the sanity check below
        if path:
            files.append(path)
    ldc_names = [get_ldc_name(path) for path in files]

    # sanity checks: fail before any worker is started
    for ldc_name in ldc_names:
        raw_path = '{0}/raw/{1}.gz'.format(args.workspace, ldc_name)
        if not os.path.exists(raw_path):
            # interpolate explicitly: Exception() does not apply %-formatting
            raise Exception('File not found: %s' % raw_path)

    # distribute jobs
    pool = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(ldc_names), args.jobs)
    t0 = time.time()
    result = pool.map(partial(parse_and_save, args=args), ldc_names)
    dt = time.time() - t0
    logging.info('Total time: %f seconds', dt)

    # one (corpus, value returned by parse_and_save) row per corpus; the value
    # is reported under the 'Time (s)' header
    data = list(zip(ldc_names, result))
    try:
        # prints a Markdown table if tabulate is available
        from tabulate import tabulate
    except ImportError:
        # plain table otherwise
        print('\n'.join('{0} {1}'.format(corpus, seconds)
                        for corpus, seconds in data))
    else:
        print(tabulate(data, headers=['Corpus', 'Time (s)'], tablefmt='pipe'))
def main(args):
    """Run ``badsgml2text`` over every corpus listed on stdin and print a summary.

    Paths are read from standard input, one per line.  Lines starting with
    ``#`` and blank lines are ignored.  Each path is mapped to a corpus name
    with ``get_ldc_name`` and ``<workspace>/bsgml_trees/<name>`` must already
    exist.

    :param args: parsed command-line options; ``args.workspace`` and
        ``args.jobs`` are read here and ``args`` is forwarded to every worker.
    :raises Exception: if the ``bsgml_trees`` entry of any listed corpus is
        missing.
    """
    files = []
    for line in sys.stdin:
        if line.startswith('#'):
            continue
        path = line.strip()
        # a blank line would otherwise become an empty corpus name
        if path:
            files.append(path)
    ldc_names = [get_ldc_name(path) for path in files]

    # sanity checks: fail before any worker is started
    for ldc_name in ldc_names:
        tree_path = '{0}/bsgml_trees/{1}'.format(args.workspace, ldc_name)
        if not os.path.exists(tree_path):
            # interpolate explicitly: Exception() does not apply %-formatting
            raise Exception('File not found: %s' % tree_path)

    # distribute jobs
    pool = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(ldc_names), args.jobs)
    t0 = time()
    results = pool.map(partial(badsgml2text, args=args), ldc_names)
    dt = time() - t0
    logging.info('Total time: %f seconds', dt)

    try:
        # prints a Markdown table if tabulate is available
        from tabulate import tabulate
    except ImportError:
        logging.info('Consider installing tabulate to get nice summaries.')
    else:
        # each worker result is a sequence of numbers: its length is shown as
        # 'Documents' and its sum as 'Total Sentences' (presumably one
        # sentence count per document -- confirm against badsgml2text)
        rows = [(ldc_name, len(result), sum(result), np.mean(result))
                for ldc_name, result in zip(ldc_names, results)]
        print(tabulate(rows,
                       headers=['Corpus', 'Documents', 'Total Sentences',
                                'Average Document Length'],
                       tablefmt='pipe'))
def extract_and_save_txt(sgml_gz, args):
    """Extract the documents of a gzipped SGML file into a gzipped text file.

    The input is read in full and parsed with ``TextFromSGML``.  Every
    document with non-empty text is written, split into lines, to
    ``<workspace>/raw/<stem>.gz`` via ``writedoctext``.

    :param sgml_gz: path to the gzipped SGML input.
    :param args: parsed command-line options; only ``args.workspace`` is read.
    :returns: list with the id of every document that was written.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    try:
        ids = []
        logging.info('Processing %s', sgml_gz)
        stem = get_ldc_name(sgml_gz)
        out_path = '{0}/raw/{1}.gz'.format(args.workspace, stem)
        with gzip.open(sgml_gz, 'rb') as fi, gzip.open(out_path, 'wb') as fo:
            parser = TextFromSGML(fi.read(), text_under='text', root='sgml')
            for doc in parser.iterdocs():
                # documents without text are skipped and not counted
                if doc['text']:
                    ids.append(doc['id'])
                    writedoctext(fo, doc['text'].split('\n'), id=doc['id'])
        logging.info('%s contains %d documents', stem, len(ids))
        return ids
    except:
        # Deliberately catches everything and re-raises with the traceback
        # text as the message -- presumably so the details survive when this
        # runs in a Pool worker and the error is sent back to the parent.
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
def extract_and_save_sgml(sgml_gz, args):
    """Extract the documents of a gzipped SGML file into a new gzipped SGML file.

    The input is read in full and parsed with ``TextFromSGML``.  Every
    document with non-empty text is added to a ``MakeSGMLDocs`` collection,
    which is then written with ``writegz`` to ``<workspace>/raw/<stem>``.

    :param sgml_gz: path to the gzipped SGML input.
    :param args: parsed command-line options; only ``args.workspace`` is read.
    :returns: list with the id of every document that was kept.
    :raises Exception: on any failure; the message is the formatted traceback
        of the original error.
    """
    try:
        ids = []
        logging.info('Processing %s', sgml_gz)
        stem = get_ldc_name(sgml_gz)
        with gzip.open(sgml_gz, 'rb') as fi:
            sgmler = MakeSGMLDocs(file=stem)
            parser = TextFromSGML(fi.read(), text_under='text', root='sgml')
            for doc in parser.iterdocs():
                # documents without text are skipped and not counted
                if doc['text']:
                    ids.append(doc['id'])
                    sgmler.add(doc['text'], id=doc['id'])
            sgmler.writegz('{0}/raw/{1}'.format(args.workspace, stem))
        logging.info('%s contains %d documents', stem, len(ids))
        return ids
    except:
        # Deliberately catches everything and re-raises with the traceback
        # text as the message -- presumably so the details survive when this
        # runs in a Pool worker and the error is sent back to the parent.
        raise Exception(''.join(traceback.format_exception(*sys.exc_info())))
def main(args):
    """Extract the documents of every SGML file listed on stdin and print counts.

    Paths are read from standard input, one per line.  Lines starting with
    ``#`` and blank lines are ignored.  Each file is handed to
    ``extract_and_save_sgml`` when ``args.sgml`` is set and to
    ``extract_and_save_txt`` otherwise, using a pool of ``args.jobs`` workers.

    :param args: parsed command-line options; ``args.jobs`` and ``args.sgml``
        are read here and ``args`` is forwarded to every worker.
    """
    files = []
    for line in sys.stdin:
        if line.startswith('#'):
            continue
        path = line.strip()
        # a blank line would otherwise be passed to a worker as an empty path
        if path:
            files.append(path)
    ldc_names = [get_ldc_name(path) for path in files]

    pool = Pool(args.jobs)
    worker = extract_and_save_sgml if args.sgml else extract_and_save_txt
    # workers receive the full path, not the corpus name
    results = pool.map(partial(worker, args=args), files)

    # each result is the list of document ids found in one file, so the
    # document total is the sum of their lengths (len(results) would only
    # count the input files)
    counts = [len(ids) for ids in results]
    logging.info('Documents: %d', sum(counts))

    data = list(zip(ldc_names, counts))
    try:
        # prints a Markdown table if tabulate is available
        from tabulate import tabulate
    except ImportError:
        # plain table otherwise
        print('\n'.join('{0} {1}'.format(c, n) for c, n in data))
    else:
        print(tabulate(data, headers=['Corpus', 'Documents'], tablefmt='pipe'))
def main(args):
    """Convert every corpus listed on stdin with ``badsgml2text`` and summarise.

    Paths are read from standard input, one per line.  Lines starting with
    ``#`` and blank lines are ignored.  Each path is mapped to a corpus name
    with ``get_ldc_name`` and ``<workspace>/bsgml_trees/<name>`` must already
    exist.

    :param args: parsed command-line options; ``args.workspace`` and
        ``args.jobs`` are read here and ``args`` is forwarded to every worker.
    :raises Exception: if the ``bsgml_trees`` entry of any listed corpus is
        missing.
    """
    files = []
    for line in sys.stdin:
        if line.startswith('#'):
            continue
        path = line.strip()
        # a blank line would otherwise become an empty corpus name
        if path:
            files.append(path)
    ldc_names = [get_ldc_name(path) for path in files]

    # sanity checks: fail before any worker is started
    for ldc_name in ldc_names:
        tree_path = '{0}/bsgml_trees/{1}'.format(args.workspace, ldc_name)
        if not os.path.exists(tree_path):
            # interpolate explicitly: Exception() does not apply %-formatting
            raise Exception('File not found: %s' % tree_path)

    # distribute jobs
    pool = Pool(args.jobs)
    logging.info('Distributing %d jobs to %d workers', len(ldc_names),
                 args.jobs)
    t0 = time()
    results = pool.map(partial(badsgml2text, args=args), ldc_names)
    dt = time() - t0
    logging.info('Total time: %f seconds', dt)

    try:
        # prints a Markdown table if tabulate is available
        from tabulate import tabulate
    except ImportError:
        logging.info('Consider installing tabulate to get nice summaries.')
    else:
        # each worker result is a sequence of numbers: its length is shown as
        # 'Documents' and its sum as 'Total Sentences' (presumably one
        # sentence count per document -- confirm against badsgml2text)
        rows = [(ldc_name, len(result), sum(result), np.mean(result))
                for ldc_name, result in zip(ldc_names, results)]
        print(tabulate(rows,
                       headers=[
                           'Corpus', 'Documents', 'Total Sentences',
                           'Average Document Length'
                       ],
                       tablefmt='pipe'))