# NOTE(review): this line is a whitespace-collapsed fragment of a Python 2
# script (print statement, dict.iteritems()). It starts inside a `try` block
# whose opening is not visible here, so the nesting described below is
# inferred from statement order -- confirm against the properly indented file.
#
# Apparent flow of the visible statements:
#   1. Raise ValueError('Input must be a directory of files.'). The
#      `except ValueError` handler prints the error, prints the usage text and
#      calls sys.exit().
#   2. Choose the output sink: FileWriter(params['o'], mode).open() when 'o'
#      is in params (mode is 'a' if 'a' is in params, otherwise 'w');
#      StreamWriter(sys.stdout) otherwise.
#   3. Build NltkTools(pos=True, stem=True, tok=True) with
#      pos_model=params.get('m'), i.e. None when no 'm' entry is present.
#   4. Walk params['i'] recursively with os.walk. For every file path: print
#      "File " + path, create a FieldedDocument, store nt.tag_raw(raw_text)
#      in doc.fields for each (field, raw_text) pair from read_file(path),
#      then call write_doc(doc, out).
#   5. Call out.close() when 'o' is in params.
#
# NOTE(review): the usage text prints the "hunpos_model" line twice.
# NOTE(review): `print "File " + infile` goes to stdout, which is also the
# stream handed to StreamWriter when no 'o' entry is given -- presumably the
# progress line then mixes into the output; confirm whether that is intended.
raise ValueError('Input must be a directory of files.') except ValueError as err: print('Error: {0}'.format(err)) print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' + '[-a]').format(sys.argv[0])) print(' input_dir: the directory with the input text files.') print(' hunpos_model: the hunpos model file.') print(' output_file: the conll2 output file. If omitted, the result will') print(' be written to stdout.') print(' hunpos_model: the hunpos model file.') print(' -a: the output is appended to output_file, instead of overwriting it.') sys.exit() if 'o' in params: output_mode = 'a' if 'a' in params else 'w' out = FileWriter(params['o'], output_mode).open() else: out = StreamWriter(sys.stdout) nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m')) for infile in (os.path.join(d, f) for d, _, fs in os.walk(params['i']) for f in fs): print "File " + infile doc = FieldedDocument(infile) doc.fields = {} for field, raw_text in read_file(infile).iteritems(): doc.fields[field] = nt.tag_raw(raw_text) write_doc(doc, out) if 'o' in params: out.close()
# NOTE(review): this line is a whitespace-collapsed fragment of a Python 2
# script (dict.iteritems()). It starts in the middle of a parenthesized
# expression opened out of view (the last usage string and its closing `)`),
# so the nesting described below is inferred from statement order -- confirm
# against the properly indented file.
#
# Apparent flow of the visible statements:
#   1. Finish the usage output and call sys.exit().
#   2. Choose the output sink: FileWriter(params['o'], mode).open() when 'o'
#      is in params (mode is 'a' if 'a' is in params, otherwise 'w');
#      StreamWriter(sys.stdout) otherwise.
#   3. Build NltkTools(pos=True, stem=True, tok=True) with
#      pos_model=params.get('m').
#   4. List params['i'] with os.listdir (not recursive), join each name onto
#      the directory and keep only the paths accepted by os.path.isfile.
#   5. For every such file: create a FieldedDocument and, for each
#      (field, raw_text) pair from read_file(path, True), run
#      nt.filter_long_sentences(raw_text). If the result is shorter than the
#      input, report the length difference on stderr. Non-empty results are
#      stored as nt.tag_raw(filtered) in doc.fields.
#   6. Call write_doc(doc, out) only when doc.fields is non-empty.
#   7. Call out.close() when 'o' is in params.
' -a: the output is appended to output_file, instead of overwriting it.' ) sys.exit() if 'o' in params: output_mode = 'a' if 'a' in params else 'w' out = FileWriter(params['o'], output_mode).open() else: out = StreamWriter(sys.stdout) nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m')) for infile in filter(os.path.isfile, [ os.path.join(params['i'], infile) for infile in os.listdir(params['i']) ]): doc = FieldedDocument(infile) doc.fields = {} for field, raw_text in read_file(infile, True).iteritems(): filtered = nt.filter_long_sentences(raw_text) diff = len(raw_text) - len(filtered) if diff > 0: sys.stderr.write("{0}: {1} bytes filtered.\n".format( infile, diff)) if len(filtered) > 0: doc.fields[field] = nt.tag_raw(filtered) if len(doc.fields) > 0: write_doc(doc, out) if 'o' in params: out.close()
# NOTE(review): this line is a whitespace-collapsed fragment of a Python 2
# script (dict.iteritems()). It starts part-way through a run of usage
# print() calls whose enclosing block is not visible, so the nesting described
# below is inferred from statement order -- confirm against the properly
# indented file.
#
# Apparent flow of the visible statements:
#   1. Print the remaining usage lines (output_file continuation,
#      hunpos_model, -t, -a) and call sys.exit().
#   2. Choose the output sink: FileWriter(params['o'], mode).open() when 'o'
#      is in params (mode is 'a' if 'a' is in params, otherwise 'w');
#      StreamWriter(sys.stdout) otherwise.
#   3. Build NltkTools(pos=True, stem=True, tok=True) with
#      pos_model=params.get('m').
#   4. List params['i'] with os.listdir (not recursive), join each name onto
#      the directory and keep only the paths accepted by os.path.isfile.
#   5. For every such file: create a FieldedDocument and, for each
#      (field, raw_text) pair from read_file(path, True), run
#      nt.filter_long_sentences(raw_text). If the result is shorter than the
#      input, report the length difference on stderr. Non-empty results are
#      stored as nt.tag_raw(filtered) in doc.fields.
#   6. Call write_doc(doc, out) only when doc.fields is non-empty.
#   7. Call out.close() when 'o' is in params.
#
# NOTE(review): the -t usage string contains a doubled word ("the the").
# NOTE(review): read_file is always called with True as its second argument
# in the visible code; whether the -t option is consulted elsewhere cannot be
# seen from here -- verify.
print(' be written to stdout.') print(' hunpos_model: the hunpos model file.') print(' -t: If specified, the first non-empty line of the the text files are') print(' considered to be titles, and will be processed accordingly.') print(' -a: the output is appended to output_file, instead of overwriting it.') sys.exit() if 'o' in params: output_mode = 'a' if 'a' in params else 'w' out = FileWriter(params['o'], output_mode).open() else: out = StreamWriter(sys.stdout) nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m')) for infile in filter(os.path.isfile, [os.path.join(params['i'], infile) for infile in os.listdir(params['i'])]): doc = FieldedDocument(infile) doc.fields = {} for field, raw_text in read_file(infile, True).iteritems(): filtered = nt.filter_long_sentences(raw_text) diff = len(raw_text) - len(filtered) if diff > 0: sys.stderr.write("{0}: {1} bytes filtered.\n".format(infile, diff)) if len(filtered) > 0: doc.fields[field] = nt.tag_raw(filtered) if len(doc.fields) > 0: write_doc(doc, out) if 'o' in params: out.close()