Example #1
0
            raise ValueError('Input must be a directory of files.')
    except ValueError as err:
        print('Error: {0}'.format(err))
        print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' +
            '[-a]').format(sys.argv[0]))
        print('       input_dir: the directory with the input text files.')
        print('       hunpos_model: the hunpos model file.')
        print('       output_file: the conll2 output file. If omitted, the result will')
        print('                    be written to stdout.')
        print('       hunpos_model: the hunpos model file.')
        print('       -a: the output is appended to output_file, instead of overwriting it.')
        sys.exit()
    
    # Pick the output sink: the file named by -o (opened in append mode when
    # -a is present, otherwise overwritten), or stdout when -o is absent.
    write_to_file = 'o' in params
    if write_to_file:
        output_mode = 'a' if 'a' in params else 'w'
        out = FileWriter(params['o'], output_mode).open()
    else:
        out = StreamWriter(sys.stdout)

    try:
        nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
        # Walk the input directory recursively and process every file found.
        for dirpath, _, filenames in os.walk(params['i']):
            for filename in filenames:
                infile = os.path.join(dirpath, filename)
                # Progress goes to stderr: stdout is the data sink when -o
                # is omitted, so printing there would corrupt the output.
                sys.stderr.write("File " + infile + "\n")
                doc = FieldedDocument(infile)
                doc.fields = {}
                # Tag the raw text of every field returned by read_file.
                for field, raw_text in read_file(infile).iteritems():
                    doc.fields[field] = nt.tag_raw(raw_text)
                write_doc(doc, out)
    finally:
        # Close the output file even when processing fails part-way, so
        # what has been written so far is not left in an unclosed handle.
        # The stdout wrapper is deliberately left open, as before.
        if write_to_file:
            out.close()
Example #2
0
            '       -a: the output is appended to output_file, instead of overwriting it.'
        )
        sys.exit()

    # Pick the output sink: the file named by -o (opened in append mode when
    # -a is present, otherwise overwritten), or stdout when -o is absent.
    write_to_file = 'o' in params
    if write_to_file:
        output_mode = 'a' if 'a' in params else 'w'
        out = FileWriter(params['o'], output_mode).open()
    else:
        out = StreamWriter(sys.stdout)

    try:
        nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
        # Only regular files directly inside the input directory are
        # processed; os.listdir does not descend into subdirectories.
        input_dir = params['i']
        candidates = [
            os.path.join(input_dir, name) for name in os.listdir(input_dir)
        ]
        for infile in filter(os.path.isfile, candidates):
            doc = FieldedDocument(infile)
            doc.fields = {}
            for field, raw_text in read_file(infile, True).iteritems():
                filtered = nt.filter_long_sentences(raw_text)
                # Report on stderr how much text the filter removed.
                diff = len(raw_text) - len(filtered)
                if diff > 0:
                    sys.stderr.write("{0}: {1} bytes filtered.\n".format(
                        infile, diff))
                # Fields left empty by the filter are not tagged or stored.
                if len(filtered) > 0:
                    doc.fields[field] = nt.tag_raw(filtered)
            # Documents with no remaining fields are not written.
            if len(doc.fields) > 0:
                write_doc(doc, out)
    finally:
        # Close the output file even when processing fails part-way.
        # The stdout wrapper is deliberately left open, as before.
        if write_to_file:
            out.close()
Example #3
0
        print('                    be written to stdout.')
        print('       hunpos_model: the hunpos model file.')
        print('       -t: If specified, the first non-empty line of the the text files are')
        print('           considered to be titles, and will be processed accordingly.')
        print('       -a: the output is appended to output_file, instead of overwriting it.')
        sys.exit()
    
    # Pick the output sink: the file named by -o (opened in append mode when
    # -a is present, otherwise overwritten), or stdout when -o is absent.
    write_to_file = 'o' in params
    if write_to_file:
        output_mode = 'a' if 'a' in params else 'w'
        out = FileWriter(params['o'], output_mode).open()
    else:
        out = StreamWriter(sys.stdout)

    try:
        nt = NltkTools(pos=True, stem=True, tok=True, pos_model=params.get('m'))
        # Only regular files directly inside the input directory are
        # processed; os.listdir does not descend into subdirectories.
        input_dir = params['i']
        candidates = [os.path.join(input_dir, name)
                      for name in os.listdir(input_dir)]
        for infile in filter(os.path.isfile, candidates):
            doc = FieldedDocument(infile)
            doc.fields = {}
            # NOTE(review): the usage text documents a -t flag for title
            # handling, but the second argument here is always True --
            # confirm whether it should be `'t' in params`.
            for field, raw_text in read_file(infile, True).iteritems():
                filtered = nt.filter_long_sentences(raw_text)
                # Report on stderr how much text the filter removed.
                diff = len(raw_text) - len(filtered)
                if diff > 0:
                    sys.stderr.write("{0}: {1} bytes filtered.\n".format(infile, diff))
                # Fields left empty by the filter are not tagged or stored.
                if len(filtered) > 0:
                    doc.fields[field] = nt.tag_raw(filtered)
            # Documents with no remaining fields are not written.
            if len(doc.fields) > 0:
                write_doc(doc, out)
    finally:
        # Close the output file even when processing fails part-way.
        # The stdout wrapper is deliberately left open, as before.
        if write_to_file:
            out.close()