Ejemplo n.º 1
0
    if 'Title' in doc.fields:
        outs.write(u"%%#Field\tTitle\n")
        write_text(doc.fields['Title'], outs)
    outs.write(u"%%#Field\tBody\n")
    write_text(doc.fields['Body'], outs)
            
def write_text(text, outs):
    """Write every token of *text* to *outs*, one token per line.

    Each token is an iterable of strings; its items are joined with tab
    characters and the resulting row is terminated by a newline.

    Args:
        text: iterable of tokens (each an iterable of strings).
        outs: output stream exposing a ``write`` method.
    """
    for fields in text:
        row = u"\t".join(fields)
        outs.write(row)
        outs.write("\n")

if __name__ == '__main__':
    import sys
    
    try:
        params, args = cmd_utils.get_params_sing(sys.argv[1:], 'i:o:m:ta', 'i', 0)
        if not os.path.isdir(params['i']):
            raise ValueError('Input must be a directory of files.')
    except ValueError as err:
        print('Error: {0}'.format(err))
        print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' +
            '[-a]').format(sys.argv[0]))
        print('       input_dir: the directory with the input text files.')
        print('       hunpos_model: the hunpos model file.')
        print('       output_file: the conll2 output file. If omitted, the result will')
        print('                    be written to stdout.')
        print('       hunpos_model: the hunpos model file.')
        print('       -a: the output is appended to output_file, instead of overwriting it.')
        sys.exit()
    
    if 'o' in params:
Ejemplo n.º 2
0
    if 'Body' in doc.fields:
        outs.write(u"%%#Field\tBody\n")
        write_text(doc.fields['Body'], outs)


def write_text(text, outs):
    """Emit the tokens in *text* as tab-separated lines on *outs*.

    Args:
        text: iterable of tokens; every token is an iterable of strings
            that become the columns of one output line.
        outs: writable stream; receives the joined columns and then a
            newline for each token.
    """
    separator = u"\t"
    for columns in text:
        outs.write(separator.join(columns))
        outs.write("\n")


if __name__ == '__main__':
    import sys

    try:
        params, args = cmd_utils.get_params_sing(sys.argv[1:], 'i:o:m:ta', 'i',
                                                 0)
        if not os.path.isdir(params['i']):
            raise ValueError('Input must be a directory of files.')
    except ValueError as err:
        print('Error: {0}'.format(err))
        print(('Usage: {0} -i input_dir [-o output_file] -m [hunpos_model] ' +
               '[-t] [-a]').format(sys.argv[0]))
        print('       input_dir: the directory with the input text files.')
        print('       hunpos_model: the hunpos model file.')
        print(
            '       output_file: the conll2 output file. If omitted, the result will'
        )
        print('                    be written to stdout.')
        print('       hunpos_model: the hunpos model file.')
        print(
            '       -t: If specified, the first non-empty line of the the text files are'
Ejemplo n.º 3
0
        for mapping in mappings:
            try:
                key, value = mapping.strip().split("\t")
                type_map[key] = value
            except (ValueError):
                continue
    return type_map

def print_usage_and_exit():
    """Print the command-line usage to stderr and terminate the process.

    The function is only called when the arguments are invalid, so it exits
    with status 2 (the conventional "command line usage error" code) rather
    than reporting success to the calling shell.

    Raises:
        SystemExit: always.
    """
    # Imported locally so the function does not depend on ``sys`` having been
    # imported by the ``__main__`` section further down in the file.
    import sys

    sys.stderr.write('Usage: {0} dbpedia_type_file [-c classes_OWL_file] [-m NE_mappings]\n'.format(__file__))
    sys.exit(2)

if __name__ == '__main__':
    # Script entry point: parse the command line, then read the type file
    # given as the single positional argument and apply the optional filters.
    import sys
    try:
        params, args = get_params_sing(sys.argv[1:], 'c:m:k', '', 1)
    except ValueError as ve:
        # An exception object cannot be concatenated with a str: doing so
        # raises TypeError and hides the real message, so convert it first.
        sys.stderr.write(str(ve) + "\n")
        print_usage_and_exit()

    if len(args) != 1:
        print_usage_and_exit()

#    with open(sys.argv[1], 'r', encoding = 'utf-8') as type_stream:
    with FileReader(args[0], encoding='utf-8').open() as type_stream:
        lines = merge_pairs(extract_dbpedia_type(type_stream))
        # The -m mapping is loaded once and shared by both filters below;
        # it stays None when -m was not given.
        filter = __read_map(params['m']) if 'm' in params else None
        if 'c' in params:
            lines = filter_general(lines, OwlClassHierarchy(params['c']), filter)
        if 'm' in params:
            # The presence of the -k flag is passed on as a boolean.
            lines = filter_type(lines, filter, 'k' in params)