def parse_multipe_files(input_filepaths, output_file_directory):
    """
    Parse multiple input files serially.

    For every input path the output is written to
    ``<output_file_directory>/<name of the input's parent directory>/
    <input file name>.json.gz``. Each input path is printed before it is
    handled, and inputs whose output file already exists are skipped, so an
    interrupted run can be restarted without redoing finished files.

    Args:
        input_filepaths: Iterable of paths of the files to parse.
        output_file_directory: Root directory for the outputs. A trailing
            path separator is optional.
    """
    for input_filepath in input_filepaths:
        print(input_filepath)
        parent_directory, basename = os.path.split(input_filepath)
        subdir = os.path.basename(parent_directory)
        # os.path.join (rather than '+') keeps the result correct when the
        # output directory is given without a trailing separator; with one,
        # the resulting paths are the same as plain concatenation.
        target_directory = os.path.join(output_file_directory, subdir)
        # NOTE(review): ensure_directory is defined elsewhere; presumably it
        # creates the directory when it is missing -- confirm.
        ensure_directory(target_directory)
        output_path = os.path.join(target_directory, basename + '.json.gz')
        if os.path.isfile(output_path):
            # Output already present: do not parse this input again.
            continue
        parse_single_file(input_filepath, output_path)
# Example no. 2
# 0
                        'pid': pid,
                        'pos': sentence['pos'],
                        'sid': sentence['sid'],
                        'tokens': sentence['tokens'],
                        'links': mapped_links
                    }
                    json_str = json.dumps(tagged_sentence) + '\n'
                    file_o.write(json_str.encode('utf-8'))


#pylint:disable=invalid-name
if __name__ == '__main__':
    # Command-line entry point. Takes an input directory and an output
    # directory, runs parse_single_file on every matching ``wiki_*`` file and
    # writes the result to <output_directory>/<parent dir name>/<file name>.
    if len(sys.argv) != 3:
        print('Usage: python utils/wikification/documents_to_sentences.py',
              '../data/processed/wikification/input_directory/',
              '../data/processed/wikification/output_directory/')
        sys.exit(1)
    input_directory = sys.argv[1]
    output_directory = sys.argv[2]

    # Paths are assembled with os.path.join so both directories work with or
    # without a trailing separator. glob is called without recursive=True, so
    # '**' matches a single directory level, exactly like '*'.
    # The matches are collected into a list before any output is written.
    filepaths = glob.glob(os.path.join(input_directory, '**', 'wiki_*'))
    for filepath in filepaths:
        print(filepath)
        parent_directory, basename = os.path.split(filepath)
        subdir = os.path.basename(parent_directory)
        target_directory = os.path.join(output_directory, subdir)
        # NOTE(review): ensure_directory is defined elsewhere; presumably it
        # creates the directory when it is missing -- confirm.
        ensure_directory(target_directory)
        output_path = os.path.join(target_directory, basename)
        if os.path.isfile(output_path):
            # Output already present: skip so a rerun resumes where it stopped.
            continue
        parse_single_file(filepath, output_path)