def parse_multipe_files(input_filepaths, output_file_directory):
    """
    Parse multiple input files serially.

    For an input located at ``<parent>/<subdir>/<name>`` the output path is
    ``<output_file_directory>/<subdir>/<name>.json.gz``. Inputs whose output
    file already exists are skipped, so an interrupted run can be resumed.

    Args:
        input_filepaths: Iterable of input file paths.
        output_file_directory: Root directory for the outputs. A trailing
            path separator is optional.
    """
    for input_filepath in input_filepaths:
        print(input_filepath)

        # Only the immediate parent directory name is mirrored under the
        # output root, not the full input directory hierarchy.
        parent_dir, basename = os.path.split(input_filepath)
        subdir = os.path.basename(parent_dir)

        # os.path.join inserts a separator only when one is missing, so the
        # result is the same with or without a trailing slash on the root.
        output_subdir = os.path.join(output_file_directory, subdir)
        # presumably creates the directory if it is missing -- helper is
        # defined elsewhere; confirm its behavior there.
        ensure_directory(output_subdir)

        output_path = os.path.join(output_subdir, basename + '.json.gz')
        if os.path.isfile(output_path):
            # Output already present: do not parse this input again.
            continue

        parse_single_file(input_filepath, output_path)
'pid': pid, 'pos': sentence['pos'], 'sid': sentence['sid'], 'tokens': sentence['tokens'], 'links': mapped_links } json_str = json.dumps(tagged_sentence) + '\n' file_o.write(json_str.encode('utf-8')) #pylint:disable=invalid-name if __name__ == '__main__': if len(sys.argv) != 3: print('Usage: python utils/wikification/documents_to_sentences.py', '../data/processed/wikification/input_directory/', '../data/processed/wikification/output_directory/') sys.exit(1) input_directory = sys.argv[1] output_directory = sys.argv[2] filepaths = list(glob.iglob(input_directory + '**/wiki_*')) for filepath in filepaths: print(filepath) subdir = os.path.basename(os.path.split(filepath)[0]) basename = os.path.split(filepath)[1] ensure_directory(output_directory + subdir) output_path = output_directory + subdir + '/' + basename if os.path.isfile(output_path): continue parse_single_file(filepath, output_path)