def test_serialisation(nlp, docs_path, models_path):
    """
    process every 'txt' file under a folder with nlp_process_doc and
    serialise the collected results
    :param nlp: not used by this function
    :param docs_path: folder holding the 'txt' files to process
    :param models_path: destination handed to jl.dump for the collected list
    :return:
    """
    collected = []
    # the same list object is passed to every nlp_process_doc call as an
    # extra argument - presumably the workers append to it (confirm in
    # nlp_process_doc)
    utils.multi_thread_process_files(
        dir_path=docs_path,
        file_extension='txt',
        num_threads=10,
        process_func=nlp_process_doc,
        args=[collected])
    jl.dump(collected, models_path)
def process_doc_anns(anns_folder, full_text_folder, rule_config_file, output_folder,
                     study_folder=None,
                     study_config='study.json', full_text_fn_ptn='%s.txt', fn_pattern='se_ann_%s.json',
                     thread_num=10, es_inst=None, es_text_field='', patient_id_field='',
                     combined_anns=None, es_output_index=None, es_output_doc='doc'):
    """
    post process document annotations in parallel
    :param anns_folder: folder containing the annotation files
    :param full_text_folder: folder of full text files, used only when
     es_inst is None
    :param rule_config_file: passed to load_study_ruler
    :param output_folder: forwarded to the per file / per line worker
    :param study_folder: passed to load_study_ruler
    :param study_config: passed to load_study_ruler
    :param full_text_fn_ptn: file name pattern given to FileTextReader
    :param fn_pattern: forwarded to the per file / per line worker
    :param thread_num: number of threads used when processing annotation
     files one by one (combined_anns is None)
    :param es_inst: semquery.SemEHRES instance; when given, full texts are
     read through ESTextReader instead of from full_text_folder
    :param es_text_field: the full text field name in the es index
    :param patient_id_field: passed to ESTextReader
    :param combined_anns: only tested against None - when None every 'json'
     file in anns_folder is handled by analyse_doc_anns_file; otherwise every
     file in anns_folder is treated as a large file and handled by
     analyse_doc_anns_line
    :param es_output_index: forwarded to the per file / per line worker
    :param es_output_doc: forwarded to the per file / per line worker
    :return:
    """
    # choose where full texts come from: elasticsearch if an instance is
    # supplied, plain files otherwise
    if es_inst is not None:
        text_reader = ESTextReader(es_inst, es_text_field, patient_id_field=patient_id_field)
    else:
        text_reader = FileTextReader(full_text_folder, full_text_fn_ptn)

    study = load_study_ruler(study_folder, rule_config_file, study_config)
    sa = study['sa']
    ruler = study['ruler']

    # extra arguments shared by both worker functions; a fresh list is built
    # from this tuple for every dispatch call
    worker_args = (ruler, text_reader, output_folder, fn_pattern,
                   es_inst, es_output_index, es_output_doc, sa)

    if combined_anns is not None:
        # resolve the regular files up front, then process each large file
        # in turn
        ann_paths = []
        for name in listdir(anns_folder):
            path = join(anns_folder, name)
            if isfile(path):
                ann_paths.append(path)
        for path in ann_paths:
            utils.multi_process_large_file_tasking(
                large_file=path,
                process_func=analyse_doc_anns_line,
                args=list(worker_args))
    else:
        utils.multi_thread_process_files(
            dir_path=anns_folder,
            file_extension='json',
            num_threads=thread_num,
            process_func=analyse_doc_anns_file,
            args=list(worker_args))
    logging.info('post processing of ann docs done')
def process_files(read_dir, write_dir):
    """
    run save_full_text over every 'xml' file in a folder using 10 threads
    :param read_dir: folder containing the 'xml' files
    :param write_dir: passed to save_full_text as its extra argument
    :return:
    """
    extra_args = [write_dir]
    utils.multi_thread_process_files(
        dir_path=read_dir,
        file_extension='xml',
        num_threads=10,
        process_func=save_full_text,
        args=extra_args)