def task_ner(file_ids_list=None):
    """Feed the selected files through ``process_ner`` and store each result.

    The input is obtained from ``boilerplate.get_process_data`` and handed to
    ``process_ner``. Every ``(file id, contents)`` pair it yields is stored
    with ``boilerplate.add_processed_file`` under ``boilerplate.NER_PREFIX``
    using the ``txt`` extension.

    Args:
        file_ids_list: Optional selection of file ids, forwarded unchanged to
            ``boilerplate.get_process_data``.

    Returns:
        A list holding the value returned by
        ``boilerplate.add_processed_file`` for each result, in the order the
        results were produced.
    """
    ner_input = boilerplate.get_process_data(file_ids_list)
    return [
        boilerplate.add_processed_file(
            boilerplate.NER_PREFIX, result_id, result_body, extension='txt')
        for result_id, result_body in process_ner(ner_input)
    ]
def process_task(file_ids_list=None):
    """Run ``process_data`` over uploaded files and store every result.

    Args:
        file_ids_list: Optional iterable of file ids written without
            ``boilerplate.UPLOAD_PREFIX``. When it is ``None`` or empty,
            every file listed under the upload prefix is processed.
            Otherwise only the requested ids that are present in the listing
            are processed; ids that are not listed are skipped silently.

    Returns:
        A list holding the value returned by
        ``boilerplate.add_processed_file`` for each
        ``(file id, contents)`` pair yielded by ``process_data``.
    """
    upload_prefix = boilerplate.UPLOAD_PREFIX
    files_to_process = boilerplate.list_files(
        recursive=True, prefix=upload_prefix)

    if file_ids_list:
        # Snapshot the listing into a set once: membership tests become O(1)
        # and a lazily produced listing is not consumed by repeated ``in``.
        available = set(files_to_process)
        files_to_process = [
            upload_prefix + file_id
            for file_id in file_ids_list
            if upload_prefix + file_id in available
        ]

    # Keys are the file ids with the upload prefix stripped again.
    data_to_process = {
        file_id[len(upload_prefix):]: boilerplate.get_file(file_id)
        for file_id in files_to_process
    }

    return [
        boilerplate.add_processed_file(processed_file_id, contents)
        for processed_file_id, contents in process_data(data_to_process)
    ]
def process_task(file_ids_list=None):
    """Drop uploaded files into ``TOMITA_PATH_IN`` and collect XML results.

    The selected uploads are written into ``TOMITA_PATH_IN``; the function
    then waits on inotify events for ``TOMITA_PATH_OUT`` and stores every
    non-hidden ``*.xml`` file that is closed after writing. Presumably an
    external process turns the inputs into those XML files -- that process
    is not visible from this function.

    Args:
        file_ids_list: Optional iterable of file ids written without
            ``boilerplate.UPLOAD_PREFIX``. When it is ``None`` or empty,
            every file listed under the upload prefix is processed.
            Otherwise only the requested ids that are present in the listing
            are processed; ids that are not listed are skipped silently.

    Returns:
        A list of the distinct values returned by
        ``boilerplate.add_processed_file`` (order is not defined). An empty
        list is returned immediately when there is nothing to process.

    Note:
        The wait has no timeout: the call blocks until as many distinct
        results were stored as input files were written.
    """
    upload_prefix = boilerplate.UPLOAD_PREFIX
    files_to_process = boilerplate.list_files(
        recursive=True, prefix=upload_prefix)

    if file_ids_list:
        # Snapshot the listing into a set once: membership tests become O(1)
        # and a lazily produced listing is not consumed by repeated ``in``.
        available = set(files_to_process)
        files_to_process = [
            upload_prefix + file_id
            for file_id in file_ids_list
            if upload_prefix + file_id in available
        ]

    # Keys are the file ids with the upload prefix stripped again.
    data_to_process = {
        file_id[len(upload_prefix):]: boilerplate.get_file(file_id)
        for file_id in files_to_process
    }

    # Nothing was written, so no output will ever appear; waiting on inotify
    # here would block with no result to wait for.
    if not data_to_process:
        return []
    expected_results = len(data_to_process)

    # Register the watch BEFORE writing the inputs, otherwise an output file
    # that is closed between the write and add_watch() is never reported and
    # the loop below waits forever.
    # NOTE(review): assumes TOMITA_PATH_IN and TOMITA_PATH_OUT are different
    # directories (or that inputs never end in ".xml") -- confirm.
    watcher = inotify.adapters.Inotify()
    watcher.add_watch(TOMITA_PATH_OUT)

    for filename, file_contents in data_to_process.items():
        with open(join(TOMITA_PATH_IN, filename), 'wb') as f:
            f.write(file_contents)

    processed_file_ids = set()
    for (_, type_names, path, out_filename) in watcher.event_gen(
            yield_nones=False):
        print("PATH=[{}] FILENAME=[{}] EVENT_TYPES={}".format(
            path, out_filename, type_names))

        # Only finished (closed after write), non-hidden XML files count.
        is_result = (
            not out_filename.startswith('.')
            and out_filename.endswith('.xml')
            and 'IN_CLOSE_WRITE' in type_names
        )
        if not is_result:
            continue

        full_filename = join(path, out_filename)
        with open(full_filename, 'rb') as f:
            raw = f.read()
        contents = BytesIO(raw)
        # Use the size of the bytes actually read instead of a second stat()
        # call, so the reported length always matches the uploaded payload.
        contents_length = len(raw)
        print(contents)

        generated_filename = boilerplate.add_processed_file(
            None, contents, "xml", contents_length)
        processed_file_ids.add(generated_filename)

        # The count only changes when a result is stored, so checking here
        # is sufficient to detect completion.
        if len(processed_file_ids) >= expected_results:
            break

    return list(processed_file_ids)