Esempio n. 1
0
def task_ner(file_ids_list=None):
    """Run NER over the selected files and store every result.

    Args:
        file_ids_list: Optional selection of file ids; passed through
            unchanged to ``boilerplate.get_process_data``.

    Returns:
        A list holding, in order, the value returned by
        ``boilerplate.add_processed_file`` for each ``(file_id, contents)``
        pair yielded by ``process_ner``.
    """
    ner_results = process_ner(boilerplate.get_process_data(file_ids_list))
    # Each result is stored under the NER prefix as a ``txt`` file.
    return [
        boilerplate.add_processed_file(
            boilerplate.NER_PREFIX, result_id, result_body, extension='txt')
        for result_id, result_body in ner_results
    ]
Esempio n. 2
0
def process_task(file_ids_list=None):
    """Process uploaded files and store the results.

    Everything under ``boilerplate.UPLOAD_PREFIX`` is listed. When
    ``file_ids_list`` is given and non-empty, only the requested ids that are
    actually present in that listing are kept (unknown ids are silently
    dropped). The selected files are fetched, handed to ``process_data`` as a
    ``{file_id_without_prefix: contents}`` mapping, and every result is
    stored through ``boilerplate.add_processed_file``.

    Args:
        file_ids_list: Optional iterable of file ids, given without the
            upload prefix. ``None`` or an empty value selects every uploaded
            file.

    Returns:
        A list with the value returned by ``boilerplate.add_processed_file``
        for each ``(processed_file_id, contents)`` pair yielded by
        ``process_data``, in order.
    """
    upload_prefix = boilerplate.UPLOAD_PREFIX
    files_to_process = boilerplate.list_files(recursive=True,
                                              prefix=upload_prefix)
    if file_ids_list:
        # Materialise the listing once: membership tests become O(1) and stay
        # correct even if ``list_files`` hands back a one-shot iterator.
        available = set(files_to_process)
        files_to_process = [
            upload_prefix + file_id for file_id in file_ids_list
            if upload_prefix + file_id in available
        ]
    # Keys are the ids with the upload prefix stripped again.
    data_to_process = {
        file_id[len(upload_prefix):]: boilerplate.get_file(file_id)
        for file_id in files_to_process
    }
    return [
        boilerplate.add_processed_file(processed_file_id, contents)
        for processed_file_id, contents in process_data(data_to_process)
    ]
def process_task(file_ids_list=None):
    """Drop uploaded files into ``TOMITA_PATH_IN`` and collect XML results.

    The selected uploads are written into ``TOMITA_PATH_IN``; the function
    then waits on ``TOMITA_PATH_OUT`` (via inotify) until as many finished
    ``.xml`` files have appeared there as input files were written, storing
    each one through ``boilerplate.add_processed_file``.
    NOTE(review): presumably an external Tomita process consumes the input
    directory and writes the XML output -- confirm against the deployment.

    Args:
        file_ids_list: Optional iterable of file ids, given without the
            upload prefix. ``None`` or an empty value selects every uploaded
            file; requested ids missing from the listing are dropped.

    Returns:
        A list of the names returned by ``boilerplate.add_processed_file``
        (unordered, de-duplicated). Empty when there was nothing to process.
    """
    upload_prefix = boilerplate.UPLOAD_PREFIX
    files_to_process = boilerplate.list_files(recursive=True,
                                              prefix=upload_prefix)
    if file_ids_list:
        # Materialise the listing once: membership tests become O(1) and stay
        # correct even if ``list_files`` hands back a one-shot iterator.
        available = set(files_to_process)
        files_to_process = [
            upload_prefix + file_id for file_id in file_ids_list
            if upload_prefix + file_id in available
        ]
    data_to_process = {
        file_id[len(upload_prefix):]: boilerplate.get_file(file_id)
        for file_id in files_to_process
    }

    # With no inputs no output will ever be produced; waiting on the event
    # stream would block until some unrelated event happened to arrive.
    if not data_to_process:
        return []

    # Start watching the output directory BEFORE writing any input, so that a
    # result written immediately after an input lands cannot be missed.
    watcher = inotify.adapters.Inotify()
    watcher.add_watch(TOMITA_PATH_OUT)

    for filename, file_contents in data_to_process.items():
        with open(join(TOMITA_PATH_IN, filename), 'wb') as f:
            f.write(file_contents)

    expected_count = len(data_to_process)
    processed_file_ids = set()

    for (_, type_names, path, out_filename) in watcher.event_gen(
            yield_nones=False):
        print("PATH=[{}] FILENAME=[{}] EVENT_TYPES={}".format(
            path, out_filename, type_names))

        # Only completely written, non-hidden XML files count as results.
        if not out_filename.startswith('.') and \
           out_filename.endswith('.xml') and \
           'IN_CLOSE_WRITE' in type_names:
            full_filename = join(path, out_filename)
            with open(full_filename, 'rb') as f:
                raw = f.read()
            # Take the length from the bytes actually read so it can never
            # disagree with the payload, and release the file handle before
            # the (possibly slow) store call.
            contents = BytesIO(raw)
            print(contents)
            generated_filename = boilerplate.add_processed_file(
                None, contents, "xml", len(raw))
            processed_file_ids.add(generated_filename)

        if len(processed_file_ids) >= expected_count:
            break

    return list(processed_file_ids)