Esempio n. 1
0
def process_task(file_ids_list=None):
    """Run ``process_data`` on every stored file and return the handled ids.

    Args:
        file_ids_list: Accepted for interface compatibility. It is not used
            to select files: every id returned by
            ``boilerplate.list_files(recursive=True)`` is processed.
            NOTE(review): confirm whether the task is meant to honour this
            filter.

    Returns:
        list: The file ids passed to ``process_data``, in listing order.
    """
    # The previous version also downloaded and UTF-8-decoded every upload
    # into a dict that was never read; that cost one fetch per file and let
    # a single non-UTF-8 upload abort the task before any work was done.
    processed_file_ids = []
    for file_id in boilerplate.list_files(recursive=True):
        process_data(file_id)
        processed_file_ids.append(file_id)
    return processed_file_ids
Esempio n. 2
0
def query_endpoint(file_id=None):
    """Answer a query against a processed file or against the gold data.

    The query type is read from the ``type`` URL argument. For POST requests
    the JSON body is forwarded to ``query_data`` as the required tags; for
    other methods ``None`` is forwarded.

    Args:
        file_id: Id of the processed file (without ``PROCESSED_PREFIX``), or
            the literal ``"gold"`` to query the gold data.

    Returns:
        A JSON response with the query result, or with an ``"error"`` key
        when the request cannot be served.
    """
    query_type = request.args.get('type')
    tags_required = request.get_json() if request.method == 'POST' else None

    if file_id is None and query_type is None:
        return jsonify({"error": boilerplate.ERROR_NO_QUERY_TYPE_SPECIFIED})
    if file_id is None:
        # Without this guard the prefix concatenation below raised TypeError.
        # NOTE(review): confirm ERROR_NO_SUCH_FILE is the desired message.
        return jsonify({"error": boilerplate.ERROR_NO_SUCH_FILE})

    if file_id == "gold":
        if query_type == "statistics":
            return jsonify(boilerplate.get_gold_statistics())
        if query_type == "examples":
            try:
                # int(None) raises TypeError when ``limit`` is absent, so
                # both exception types must be handled.
                limit = int(request.args.get('limit'))
            except (TypeError, ValueError):
                return jsonify({"error": "wrong limit parameter passed"})
            return jsonify(boilerplate.get_gold_examples(limit))
        processed_file, _ = boilerplate.get_gold("txt")
        text = boilerplate.read_file(processed_file)
    else:
        processed_file_id = boilerplate.PROCESSED_PREFIX + file_id
        if processed_file_id not in boilerplate.list_files(recursive=True):
            return jsonify({"error": boilerplate.ERROR_NO_SUCH_FILE})
        text = boilerplate.get_file(processed_file_id)

    return jsonify(query_data(query_type, text, tags_required))
def get_file_endpoint(file_id):
    """Serve the stored contents of ``file_id``.

    Processed files whose id ends in ``.xml`` are sent as ``text/xml``;
    everything else is sent as ``text/plain``. An unknown id yields a JSON
    error body.
    """
    if file_id not in boilerplate.list_files(recursive=True):
        return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})

    body = boilerplate.get_file(file_id)
    is_processed_xml = (file_id.startswith(boilerplate.PROCESSED_PREFIX)
                        and file_id.endswith('.xml'))
    content_type = 'text/xml' if is_processed_xml else 'text/plain'
    return Response(body, mimetype=content_type)
def get_file(file_id):
    """Return the contents of ``file_id`` encoded as base64 text.

    Returns:
        dict: ``{"file_id": ..., "file_contents_base64": ...}``.

    Raises:
        JSONRPCDispatchException: If the id is not in the file listing, or
            if base64-encoding the fetched contents raises ``TypeError``.
    """
    known_ids = boilerplate.list_files(recursive=True)
    if file_id not in known_ids:
        raise JSONRPCDispatchException(
            code=boilerplate.ERROR_NO_SUCH_FILE_CODE,
            message=boilerplate.ERROR_NO_SUCH_FILE)

    try:
        raw_contents = boilerplate.get_file(file_id)
        encoded = b64encode(raw_contents).decode("utf-8")
    except TypeError:
        raise JSONRPCDispatchException(
            code=boilerplate.ERROR_NO_FILE_PART_CODE,
            message=boilerplate.ERROR_NO_FILE_PART)

    return {"file_id": file_id, "file_contents_base64": encoded}
def do_query(file_id, query_type):
    """Run ``query_data`` on one processed file.

    Returns:
        dict: ``{"result": ...}`` on success, or ``{"error": ...}`` when the
        query type is empty or the processed file is not in the listing.
    """
    if not query_type:
        return {"error": boilerplate.ERROR_NO_QUERY_TYPE_SPECIFIED}

    target_id = boilerplate.PROCESSED_PREFIX + file_id
    if target_id not in boilerplate.list_files(recursive=True):
        return {"error": boilerplate.ERROR_NO_SUCH_FILE}

    # query_data receives a one-entry mapping of file id -> contents.
    payload = {target_id: boilerplate.get_file(target_id)}
    return {"result": query_data(payload, query_type=query_type)}
Esempio n. 6
0
def query_endpoint(file_id):
    """Run the query named by the ``type`` URL argument on a processed file.

    Returns a JSON response holding ``"result"`` on success, or ``"error"``
    when no query type was given or the processed file is not listed.
    """
    query_type = request.args.get('type')
    if not query_type:
        return jsonify({"error": boilerplate.ERROR_NO_QUERY_TYPE_SPECIFIED})

    target_id = boilerplate.PROCESSED_PREFIX + file_id
    if target_id not in boilerplate.list_files(recursive=True):
        return jsonify({"error": boilerplate.ERROR_NO_SUCH_FILE})

    # query_data receives a one-entry mapping of file id -> contents.
    payload = {target_id: boilerplate.get_file(target_id)}
    outcome = query_data(payload, query_type=query_type)
    return jsonify({"result": outcome})
Esempio n. 7
0
def process_task(file_ids_list=None):
    """Load the Russian UDPipe model and process the first listed file.

    Only ``file_ids_list[0]`` is fetched; the language is hard-coded to
    ``'russian'`` for now.
    NOTE(review): calling this with the default ``None`` fails on the
    subscript; callers must pass a non-empty list.

    Returns:
        Whatever ``process_data(contents, pipeline)`` returns.
    """
    # Imported lazily, inside the task, exactly as before.
    from ufal.udpipe import Model, Pipeline

    russian_model = Model.load(MODELS_DIR + MODEL_NAMES['russian'])
    udpipe_pipeline = Pipeline(russian_model, '', '', '', '')
    first_file_contents = boilerplate.get_file(file_ids_list[0])
    print('...loaded the model')
    return process_data(first_file_contents, udpipe_pipeline)
Esempio n. 8
0
def get_file_endpoint(file_id):
    """Send a stored file, or the gold file, as a download.

    A listed ``file_id`` is returned with a ``Content-Disposition``
    attachment header. Otherwise, the id ``"gold"`` sends the file returned
    by ``boilerplate.get_gold`` for the ``type`` URL argument. Any other id
    yields a JSON error body.
    """
    if file_id in boilerplate.list_files(recursive=True):
        download = make_response(boilerplate.get_file(file_id))
        disposition = "attachment; filename=%s" % file_id
        download.headers["Content-Disposition"] = disposition
        return download

    if file_id != "gold":
        return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})

    requested_type = request.args.get('type')
    gold_path, gold_name = boilerplate.get_gold(requested_type)
    # NOTE(review): 'txt' is not a registered MIME type and
    # ``attachment_filename`` was renamed in newer Flask releases; both are
    # kept as-is to preserve behaviour.
    return send_file(gold_path,
                     mimetype='txt',
                     attachment_filename=gold_name,
                     as_attachment=True)
Esempio n. 9
0
def process_task(file_ids_list=None):
    """Process uploaded files and store every result as a processed file.

    Args:
        file_ids_list: Optional upload ids (without ``UPLOAD_PREFIX``). When
            given and non-empty, only those ids that exist among the uploads
            are processed; otherwise all uploads are processed.

    Returns:
        list: The values returned by ``boilerplate.add_processed_file`` for
        each ``(id, contents)`` pair yielded by ``process_data``.
    """
    prefix = boilerplate.UPLOAD_PREFIX
    uploads = boilerplate.list_files(recursive=True, prefix=prefix)
    if file_ids_list:
        requested = [prefix + file_id for file_id in file_ids_list]
        uploads = [full_id for full_id in requested if full_id in uploads]

    # Keys are the upload ids with the prefix stripped again.
    data_to_process = {}
    for full_id in uploads:
        data_to_process[full_id[len(prefix):]] = boilerplate.get_file(full_id)

    return [
        boilerplate.add_processed_file(result_id, result_contents)
        for result_id, result_contents in process_data(data_to_process)
    ]
Esempio n. 10
0
def process_data(file):
    """Load one delimited text file into its own table of `hse-api-database`.

    The table is named after ``file`` with its last four characters (the
    extension) removed. Rows of the file named ``main`` are split on ``;``;
    rows of any other file are split on ``,`` and their first column is
    dropped. Each row is inserted into the columns
    ``(word, lemma, morphs, categories)``.

    Args:
        file: Id of the stored file to import.

    Returns:
        tuple | None: ``(table_name, file_text)`` when the import was
        committed. ``None`` when the table already exists or the import
        failed; a failure is logged rather than raised (best effort).
    """
    import logging

    name = file[:-4]
    # Identifiers cannot be bound as query parameters, so the table name is
    # back-quoted with embedded back-quotes doubled instead of being
    # interpolated raw.
    table = "`hse-api-database`.`{}`".format(name.replace('`', '``'))

    # The connection is not closed here because it is not visible whether
    # boilerplate shares it between callers; only the cursor is released.
    conn = boilerplate.get_mysql_connection()
    cur = conn.cursor()
    try:
        # The placeholder must be unquoted and the argument a sequence;
        # the driver does the quoting itself.
        cur.execute(
            "SELECT table_name from information_schema.tables where "
            "table_schema = 'hse-api-database' and table_name = %s",
            (name,))
        if cur.fetchone() is not None:
            # Same outcome as before, where CREATE TABLE failed on an
            # existing table and the error was swallowed.
            return None

        try:
            text = boilerplate.get_file(file).decode('utf-8')
            # Skip blank lines: a trailing newline used to produce an empty
            # row whose INSERT failed and discarded the whole import.
            lines = [line for line in text.split('\n') if line.strip()]
            if name == 'main':
                rows = [tuple(line.split(';')) for line in lines]
            else:
                rows = [tuple(line.split(',')[1:]) for line in lines]

            cur.execute(
                "CREATE TABLE {} (word varchar(300), lemma varchar(300), "
                "morphs varchar(300), categories varchar(100))".format(table))
            insert_sql = (
                "INSERT INTO {}(word,lemma,morphs,categories) "
                "VALUES(%s, %s, %s, %s)".format(table))
            for row in rows:
                try:
                    cur.execute(insert_sql, row)
                except Exception:
                    logging.error("could not insert row %r into %s",
                                  row, table)
                    raise
            conn.commit()
        except Exception:
            # Still best effort, but the failure is now visible and the
            # partial inserts are discarded.
            logging.exception("import of %r failed", file)
            conn.rollback()
            return None
        return name, text
    finally:
        cur.close()
Esempio n. 11
0
def process_task(file_ids_list=None):
    """Feed uploads to the Tomita input directory and collect its XML output.

    The selected uploads are written into ``TOMITA_PATH_IN``. The function
    then waits on inotify events for ``TOMITA_PATH_OUT`` and stores each
    finished, non-hidden ``.xml`` file as a processed file. It returns once
    as many outputs have been stored as inputs were written.

    Args:
        file_ids_list: Optional upload ids (without ``UPLOAD_PREFIX``). When
            given and non-empty, only those ids that exist among the uploads
            are processed; otherwise all uploads are processed.

    Returns:
        list: Ids returned by ``boilerplate.add_processed_file``, in no
        particular order.
    """
    files_to_process = boilerplate.list_files(recursive=True,
                                              prefix=boilerplate.UPLOAD_PREFIX)
    if file_ids_list:
        files_to_process = [
            boilerplate.UPLOAD_PREFIX + file_id for file_id in file_ids_list
            if (boilerplate.UPLOAD_PREFIX + file_id) in files_to_process
        ]
    data_to_process = {
        file_id[len(boilerplate.UPLOAD_PREFIX):]: boilerplate.get_file(file_id)
        for file_id in files_to_process
    }

    # With nothing to write, no output will ever appear; waiting for an
    # event would block the task.
    if not data_to_process:
        return []

    # Watch the output directory BEFORE writing any input. inotify only
    # reports events that occur after the watch exists, so an output
    # finished earlier would be missed and the loop would never end.
    watcher = inotify.adapters.Inotify()
    watcher.add_watch(TOMITA_PATH_OUT)

    for filename, file_contents in data_to_process.items():
        with open(join(TOMITA_PATH_IN, filename), 'wb') as f:
            f.write(file_contents)

    expected_count = len(data_to_process)
    processed_file_ids = set()

    for (_, type_names, path, out_filename) in watcher.event_gen(
            yield_nones=False):
        print("PATH=[{}] FILENAME=[{}] EVENT_TYPES={}".format(
            path, out_filename, type_names))

        if not out_filename.startswith('.') and \
           out_filename.endswith('.xml') and \
           'IN_CLOSE_WRITE' in type_names:
            with open(join(path, out_filename), 'rb') as f:
                raw = f.read()
            # Take the length from the bytes actually read so it always
            # matches the stream handed over.
            generated_filename = boilerplate.add_processed_file(
                None, BytesIO(raw), "xml", len(raw))
            processed_file_ids.add(generated_filename)

        if len(processed_file_ids) >= expected_count:
            break

    return list(processed_file_ids)
Esempio n. 12
0
def get_file_endpoint(file_id):
    """Return the stored contents of ``file_id``, or a JSON error if unknown."""
    if file_id not in boilerplate.list_files(recursive=True):
        return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})
    return boilerplate.get_file(file_id)
Esempio n. 13
0
def get_file_endpoint(file_id):
    """Serve ``file_id`` as ``text/plain``, or a JSON error if it is unknown."""
    if file_id not in boilerplate.list_files(recursive=True):
        return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})
    body = boilerplate.get_file(file_id)
    return Response(body, mimetype='text/plain')