def process_task(file_ids_list=None):
    """Process stored files and return the ids of the files processed.

    Args:
        file_ids_list: optional list of upload ids (without the upload
            prefix); when given, only those uploads are considered when
            building ``data_to_process``.

    Returns:
        List of file ids handed to ``process_data``.
    """
    files_to_process = boilerplate.list_files(recursive=True,
                                              prefix=boilerplate.UPLOAD_PREFIX)
    if file_ids_list:
        # Restrict to the requested uploads that actually exist.
        files_to_process = [
            boilerplate.UPLOAD_PREFIX + file_id
            for file_id in file_ids_list
            if (boilerplate.UPLOAD_PREFIX + file_id) in files_to_process
        ]
    # NOTE(review): this mapping is built but never used below, and the
    # loop underneath iterates over ALL files rather than the filtered
    # uploads — looks like work-in-progress; confirm intended behavior.
    data_to_process = {
        file_id[len(boilerplate.UPLOAD_PREFIX):]:
            boilerplate.get_file(file_id).decode('utf-8')
        for file_id in files_to_process
    }
    processed_file_ids = []
    for file in boilerplate.list_files(recursive=True):
        process_data(file)
        processed_file_ids.append(file)
    return processed_file_ids
def query_endpoint(file_id=None):
    """Answer a query about a processed file, or about the gold standard.

    ``file_id == "gold"`` dispatches on the ``type`` query parameter to
    gold statistics/examples; any other ``file_id`` is resolved against
    the processed-files store.  Returns a JSON response in every branch.
    """
    query_type = request.args.get('type')
    # POST requests may carry a JSON payload of required tags.
    tags_required = request.get_json() if request.method == 'POST' else None
    if file_id is None and query_type is None:
        return jsonify({"error": boilerplate.ERROR_NO_QUERY_TYPE_SPECIFIED})
    if file_id == "gold":
        if query_type == "statistics":
            return jsonify(boilerplate.get_gold_statistics())
        if query_type == "examples":
            limit = request.args.get('limit')
            try:
                limit = int(limit)
            except (TypeError, ValueError):
                # TypeError covers a missing 'limit' parameter (None);
                # previously that escaped as an unhandled 500.
                return jsonify({"error": "wrong limit parameter passed"})
            return jsonify(boilerplate.get_gold_examples(limit))
        # Any other query type: run it over the gold text itself.
        processed_file, file_id = boilerplate.get_gold("txt")
        text = boilerplate.read_file(processed_file)
    else:
        processed_file_id = boilerplate.PROCESSED_PREFIX + file_id
        if processed_file_id in boilerplate.list_files(recursive=True):
            text = boilerplate.get_file(processed_file_id)
        else:
            return jsonify({"error": boilerplate.ERROR_NO_SUCH_FILE})
    return jsonify(query_data(query_type, text, tags_required))
def get_file_endpoint(file_id):
    """Serve a stored file: XML for processed ``.xml`` files, else plain text."""
    if file_id not in boilerplate.list_files(recursive=True):
        return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})
    contents = boilerplate.get_file(file_id)
    is_processed_xml = (file_id.startswith(boilerplate.PROCESSED_PREFIX)
                        and file_id.endswith('.xml'))
    mimetype = 'text/xml' if is_processed_xml else 'text/plain'
    return Response(contents, mimetype=mimetype)
def get_file(file_id):
    """Return a stored file's contents base64-encoded (JSON-RPC handler).

    Returns:
        Dict with ``file_id`` and ``file_contents_base64`` keys.

    Raises:
        JSONRPCDispatchException: with ERROR_NO_SUCH_FILE when the id is
            unknown, or ERROR_NO_FILE_PART when the stored contents are
            not bytes-like (``b64encode`` raises TypeError).
    """
    if file_id not in boilerplate.list_files(recursive=True):
        raise JSONRPCDispatchException(
            code=boilerplate.ERROR_NO_SUCH_FILE_CODE,
            message=boilerplate.ERROR_NO_SUCH_FILE)
    try:
        file_contents_base64 = b64encode(
            boilerplate.get_file(file_id)).decode("utf-8")
    except TypeError:
        # b64encode raises TypeError e.g. when get_file returned None.
        raise JSONRPCDispatchException(
            code=boilerplate.ERROR_NO_FILE_PART_CODE,
            message=boilerplate.ERROR_NO_FILE_PART)
    return {"file_id": file_id, "file_contents_base64": file_contents_base64}
def do_query(file_id, query_type):
    """Run *query_type* against the processed version of *file_id*.

    Returns a dict with either a ``result`` or an ``error`` key.
    """
    if not query_type:
        return {"error": boilerplate.ERROR_NO_QUERY_TYPE_SPECIFIED}
    processed_file_id = boilerplate.PROCESSED_PREFIX + file_id
    if processed_file_id not in boilerplate.list_files(recursive=True):
        return {"error": boilerplate.ERROR_NO_SUCH_FILE}
    payload = {processed_file_id: boilerplate.get_file(processed_file_id)}
    return {"result": query_data(payload, query_type=query_type)}
def query_endpoint(file_id):
    """Flask endpoint: run the ``type`` query against a processed file."""
    query_type = request.args.get('type')
    if not query_type:
        return jsonify({"error": boilerplate.ERROR_NO_QUERY_TYPE_SPECIFIED})
    processed_file_id = boilerplate.PROCESSED_PREFIX + file_id
    if processed_file_id not in boilerplate.list_files(recursive=True):
        return jsonify({"error": boilerplate.ERROR_NO_SUCH_FILE})
    payload = {processed_file_id: boilerplate.get_file(processed_file_id)}
    return jsonify({"result": query_data(payload, query_type=query_type)})
def process_task(file_ids_list=None):
    """Parse the first listed file with a UDPipe pipeline.

    Args:
        file_ids_list: list of file ids; only the first entry is used.

    Returns:
        Whatever ``process_data`` produces for the file and pipeline.

    Raises:
        ValueError: if no file id was supplied (previously this crashed
            with ``TypeError: 'NoneType' object is not subscriptable``).
    """
    from ufal.udpipe import Model, Pipeline

    if not file_ids_list:
        raise ValueError("process_task requires at least one file id")
    # TODO(review): language is hard-coded to Russian so far.
    model_path = MODELS_DIR + MODEL_NAMES['russian']
    model = Model.load(model_path)
    pipeline = Pipeline(model, '', '', '', '')
    # Fetch the contents of the (single) file to process.
    file_to_process = boilerplate.get_file(file_ids_list[0])
    print('...loaded the model')
    return process_data(file_to_process, pipeline)
def get_file_endpoint(file_id):
    """Download a stored file; ``file_id == "gold"`` serves the gold standard."""
    if file_id in boilerplate.list_files(recursive=True):
        response = make_response(boilerplate.get_file(file_id))
        response.headers["Content-Disposition"] = \
            "attachment; filename=%s" % file_id
        return response
    if file_id == "gold":
        query_type = request.args.get('type')
        processed_file, file_id = boilerplate.get_gold(query_type)
        # 'txt' is not a registered MIME type; text/plain is the valid form.
        # NOTE(review): attachment_filename was renamed download_name in
        # Flask 2.0 — confirm the Flask version pinned by this project.
        return send_file(processed_file, mimetype='text/plain',
                         attachment_filename=file_id, as_attachment=True)
    return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})
def process_task(file_ids_list=None):
    """Process uploads (optionally restricted to *file_ids_list*).

    Feeds the upload contents to ``process_data`` and stores each result
    via ``boilerplate.add_processed_file``.

    Returns:
        List of processed-file ids created.
    """
    available = boilerplate.list_files(recursive=True,
                                       prefix=boilerplate.UPLOAD_PREFIX)
    if file_ids_list:
        # Keep only the requested uploads that actually exist.
        requested = (boilerplate.UPLOAD_PREFIX + fid for fid in file_ids_list)
        files_to_process = [fid for fid in requested if fid in available]
    else:
        files_to_process = available
    prefix_len = len(boilerplate.UPLOAD_PREFIX)
    data_to_process = {
        fid[prefix_len:]: boilerplate.get_file(fid)
        for fid in files_to_process
    }
    processed_file_ids = []
    for processed_file_id, contents in process_data(data_to_process):
        processed_file_ids.append(
            boilerplate.add_processed_file(processed_file_id, contents))
    return processed_file_ids
def process_data(file):
    """Load one uploaded delimited file into its own MySQL table.

    The table name is derived from the file name (last 4 characters, i.e.
    the extension, stripped).  The 'main' file is ';'-separated; all other
    files are ','-separated with their first column dropped.

    Args:
        file: id of the uploaded file.

    Returns:
        ``(name, text)`` on success, ``None`` on failure.  The original
        code deliberately swallowed all errors; that best-effort contract
        is kept, but failures are now reported instead of hidden.
    """
    import re

    conn = boilerplate.get_mysql_connection()
    cur = conn.cursor()
    name = file[:-4]  # strip the 4-char extension, e.g. '.csv'
    # The table name is interpolated into DDL/DML below (identifiers cannot
    # be bound as parameters), so reject anything that is not a plain
    # identifier to prevent SQL injection via crafted file names.
    if not re.fullmatch(r'\w+', name):
        print('process_data: refusing unsafe table name %r' % name)
        return None
    # Placeholders must not be wrapped in quotes, and parameters must be a
    # sequence — the original "... = '%s'" with a bare string was broken.
    cur.execute(
        "SELECT table_name FROM information_schema.tables "
        "WHERE table_schema = 'hse-api-database' AND table_name = %s",
        (name,))
    cur.fetchone()  # NOTE(review): result is ignored, as in the original
    try:
        text = boilerplate.get_file(file).decode('utf-8')
        if name == 'main':
            rows = [tuple(line.split(';')) for line in text.split('\n')]
        else:
            # Non-main files: drop the leading column of each CSV row.
            rows = [tuple(line.split(',')[1:]) for line in text.split('\n')]
        cur.execute(
            "CREATE TABLE `hse-api-database`.{} "
            "(word varchar(300), lemma varchar(300), "
            "morphs varchar(300), categories varchar(100))".format(name))
        for tup in rows:
            cur.execute(
                "INSERT INTO `hse-api-database`.{}"
                "(word,lemma,morphs,categories) "
                "VALUES(%s, %s, %s, %s)".format(name), tup)
        conn.commit()
        return name, text
    except Exception as exc:
        # Best-effort by design: callers treat None as "skip this file".
        print('process_data: failed to load %s: %s' % (file, exc))
        return None
def process_task(file_ids_list=None):
    """Run uploads through an external Tomita process via watched directories.

    Uploaded files are written into TOMITA_PATH_IN; a separate Tomita
    process (not visible in this file) is expected to drop ``.xml`` results
    into TOMITA_PATH_OUT, which is watched with inotify until one output
    per input has been collected.

    Args:
        file_ids_list: optional list of upload ids (without the upload
            prefix) to restrict processing to.

    Returns:
        List of generated processed-file ids.
    """
    files_to_process = boilerplate.list_files(recursive=True,
                                              prefix=boilerplate.UPLOAD_PREFIX)
    if file_ids_list:
        # Keep only the requested uploads that actually exist.
        files_to_process = [
            boilerplate.UPLOAD_PREFIX + file_id
            for file_id in file_ids_list
            if (boilerplate.UPLOAD_PREFIX + file_id) in files_to_process
        ]
    data_to_process = {
        file_id[len(boilerplate.UPLOAD_PREFIX):]:
            boilerplate.get_file(file_id)
        for file_id in files_to_process
    }
    # Drop each upload into the directory the Tomita process consumes.
    for filename, file_contents in data_to_process.items():
        with open(join(TOMITA_PATH_IN, filename), 'wb') as f:
            f.write(file_contents)
    i = inotify.adapters.Inotify()
    i.add_watch(TOMITA_PATH_OUT)
    processed_file_ids = set()
    # Block on filesystem events until enough results have been stored.
    for (_, type_names, path, out_filename) in i.event_gen(yield_nones=False):
        print("PATH=[{}] FILENAME=[{}] EVENT_TYPES={}".format(
            path, out_filename, type_names))
        # Only fully-written (IN_CLOSE_WRITE), non-hidden XML outputs count.
        if not out_filename.startswith('.') and \
                out_filename.endswith('.xml') and \
                'IN_CLOSE_WRITE' in type_names:
            full_filename = join(path, out_filename)
            with open(full_filename, 'rb') as f:
                contents = BytesIO(f.read())
            contents_length = getsize(full_filename)
            print(contents)
            generated_filename = boilerplate.add_processed_file(
                None, contents, "xml", contents_length)
            processed_file_ids.add(generated_filename)
        if len(processed_file_ids) >= len(set(data_to_process.keys())):
            break
    return list(processed_file_ids)
def get_file_endpoint(file_id):
    """Return the raw stored contents of *file_id*, or a JSON error."""
    if file_id not in boilerplate.list_files(recursive=True):
        return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})
    return boilerplate.get_file(file_id)
def get_file_endpoint(file_id):
    """Serve *file_id* as ``text/plain``, or a JSON error when unknown."""
    if file_id not in boilerplate.list_files(recursive=True):
        return jsonify({'error': boilerplate.ERROR_NO_SUCH_FILE})
    return Response(boilerplate.get_file(file_id), mimetype='text/plain')