def upload_catalog_files():
    """Save uploaded files into a user's catalog and count them as pending.

    Reads ``user_id``, ``catalog_id`` and ``catalog_name`` from the request
    form and the uploaded files from the ``file`` form field. After the
    files, the user and the catalog pass validation, every file is stored
    via ``save_file`` and the catalog's ``data_un_extracted_files`` counter
    is increased by the number of uploaded files.

    Returns:
        str: A JSON document with ``status`` (``"success"`` or ``"error"``)
        and a human-readable ``message``. Any exception raised while
        handling the request is reported as an ``"error"`` response rather
        than propagated.
    """
    try:
        user_id = ObjectId(str(request.form.get('user_id')))
        catalog_id = request.form.get('catalog_id')
        # NOTE(review): a missing 'catalog_name' field becomes the literal
        # string "None" here -- confirm whether that should be rejected.
        catalog_name = str(request.form.get('catalog_name')).strip()
        uploaded_files = request.files.getlist("file")

        if not validate_upload_files(uploaded_files):
            return json.dumps({
                "status": "error",
                "message": "Invalid file types. Please upload only .txt, "
                           ".pdf, .docx files.",
            })

        user = validate_user(user_id)
        if user is None:
            return json.dumps({
                "status": "error",
                "message": "Invalid user_id '{}'".format(user_id),
            })

        # The client sends the string 'null' when no catalog was selected;
        # in that case validate_catalog is asked to create one
        # (create=True) -- presumably it returns the new catalog document.
        if catalog_id == 'null':
            user_catalog = validate_catalog(
                user=user, catalog_name=catalog_name, create=True)
        else:
            user_catalog = validate_catalog(
                user=user, catalog_name=catalog_name)

        # Checked for both branches so a falsy catalog always produces a
        # clear error instead of failing later on a None lookup.
        if not user_catalog:
            return json.dumps({
                "status": "error",
                "message": "Invalid catalog_id '{}'".format(catalog_id),
            })

        for up_file in uploaded_files:
            up_file_doc = validate_catalog_file(
                catalog=user_catalog, file_name=up_file.filename)
            # A file whose entities were already extracted is passed to
            # update_file before being overwritten (presumably to reset its
            # extraction state -- verify against update_file).
            if up_file_doc['is_entity_extracted']:
                update_file(up_file_doc)
            save_file(file=up_file, file_doc=up_file_doc)

        # $inc is atomic on the server and creates the field when it does
        # not exist yet, so concurrent uploads cannot overwrite each
        # other's counts the way a read-then-$set would.
        USER_CATALOGS_COL.update_one(
            {"_id": user_catalog["_id"]},
            {"$inc": {"data_un_extracted_files": len(uploaded_files)}},
        )

        return json.dumps({
            'status': 'success',
            'message': '{} file(s) uploaded into "{}" catalog.'.format(
                len(uploaded_files), catalog_name),
        })
    except Exception as err:
        # Request boundary: report every failure to the client as JSON.
        return json.dumps({'status': 'error', 'message': str(err)})
def process_catalog_files():
    """Extract entities from a catalog's not-yet-processed files.

    Reads ``user_id`` and ``catalog_id`` from the JSON request body, loads
    every file document of that catalog with ``is_entity_extracted`` and
    ``is_manually_updated`` both ``False``, extracts the text of each file
    according to its extension (``txt``, ``pdf`` or ``docx``), runs the
    ``Resume_Keyword_Extraction`` model over it and stores the resulting
    fields on the file document together with ``is_entity_extracted=True``.
    Finally the catalog's ``prev_data_extracted_date`` is set to the current
    time and its ``data_un_extracted_files`` counter is reduced by the
    number of processed files (or set to 0 when the counter is absent).

    Returns:
        str: A JSON document with ``status`` (``"success"`` or ``"error"``)
        and a human-readable ``message``. Any exception raised while
        handling the request, including an unsupported file extension, is
        reported as an ``"error"`` response rather than propagated.
    """
    try:
        req_data = request.get_json()
        user_id = ObjectId(str(req_data['user_id']))
        catalog_id = ObjectId(str(req_data['catalog_id']))

        user = validate_user(user_id)
        if user is None:
            return json.dumps({
                "status": "error",
                "message": "Invalid user_id '{}'".format(user_id),
            })

        # The catalog must belong to the requesting user.
        user_cat_col = USER_CATALOGS_COL.find_one({
            "user": user["_id"],
            "_id": catalog_id,
        })
        if not user_cat_col:
            return json.dumps({
                "status": "error",
                "message": "Invalid catalog_id '{}'".format(catalog_id),
            })

        catalog_resumes = list(CATALOG_FILES_COL.find({
            "catalog": user_cat_col['_id'],
            "is_entity_extracted": False,
            "is_manually_updated": False,
        }))

        nlp_obj = TrainModel(model='Resume_Keyword_Extraction')

        # Extension -> text extraction function.
        text_extractors = {
            "txt": get_text_from_text_file,
            "pdf": extract_text_from_pdf_file,
            "docx": get_text_from_docx_file,
        }

        for file_doc in catalog_resumes:
            file_name = file_doc["file_name"]
            file_full_path = os.path.join(
                UPLOAD_FILE_PATH, str(file_doc["catalog"]), file_name)

            # rpartition yields an empty separator when the name has no
            # dot, which lets us reject extension-less names explicitly.
            _, dot, extension = file_name.rpartition('.')
            extract_text = text_extractors.get(extension.lower()) if dot else None
            if extract_text is None:
                # Fail loudly instead of reusing the previous file's text
                # (or hitting an unbound variable on the first file).
                raise ValueError(
                    "Unsupported file type for '{}'".format(file_name))

            file_data = extract_text(file_full_path)
            # get_entities returns a JSON string; its decoded fields are
            # written onto the file document as-is.
            entity_data = json.loads(nlp_obj.get_entities(text=file_data))
            entity_data['is_entity_extracted'] = True
            CATALOG_FILES_COL.update_one(
                {"_id": file_doc['_id']}, {"$set": entity_data})

        catalog_update = {
            "$set": {"prev_data_extracted_date": datetime.now()},
        }
        if "data_un_extracted_files" in user_cat_col:
            # Atomic decrement: files uploaded while extraction was running
            # have already bumped the counter and must not be overwritten
            # with a value computed from the stale document read above.
            catalog_update["$inc"] = {
                "data_un_extracted_files": -len(catalog_resumes),
            }
        else:
            catalog_update["$set"]["data_un_extracted_files"] = 0
        USER_CATALOGS_COL.update_one(
            {"_id": user_cat_col["_id"]}, catalog_update)

        return json.dumps({
            'status': 'success',
            'message': 'Data extracted from "{}" catalog files'.format(
                user_cat_col["name"]),
        })
    except Exception as err:
        # Request boundary: report every failure to the client as JSON.
        return json.dumps({'status': 'error', 'message': str(err)})