def dbdata(connStr):
    """Export Smart Track requirement-description documents to disk.

    Runs the configured req-doc SQL query (resuming from the last primary
    key stored in the Mongo config document), writes each returned document
    blob to a file under ``ReqDocDirectory``, and finally records the newest
    primary key back into the config collection.

    :param connStr: ODBC connection string for the source SQL database.
    """
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'req_desc_file_extract running' + ' ' + str(datetime.datetime.now()))
    cursor = dbmanager.cursor_odbc_connection(connStr)
    query = custom.fetch_query(config.ConfigManager().STReqDocQueryId)
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection)
    # Resume extraction after the last processed primary key.
    query = query.replace('##jobDocumentID##',
                          str(configdocs[0]['STreqDocpk']))
    db_data_dict = dbmanager.cursor_execute(cursor, query)
    db_data = db_data_dict['dbdata']
    db_data_cursorexec = db_data_dict['cursor_exec']
    column_headers = [column[0] for column in db_data_cursorexec.description]
    count = 0
    data_dict = {}
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    for row in db_data:
        try:
            data_dict = dict(utility.zip_list(column_headers, row))
            reqdescfilepath = config.ConfigManager().ReqDocDirectory + '/' + '_' + \
                str(data_dict['requirementID']) + '_' + \
                str(data_dict['documentName'])
            # FIX: use a context manager; the original left one file handle
            # open per row (never closed/flushed explicitly).
            with open(reqdescfilepath, 'wb') as filewrite:
                filewrite.write(data_dict['jobDocumentFile'])
            count += 1
        except BaseException as ex:
            # Per-row failures are logged and skipped so one bad document
            # does not abort the whole extract.
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)
    # data_dict still holds the last processed row; persist its primary key
    # so the next run resumes after it.
    if 'jobDocumentID' in data_dict:
        primary_key = data_dict['jobDocumentID']
        UpdateTemplateSet = utility.clean_dict()
        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet['STreqDocpk'] = primary_key
        # NOTE(review): computed but unused — the log below reports `count`
        # instead; the sibling resume extractor logs its delta. Possibly the
        # log message here was meant to use this value — confirm intent.
        reqDocUpdateDelta = int(primary_key) - \
            int(configdocs[0]['STreqDocpk'])
        utility.write_to_file(
            config.ConfigManager().LogFile, 'a',
            'Req docs updated - ' + str(count) + ' ' +
            str(datetime.datetime.now()))
        UpdateTemplateWhere['_id'] = configdocs[0]['_id']
        DBSet = utility.clean_dict()
        DBSet['$set'] = UpdateTemplateSet
        custom.update_data_to_Db_noupsert(
            int(config.ConfigManager().MongoDBPort),
            config.ConfigManager().DataCollectionDB,
            config.ConfigManager().ConfigCollection,
            UpdateTemplateWhere, DBSet, connection)
# # nounphrase_generate() # # generating 15 threads and redirecting to worker function for i in range(15): t = Thread(target=worker) t.daemon = True t.start() # fetching promptcloud url and url response xml data url = config.ConfigManager().PromptCloudURL response = urllib.request.urlopen(url) data = json.loads(response.read().decode( response.info().get_param('charset') or 'utf-8')) urlcount = 0 # configuration data from db configdocs = custom.retrieve_data_from_DB( int(config.ConfigManager().MongoDBPort), config.ConfigManager().DataCollectionDB, config.ConfigManager().ConfigCollection) for item in data['root']['entry']: # for each entry taking th updated time updatets = item['updated_ts'].replace(' +0000', '') pclstdt = configdocs[0]['PClastDate'] updatets_formatted = datetime.strptime(updatets, "%Y-%m-%d %H:%M:%S") pclastdate_formatted = datetime.strptime(pclstdt, "%Y-%m-%d %H:%M:%S") # filling queue with url if updated time > last date of download if (updatets_formatted > pclastdate_formatted): urlcount += 1 q.put(item['url']) print(updatets_formatted) print(pclastdate_formatted)
# generating 15 threads and redirecting to worker function for i in range(15): t = Thread(target=worker) t.daemon = True t.start() # fetching promptcloud url and url response xml data url = config.ConfigManager().PromptCloudURL response = urllib.request.urlopen(url) print(response) data = json.loads(response.read().decode( response.info().get_param('charset') or 'utf-8')) urlcount = 0 # configuration data from db configdocs = custom.retrieve_data_from_DB( int(config.ConfigManager().MongoDBPort), config.ConfigManager().IntelligenceDb, config.ConfigManager().ConfigCollection) print(configdocs[0]) for item in data['root']['entry']: # for each entry taking th updated time updatets = item['updated_ts'].replace(' +0000', '') pclstdt = configdocs[0]['PClastDate'] print(pclstdt) updatets_formatted = datetime.strptime(updatets, "%Y-%m-%d %H:%M:%S") pclastdate_formatted = datetime.strptime(pclstdt, "%Y-%m-%d %H:%M:%S") # filling queue with url if updated time > last date of download if (updatets_formatted > pclastdate_formatted): urlcount += 1 q.put(item['url']) print(updatets_formatted)
def route_dataread(filepaths):
    """Route each input file to the reader for its extension and store results.

    Text-like formats (txt/pdf/doc/docx/csv/odt) are read into a single
    string and inserted into MongoDB as one document; Excel and XML files
    are handed to row-wise processors that insert records themselves and
    advance the document-id counter. The updated ``docid_count`` is written
    back to the Mongo config document at the end.

    :param filepaths: iterable of file-system paths to process.
    """
    data_read_count = int(utility.read_from_file(
        config.ConfigManager().ExecutioncountFile, 'r'))
    file_read_count = 0
    file_path_count = 0
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection)
    docid_count = int(configdocs[0]['docid_count'])
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'dataread running')
    # Suffix -> reader for formats whose whole content comes back as text.
    # Order matters: '.docx' must be tested before '.doc'.
    text_readers = (
        ('.txt', datareadfiletypes.read_text_text),
        ('.pdf', datareadfiletypes.read_pdf_text),
        ('.docx', datareadfiletypes.read_docx_text),
        ('.doc', datareadfiletypes.read_doc_text),
        ('.csv', datareadfiletypes.read_csv_text),
        ('.odt', datareadfiletypes.read_odt_text),
    )
    for filepath in filepaths:
        data_text = ''
        try:
            file_path_count += 1
            print('File number: ' + str(file_path_count))
            print('Processing file..' + filepath)
            lowerpath = filepath.lower()
            if lowerpath.endswith(('.xls', '.xlsx')):
                # Row-wise processor inserts records itself and returns the
                # advanced doc-id counter.
                docid_count = custom.process_excel_rowdata(
                    filepath, docid_count)
            elif lowerpath.endswith('.xml'):
                docid_count = custom.process_xml_data(filepath, docid_count)
            else:
                for suffix, reader in text_readers:
                    if lowerpath.endswith(suffix):
                        data_text = reader(filepath, data_text)
                        break
            # Only text-producing formats reach this insert path.
            if not data_text == '':
                docid_count += 1
                file_read_count += 1
                # Noun-phrase extraction is currently disabled
                # (dcrnlp.extract_nounphrases_sentences not called).
                noun_phrases = ''
                dictionaries.DataProperties['description'] = data_text
                dictionaries.DataProperties['nounPhrases'] = noun_phrases
                dictionaries.DataProperties[
                    'documentType'] = utility.filefolder_from_filepath(filepath)
                dictionaries.DataProperties[
                    'dataSource'] = config.ConfigManager().Misc
                dictionaries.DataProperties['doc_id'] = docid_count
                dictionaries.DataProperties[
                    'documentTitle'] = utility.filename_from_filepath(filepath)
                # Short preview: first 200 chars of the description.
                dictionaries.DataProperties['documentDesc'] = (
                    dictionaries.DataProperties['description'])[0:200]
                jsonfordatastore = custom.prepare_json_for_datastore(
                    dictionaries.DataProperties)
                jsonfordatastore_deserialized = utility.jsonstring_deserialize(
                    jsonfordatastore)
                custom.insert_data_to_DB(
                    jsonfordatastore_deserialized, connection)
                phrases_file_data = custom.prepare_phrases_file_data(
                    noun_phrases, data_read_count, file_read_count)
                utility.write_to_file(
                    config.ConfigManager().PhraseFile, 'a', phrases_file_data)
        except BaseException as ex:
            # Log and continue: one unreadable file must not stop the batch.
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + filepath + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(
                config.ConfigManager().LogFile, 'a', exception_message)
    # Persist run counter and the advanced doc-id back to their stores.
    data_read_count += 1
    utility.write_to_file(config.ConfigManager().ExecutioncountFile, 'w',
                          str(data_read_count))
    dictionaries.UpdateTemplateWhere['_id'] = configdocs[0]['_id']
    dictionaries.UpdateTemplateSet['docid_count'] = docid_count
    dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
    custom.update_data_to_Db_noupsert(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection,
        dictionaries.UpdateTemplateWhere, dictionaries.DBSet, connection)
def job_info_analysis(page, filepath, dbrecordcount):
    """Validate job records parsed from an XML page and stage good ones for DB insert.

    Each ``<record>`` element's ``jobdescription`` is checked for four defect
    classes (missing tag, None, empty/whitespace, too short, truncated with
    '...'); defective records are written to a report file and counted in the
    module-level stats globals. Valid records get a fresh ``doc_id`` and are
    bulk-inserted; the advanced doc-id counter is written back to the config
    collection.

    :param page: parsed XML element supporting ``findall('record')``.
    :param filepath: source file path, used for per-site statistics/reporting.
    :param dbrecordcount: running count of inserted records; returned updated.
    :returns: ``dbrecordcount`` incremented by the records staged for insert.
    """
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    # Fetching current config parameters
    configdocs = custom.retrieve_data_from_DB(int(config.ConfigManager().
                                                  MongoDBPort),
                                              config.ConfigManager().
                                              IntelligenceDb,
                                              config.ConfigManager().
                                              ConfigCollection)
    docid_count = int(configdocs[0]['docid_count'])
    dict_object_record_list = []
    for jobinfo in page.findall('record'):
        try:
            # creating dictionary from xml tag contents
            dict_object = utility.xml_to_dict(ET.tostring(jobinfo))
            totaljobsdict = fill_job_by_site(filepath)
            totalrecords += 1
            # outer if check is jobdescription tag is in the xml
            if 'jobdescription' in (dict_object['record']):
                # checking if job description is none
                if ((dict_object['record'])['jobdescription'] is not None):
                    # variable to determine if record needs to be
                    # updated in DB (0 = keep, 1 = reject)
                    incorrectjobdescription = 0
                    # checking if job description is empty
                    if (((dict_object['record'])['jobdescription'])
                            .strip()) == '':
                        write_fileinfo(filepath, dict_object)
                        invalidrecords += 1
                        emptydesc += 1
                        incorrectjobdescription = 1
                        jobsitedict = fill_job_site_data(filepath)
                    # checking if job desc has less than 20 chars
                    if (len(((dict_object['record'])['jobdescription'])
                            ) < 20):
                        incorrectjobdescription = 1
                        # eliminating the incomplete desc case: a short
                        # '...'-terminated desc is counted below instead
                        if (((dict_object['record'])['jobdescription'])
                                .strip()[-3:]) == '...':
                            print('Do nothing')
                        else:
                            write_fileinfo(filepath, dict_object)
                            invalidrecords += 1
                            smalldesc += 1
                            jobsitedict = fill_job_site_data(filepath)
                    # checking the incomplete (truncated) desc case
                    if (((dict_object['record'])['jobdescription'])
                            .strip()[-3:]) == '...':
                        incorrectjobdescription = 1
                        write_fileinfo(filepath, dict_object)
                        invalidrecords += 1
                        incompletedesc += 1
                        jobsitedict = fill_job_site_data(filepath)
                    # record passed all checks: assign id and stage for insert
                    if (incorrectjobdescription == 0):
                        docid_count += 1
                        (dict_object['record'])['doc_id'] = docid_count
                        (dict_object['record'])['description'] = ((dict_object['record'])['jobdescription'])
                        (dict_object['record'])['nounPhrases'] = ""
                        dict_object_record_list.append(dict_object['record'])
                        dbrecordcount += 1
                # checking if job description is none
                if (dict_object['record'])['jobdescription'] is None:
                    write_fileinfo(filepath, dict_object)
                    invalidrecords += 1
                    nonedesc += 1
                    jobsitedict = fill_job_site_data(filepath)
            else:
                # no jobdescription tag at all
                write_fileinfo(filepath, dict_object)
                invalidrecords += 1
                nodesc += 1
                jobsitedict = fill_job_site_data(filepath)
        except BaseException as ex:
            utility.log_exception_file(ex,
                                       dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table
    UpdateTemplateWhere = utility.clean_dict()
    UpdateTemplateSet = utility.clean_dict()
    UpdateTemplateWhere['_id'] = configdocs[0]['_id']
    UpdateTemplateSet['docid_count'] = docid_count
    DBSet = utility.clean_dict()
    DBSet['$set'] = UpdateTemplateSet
    # NOTE(review): `connection` is not defined in this function — it must be
    # a module-level global established elsewhere; confirm before reuse.
    custom.update_data_to_Db_noupsert(int(config.ConfigManager().MongoDBPort),
                                      config.ConfigManager().IntelligenceDb,
                                      config.ConfigManager().ConfigCollection,
                                      UpdateTemplateWhere, DBSet, connection)
    return dbrecordcount
def dbdata(connStr):
    """Export Smart Track candidate resumes to disk and track progress.

    Runs the configured candidate-resume SQL query (resuming from the last
    primary key stored in the Mongo config document), writes each resume
    blob to two locations (``ResumeDirectory`` and ``fileDirectory``), then
    records the newest primary key back into the config collection and logs
    how many resumes were picked up.

    :param connStr: ODBC connection string for the source SQL database.
    """
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'resume_extract running' + ' ' + str(datetime.datetime.now()))
    # Database name is embedded in the connection string; the query template
    # substitutes it via ##STDB##.
    st_db_name_list = utility.find_string_inbetween(
        config.ConfigManager().STConnStr, "DATABASE=", ";UID")
    cursor = dbmanager.cursor_odbc_connection(connStr)
    query = custom.fetch_query(
        config.ConfigManager().STCandidateResumesQueryId)
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection)
    query = query.replace(
        '##candidateResumeID##',
        str(configdocs[0]['STcandidateResumepk'])).replace(
        '##STDB##', st_db_name_list[0])
    db_data_dict = dbmanager.cursor_execute(cursor, query)
    db_data = db_data_dict['dbdata']
    db_data_cursorexec = db_data_dict['cursor_exec']
    column_headers = [column[0] for column in db_data_cursorexec.description]
    data_dict = {}
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    for row in db_data:
        try:
            data_dict = dict(utility.zip_list(column_headers, row))
            # First copy: keyed by candidateID.
            resumefilepath = config.ConfigManager().ResumeDirectory + '/' + '_' + \
                str(data_dict['candidateID']) + '_' + \
                str(data_dict['documentName'])
            # FIX: context managers — the original leaked both handles per
            # row; also dropped a dead `io.BytesIO(...)` copy of the blob.
            with open(resumefilepath, 'wb') as filewrite:
                filewrite.write(data_dict['ResumeFile'])
            # Second copy: keyed by candidateResumeID-supplierID.
            filefilepath = config.ConfigManager().fileDirectory + '/' + '_' + \
                str(data_dict['candidateResumeID']) + '-' + \
                str(data_dict['supplierID']) + '_' + \
                str(data_dict['documentName'])
            with open(filefilepath, 'wb') as file_write:
                file_write.write(data_dict['ResumeFile'])
        except BaseException as ex:
            # Log and continue; one bad resume must not abort the batch.
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)
    # data_dict still holds the last processed row; persist its primary key
    # so the next run resumes after it.
    if 'candidateResumeID' in data_dict:
        primary_key = data_dict['candidateResumeID']
        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet = utility.clean_dict()
        UpdateTemplateSet['STcandidateResumepk'] = primary_key
        resumeUpdateDelta = int(primary_key) - \
            int(configdocs[0]['STcandidateResumepk'])
        utility.write_to_file(
            config.ConfigManager().LogFile, 'a',
            'Resumes updated - ' + str(resumeUpdateDelta) + ' ' +
            str(datetime.datetime.now()))
        UpdateTemplateWhere['_id'] = configdocs[0]['_id']
        DBSet = utility.clean_dict()
        DBSet['$set'] = UpdateTemplateSet
        custom.update_data_to_Db_noupsert(
            int(config.ConfigManager().MongoDBPort),
            config.ConfigManager().DataCollectionDB,
            config.ConfigManager().ConfigCollection,
            UpdateTemplateWhere, DBSet, connection)
def requirement_update():
    """Sync candidate requirement/rate/status lists from Smart Track into MongoDB.

    Fetches candidate submissions created after the ``submissionsdateCreated``
    watermark in the Mongo config document, parses each candidate's
    '~!@-'-delimited "rate~!@-requirementId~!@-status" triples, updates the
    candidate document, advances the watermark, and finally regenerates the
    requirement/candidate file for every requirement id touched.
    """
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'requpdatefastest running' + ' ' + str(datetime.datetime.now()))
    recordnumber = 0
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().ConfigCollection)
    query = custom.fetch_query(
        config.ConfigManager().STCandidateSubmissionsQueryId)
    # The watermark may carry sub-second precision; SQL Server datetime only
    # accepts milliseconds, so fractional seconds are truncated to 3 digits.
    if '.' in str(configdocs[0]['submissionsdateCreated']):
        query = query.replace(
            '##dateCreated##',
            str(configdocs[0]['submissionsdateCreated']).split('.')[0] +
            ('.') +
            str(configdocs[0]['submissionsdateCreated']).split('.')[1][:3])
    else:
        query = query.replace('##dateCreated##',
                              str(configdocs[0]['submissionsdateCreated']))
    # query = query.replace('##dateCreated##', str((datetime.datetime.now() - datetime.timedelta(minutes=480))).split('.')[0] + ('.') + str((datetime.datetime.now() - datetime.timedelta(minutes=480))).split('.')[1][:3])
    print(query)
    cursor = dbmanager.cursor_odbc_connection(config.ConfigManager().STConnStr)
    db_data_dict = dbmanager.cursor_execute(cursor, query)
    db_data = db_data_dict['dbdata']
    db_data_cursorexec = db_data_dict['cursor_exec']
    cursor_description = db_data_cursorexec.description
    column_headers = [column[0] for column in cursor_description]
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    data_dict1 = {}
    req_list = []
    for row1 in db_data:
        try:
            # strtimestamp accumulates per-row timing marks (debug aid only).
            strtimestamp = str(datetime.datetime.now())
            recordnumber += 1
            print(recordnumber)
            data_dict1 = dict(utility.zip_list(column_headers, row1))
            if not (data_dict1['requirementID']).strip():
                # No requirements for this candidate: clear the lists.
                requirementIDListperCandidate = []
                reqratelist = []
                reqratestatdictlist = []
            else:
                reqratelist = []
                reqratestatdictlist = []
                # Comma-separated entries, one per requirement.
                requirementIDListperCandidate = [
                    reqID
                    for reqID in (data_dict1['requirementID']).split(',')
                ]
                for reqratecombID in requirementIDListperCandidate:
                    # Each entry is 'rate~!@-requirementId~!@-status';
                    # normalise the '~!@- ' spacing variant first.
                    reqratelistinit = [
                        reqrateID
                        for reqrateID in (reqratecombID.strip().replace(
                            '~!@- ', '~!@-')).split('~!@-')
                    ]
                    print(reqratelistinit[0])
                    reqratestatdict = {}
                    req_list.append(int(reqratelistinit[1]))
                    # A zero rate is stored as an empty string.
                    if reqratelistinit[0] == "0.00":
                        reqratelistinit[0] = ""
                    reqratestatdict['rate'] = reqratelistinit[0]
                    reqratestatdict['requirementId'] = int(reqratelistinit[1])
                    reqratestatdict['candidateStatus'] = reqratelistinit[2]
                    reqratelist.append(reqratelistinit)
                    reqratestatdictlist.append(reqratestatdict)
            strtimestamp += ' ' + str(datetime.datetime.now())
            UpdateTemplateSet = utility.clean_dict()
            UpdateTemplateWhere = utility.clean_dict()
            UpdateTemplateSet[
                'requirementIDList'] = reqratelist  # requirementIDListperCandidate
            UpdateTemplateSet[
                'requirementRateStatusList'] = reqratestatdictlist
            UpdateTemplateWhere['candidateid'] = data_dict1['CandidateID']
            UpdateTemplateWhere['documentType'] = 'candidate details'
            UpdateTemplateWhere['dataSource'] = 'Smart Track'
            DBSet = utility.clean_dict()
            DBSet['$set'] = UpdateTemplateSet
            print(UpdateTemplateSet['requirementRateStatusList'])
            custom.update_data_to_Db_noupsert(
                int(config.ConfigManager().MongoDBPort),
                config.ConfigManager().DataCollectionDB,
                config.ConfigManager().STCandidateCollection,
                UpdateTemplateWhere, DBSet, connection)
            strtimestamp += ' ' + str(datetime.datetime.now())
        except BaseException as ex:
            # Per-row failures are logged and the loop continues.
            utility.log_exception(ex)
    # Advance the watermark to the dateCreated of the last processed row.
    if 'dateCreated' in data_dict1:
        if not data_dict1['dateCreated'] is None:
            UpdateTemplateSet = utility.clean_dict()
            UpdateTemplateWhere = utility.clean_dict()
            UpdateTemplateSet['submissionsdateCreated'] = data_dict1[
                'dateCreated']
            UpdateTemplateWhere['_id'] = configdocs[0]['_id']
            DBSet = utility.clean_dict()
            DBSet['$set'] = UpdateTemplateSet
            custom.update_data_to_Db_noupsert(
                int(config.ConfigManager().MongoDBPort),
                config.ConfigManager().DataCollectionDB,
                config.ConfigManager().ConfigCollection,
                UpdateTemplateWhere, DBSet, connection)
    # Regenerate downstream files for the distinct requirement ids touched.
    if req_list:
        print(list(set(req_list)))
        utility.write_to_file(
            config.ConfigManager().LogFile, 'a',
            'requirement id list - ' +
            str(json.dumps(list(set(req_list)))) + ' ' +
            str(datetime.datetime.now()))
        db_requirements_candidate.generate_req_candidate_file_selected_req(
            list(set(req_list)))
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Number of records for which requirement list was updated - ' +
        str(recordnumber) + ' ' + str(datetime.datetime.now()))