Example no. 1
def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          ' master automation script running')
    try:
        utility.update_config_coll_process_started_date()
        # Supplier master list load
        exec(open('st_master_supplier_data_read.py').read(), globals())
        # Client master list load
        exec(open('stclientsdataread.py').read(), globals())
        # Currency master list load
        exec(open('currencydataread.py').read(), globals())
        # Industry master list load
        exec(open('industrydataread.py').read(), globals())
        # MSP master list load
        exec(open('stmspdataread.py').read(), globals())
        # Rates information transfer from Smart Track
        exec(open('stratesdataread.py').read(), globals())
        # PromptCloud data load automation
        exec(open('prompt_cloud_automation.py').read(), globals())
        # Transferring files from staging collection to masters collection
        exec(open('staging_data_read.py').read(), globals())
        # Generating master integer graph
        exec(open('gen_docintgraph_from_db.py').read(), globals())
        # Transferring file to webserver
        exec(open('master_int_graph_transfer.py').read(), globals())
        # Learning automation
        exec(open('knowledge_build_automation.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(ex, config.ConfigManager().LogFile)
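The function above chains the stage scripts by exec-ing their source into the current globals. As a hedged alternative sketch (not the project's code), the same chaining could be written with the standard-library runpy module; the utility and config helpers and the script names are assumed from the listing above.

import runpy

import config
import utility

# Hypothetical stage list mirroring the exec(...) calls in automate_processes()
PIPELINE_SCRIPTS = [
    'st_master_supplier_data_read.py',
    'stclientsdataread.py',
    'currencydataread.py',
]


def run_pipeline(scripts=PIPELINE_SCRIPTS):
    for script in scripts:
        try:
            # run each stage as if it were executed as a standalone script
            runpy.run_path(script, run_name='__main__')
        except BaseException as ex:
            utility.log_exception_file(ex, config.ConfigManager().LogFile)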
def modifygeodata():
    ratesData = mastercoll.find({})
    for row in ratesData:
        try:
            if row['cityLocationFlag'] == 1:
                cityGeoLocation = []
                cityGeoLocation.append(float(row['cityLongitude']))
                cityGeoLocation.append(float(row['cityLatitude']))
                row['coordinates'] = cityGeoLocation
                mastercoll.update({"doc_id": row['doc_id']},
                                  {"$set": {
                                      "coordinates": cityGeoLocation
                                  }})


# if row['stateLocationFlag'] == 1:
#     stateGeoLocation = []
#     stateGeoLocation.append(float(row['stateLatitude']))
#     stateGeoLocation.append(float(row['stateLongitude']))
#     row['stateGeoLocation'] = stateGeoLocation
#     # print(row,"\n")
#     mastercoll.update({"doc_id": row['doc_id']},
#    {"$set": {"stateGeoLocation": stateGeoLocation}})

        except BaseException as ex:
            utility.log_exception_file(ex, config.ConfigManager().LogFile)
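modifygeodata() stores coordinates as [longitude, latitude], which is the order a MongoDB 2dsphere index expects. A minimal sketch, assuming the same mastercoll collection and pymongo, of indexing and querying those points (the query point is an illustrative value, not taken from the data):

from pymongo import GEOSPHERE

# index the [longitude, latitude] pairs written by modifygeodata()
mastercoll.create_index([("coordinates", GEOSPHERE)])

# illustrative query: documents within ~50 km of a point
nearby = mastercoll.find({
    "coordinates": {
        "$near": {
            "$geometry": {"type": "Point", "coordinates": [-73.99, 40.73]},
            "$maxDistance": 50000,  # metres
        }
    }
})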
def pc_rates_data_storage(page_dict_object, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    dict_object_record_list = []
    try:
        page_object_list = page_dict_object['page']
        if isinstance(page_object_list['record'], list):
            for record_object in page_object_list['record']:
                record_object = pc_rates_add_fields(record_object, filepath)
                if sys.getsizeof(record_object['description']) < 13000000:
                    dict_object_record_list.append(record_object)
                dbrecordcount += 1
        else:
            record_object = page_object_list['record']
            record_object = pc_rates_add_fields(record_object, filepath)

            if sys.getsizeof(record_object['description']) < 13000000:
                dict_object_record_list.append(record_object)
            dbrecordcount += 1
    except BaseException as ex:
        utility.log_exception_file(ex,
                                   config.ConfigManager().PromptcloudLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table

    return dbrecordcount
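The sys.getsizeof() guard above approximates MongoDB's 16 MB document limit by measuring only the Python string object holding the description. A hedged sketch of an exact check, assuming pymongo's bson module and a BSON-serializable record:

import bson

MAX_BSON_BYTES = 16 * 1024 * 1024  # MongoDB's per-document limit


def fits_in_mongo(record_object):
    # encode the whole record the same way the driver would before inserting
    return len(bson.encode(record_object)) < MAX_BSON_BYTES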
def job_info_analysis(page, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict

    dict_object_record_list = []
    for jobinfo in page.findall('record'):
        try:
            # creating dictionary from xml tag contents
            dict_object = utility.xml_to_dict(ET.tostring(jobinfo))
            # totaljobsdict = fill_job_by_site(filepath)
            # totalrecords += 1

            # outer if checks whether the jobdescription tag is in the xml
            if 'jobdescription' in (dict_object['record']):
                # checking whether the job description is None
                if ((dict_object['record'])['jobdescription'] is not None):

                    incorrectjobdescription = 0

                    if (((dict_object['record'])['jobdescription']).strip()
                        ) == '':
                        incorrectjobdescription = 1

                    if (len(((dict_object['record'])['jobdescription'])) < 20):
                        incorrectjobdescription = 1

                    if (((dict_object['record'])['jobdescription']
                         ).strip()[-3:]) == '...':
                        incorrectjobdescription = 1

                    if (incorrectjobdescription == 0):
                        (dict_object['record']
                         )['dateCreated'] = datetime.datetime.now()
                        (dict_object['record']
                         )['dateModified'] = datetime.datetime.now()
                        (dict_object['record'])['createdUser'] = '******'
                        (dict_object['record'])['modifiedUser'] = '******'
                        (dict_object['record'])['source'] = 'PromptCloud'
                        #(dict_object['record'])['Url'] = page['pageurl']
                        dict_object_record_list.append(dict_object['record'])
                        dbrecordcount += 1

        except BaseException as ex:
            utility.log_exception_file(
                ex,
                dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table

    return dbrecordcount
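The three description checks used here (empty after stripping, shorter than 20 characters, or truncated with a trailing '...') reappear in several of these examples. An illustrative helper, not part of the original scripts, that collapses them into one predicate:

def is_valid_jobdescription(record):
    """Return True when the record's jobdescription passes the checks above."""
    desc = record.get('jobdescription')
    if desc is None:
        return False
    stripped = desc.strip()
    if stripped == '' or len(desc) < 20 or stripped[-3:] == '...':
        return False
    return True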
def job_info_analysis_storage(page_dict_object, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict

    dict_object_record_list = []
    try:
        dict_object = page_dict_object['page']
        # outer if checks whether the jobdescription tag is in the xml
        if 'jobdescription' in (dict_object['record']):
            # checking whether the job description is None
            if ((dict_object['record'])['jobdescription'] is not None):

                incorrectjobdescription = 0

                if (((dict_object['record'])['jobdescription']).strip()) == '':
                    incorrectjobdescription = 1

                if (len(((dict_object['record'])['jobdescription'])) < 20):
                    incorrectjobdescription = 1

                if (((dict_object['record'])['jobdescription']).strip()[-3:]
                    ) == '...':
                    incorrectjobdescription = 1

                if (incorrectjobdescription == 0):
                    (dict_object['record']
                     )['dateCreated'] = datetime.datetime.now()
                    (dict_object['record']
                     )['dateModified'] = datetime.datetime.now()
                    (dict_object['record'])['createdUser'] = '******'
                    (dict_object['record'])['modifiedUser'] = '******'
                    (dict_object['record'])['source'] = 'PromptCloud'
                    (dict_object['record'])['Url'] = dict_object['pageurl']
                    (dict_object['record'])['fileName'] = filepath.replace(
                        config.ConfigManager().PCFileFolder + '/', '')
                    dict_object_record_list.append(dict_object['record'])
                    dbrecordcount += 1

    except BaseException as ex:
        utility.log_exception_file(
            ex,
            dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table

    return dbrecordcount
def sendflagdetailstoexternalsystem():
    flagdetailsdata = flagdetailscollection.find({"isSent": 0}, {
        "_id": 0,
        "isSent": 0
    })
    flagdetailsList = []
    for data in flagdetailsdata:
        flagdetailsList.append(data)
    if flagdetailsList:
        headers = {"Content-Type": "application/json"}
        flagdetailsList = json.dumps(flagdetailsList).encode('utf8')
        conn = http.client.HTTPConnection(config.ConfigManager().STwebApiHost,
                                          "80")
        conn.request(config.ConfigManager().JobServerMethod,
                     config.ConfigManager().stWebApiSendData, flagdetailsList,
                     headers)
        # conn = http.client.HTTPConnection("localhost", "4400")
        # conn.request(config.ConfigManager().JobServerMethod, config.ConfigManager().stWebApiSendData, flagdetailsList, headers)
        response = conn.getresponse()
        try:
            if response.status == 200:
                data = response.read()
                result = json.loads(data.decode('utf8'))
                # resumeIds = result[0]["resumeID"].split(',')
                resumeIds = [str(x.strip()) for x in result]
                if resumeIds:
                    flagdetailscollection.update(
                        {"batchId": {
                            "$in": resumeIds
                        }}, {"$set": {
                            "isSent": 1
                        }},
                        multi=True)
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, ' +
                        str(len(resumeIds)) +
                        ' resume detection details sent successfully' + ' ' +
                        str(datetime.datetime.now()))
                else:
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, no resume detection update done'
                        + ' ' + str(datetime.datetime.now()))
            else:
                ex = str(response.status) + "--" + str(response.reason)
                utility.log_exception_file(ex, file)
                utility.write_to_file(
                    config.ConfigManager().LogFile, 'a',
                    'ST candidate resume screening, API down time' + ' ' +
                    str(datetime.datetime.now()))
        except BaseException as ex:
            utility.log_exception_file(ex, file)
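The function above marks processed batches with flagdetailscollection.update(..., multi=True), which is the pre-3.x pymongo API; on pymongo 3+ the equivalent call is update_many. A minimal sketch, assuming the same flagdetailscollection and resumeIds:

result = flagdetailscollection.update_many(
    {"batchId": {"$in": resumeIds}},
    {"$set": {"isSent": 1}},
)
# modified_count reports how many flag-detail documents were marked as sent
print(result.modified_count)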
def automate_processes():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Knowledge build automation running..! ' +
        str(datetime.datetime.now()))
    try:
        # Copies files from the previous cycle
        exec(open('filecopy.py').read(), globals())
        # Copy the noun phrase text from Mongo DB
        exec(open('dbtophrasefile.py').read(), globals())
        # Remove n-grams of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Check whether an existing semantic graph is present; if so, load it
        # and update it with the new documents, otherwise create a new
        # semantic graph and store it. Normally this runs after n-gram
        # removal and duplicate removal.
        exec(open('dcrgraphgenerator.py').read(), globals())
        # Read the semantic graph saved by dcrgraphgenerator.py, read the
        # document phrase file, and create the optimized integer semantic
        # edge file.
        exec(open('dcrgraphcompactor.py').read(), globals())
        # Save the node dictionary to a file using pickle. This will be used
        # by the above programs for finding node ids.
        exec(open('savenodes.py').read(), globals())
        # Generate document integer graph and store. This will be used for
        # searching the documents.
        # exec(open('dcrdocumentintgraphgenerator.py').read(), globals())
        # Copy the noun phrase text from Mongo DB (Intelligence collection)
        exec(open('stdbtophrasefile.py').read(), globals())
        # Remove n-grams of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Check whether an existing semantic graph is present; if so, load it
        # and update it with the new documents, otherwise create a new
        # semantic graph and store it. Normally this runs after n-gram
        # removal and duplicate removal.
        exec(open('stdcrgraphgenerator.py').read(), globals())
        # Read the semantic graph saved by dcrgraphgenerator.py, read the
        # document phrase file, and create the optimized integer semantic
        # edge file.
        exec(open('stdcrgraphcompactor.py').read(), globals())
        # Save the node dictionary to a file using pickle. This will be used
        # by the above programs for finding node ids.
        exec(open('savenodes.py').read(), globals())
        # Transfer generated intelligence files
        exec(open('filetransfer.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(
            ex,
            dcrconfig.ConfigManager().SemanticGraphLogFile)
def sendtoexternalsystem():
    resumeData = collection.find({"isSent": 0}, {"_id": 0, "isSent": 0})
    candidateFlags = []
    for data in resumeData:
        data["fileName"] = " "
        candidateFlags.append(data)
    if candidateFlags:
        headers = {"Content-Type": "application/json"}
        candidateFlags = json.dumps(candidateFlags).encode('utf8')
        conn = http.client.HTTPConnection(config.ConfigManager().STwebApiHost,
                                          "80")
        conn.request(config.ConfigManager().JobServerMethod,
                     config.ConfigManager().STwebApiUrl, candidateFlags,
                     headers)
        response = conn.getresponse()
        try:
            if response.status == 200:
                data = response.read()
                result = json.loads(data.decode('utf8'))
                resumeIds = result[0]["resumeID"].split(',')
                resumeIds = [str(x.strip()) for x in resumeIds]
                print(resumeIds)
                if resumeIds:
                    collection.update({"batchId": {
                        "$in": resumeIds
                    }}, {"$set": {
                        "isSent": 1
                    }},
                                      multi=True)
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, ' +
                        str(len(resumeIds)) + ' flag(s) sent successfully' +
                        ' ' + str(datetime.datetime.now()))
                else:
                    utility.write_to_file(
                        config.ConfigManager().LogFile, 'a',
                        'ST candidate resume screening, no candidates found' +
                        ' ' + str(datetime.datetime.now()))
            else:
                ex = str(response.status) + "--" + str(response.reason)
                utility.log_exception_file(ex, file)
                utility.write_to_file(
                    config.ConfigManager().LogFile, 'a',
                    'ST candidate resume screening, API down time' + ' ' +
                    str(datetime.datetime.now()))
        except BaseException as ex:
            utility.log_exception_file(ex, file)
def process_staging_row(row):
    try:
        global dataList
        "Step:1 data scrubbing for email,phone,url and candidate name"
        row = dataclean(row)

        "Step:2 nounphrases generation"
        row = generatenounphrases(row)

        "Step:3 signature generation"
        row = signaturegraph(row)

        "Step:4 rates calculation"
        row = rates_calculation.billratescalculation(row)

        # Put rate value calculation before this check
        "Step:5 verification of rate availability"
        row = rate_available(row)

        # geographical data check and additions
        row = custom.geo_data_check(row, row['countryDictList'], 'country')
        row = custom.geo_data_check(row, row['stateDictList'], 'state')
        row = custom.geo_data_check(row, row['cityDictList'], 'city')
        row = custom.geo_data_check(row, row['zipCodeDictList'], 'zipCode')
        del row['countryDictList']
        del row['stateDictList']
        del row['cityDictList']
        del row['zipCodeDictList']

        dataList.append(row)
        if row['i'] % int(
                config.ConfigManager().StagingMasterTransferStep) == 0:
            stagingDateModified = row['stagingDateModified']
            del row['stagingDateModified']
            objectid = row['objectid']
            del row['objectid']
            del row['i']
            "Step:4 insert data to db"
            mastercoll.insert(dataList)
            dataList = []
            docid = row['doc_id']

            "Step:5 update config collection with doc_id and datetime"
            updateconfigcollection(docid, stagingDateModified, objectid)

    except BaseException as ex:
        utility.log_exception_file(ex, config.ConfigManager().LogFile)
Example no. 10
def update_graph():
    '''Load the existing graph and update with new set of job description
    from predefined locations based on the application.ini file'''
    semantic_graph = load_graph()

    phrase_file = open(dcrconfig.ConfigManager().DistinctPhraseFile, 'r')
    '''Get the config values'''
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    graph_filter_weight = dcrconfig.ConfigManager().FilterGraphEdgeWeight
    print("weight:%d filter weight: %d" % (graph_weight, graph_filter_weight))

    # graph_collection = []
    jdcount = 0

    for line in phrase_file:
        try:
            line = line.strip()

            if not (line.startswith('--') or len(line.strip()) < 1):
                graph = dcrgraph.create_graph_distant_neighbors(line, graph_weight)
                dcrgraph.union_graph(semantic_graph, graph, graph_weight)
                jdcount += 1
            elif (line.startswith('--')):
                ''' If the line starts with -- then it is the beginning of a
                    job description, so print a dot to indicate progress '''
                print('.', end='')
                if jdcount % 1000 == 0:
                    print('%d' % jdcount)
                sys.stdout.flush()
        except BaseException as ex:
            utility.log_exception_file(ex, dcrconfig.ConfigManager().SemanticGraphLogFile)

    count = list((d['weight']) for u, v, d in
                 semantic_graph.edges_iter(data=True)
                 if d['weight'] > graph_filter_weight)

    ''' nx.write_gexf(semantic_graph,
                    dcrconfig.ConfigManager().SemanticGraphFile)'''
    mx = max(d for d in count)
    print('mx : %d, total jd processed : %d ' % (mx, jdcount))
    print('Semantic Graph Info: %s' % nx.info(semantic_graph))
    return semantic_graph
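update_graph() iterates edges with edges_iter(), which exists only in NetworkX 1.x; on NetworkX 2.x the same filtered weight list can be built with edges(data=True). A small sketch of the 2.x form, assuming the same semantic_graph and graph_filter_weight:

# NetworkX 2.x: Graph.edges_iter was removed; edges(data=True) is equivalent
count = [d['weight'] for u, v, d in semantic_graph.edges(data=True)
         if d['weight'] > graph_filter_weight]
mx = max(count)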
def automate_processes():
    utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                          'PromptCloud automation script running')
    try:
        # download files (XML format) into PCCompData within mnt/nlpdata
        exec(
            open('rates_pc_download_crawldata_threading.py').read(), globals())
        # compress the PCCompdata folder
        exec(open('compress.py').read(), globals())
        # unzip files into the PCData folder; the last-processed time is
        # stored in dataloadconfig
        exec(open('pc_rates_unzip_gz.py').read(), globals())
        # download data into pcdataanalysisresults.ods
        exec(open('pc_rates_dataload.py').read(), globals())
        # for automatically sending emails
        # exec(open('mailsend.py').read(), globals())
        # store analysis file in s3 backup
        # exec(open('pcdataanalysisbackup.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(ex,
                                   config.ConfigManager().PromptcloudLogFile)
Example no. 12
def datamasking(row):
    maskingText = makingjsondata(row)
    maskingText = json.dumps(maskingText)
    headers = {"Content-Type": "application/json"}
    conn = http.client.HTTPConnection(config.ConfigManager().Host,
                                      config.ConfigManager().Port)
    conn.request(config.ConfigManager().JobServerMethod,
                 config.ConfigManager().API, maskingText, headers)
    response = conn.getresponse()
    data = response.read()
    result = json.loads(data.decode('utf8'))
    try:
        row['supplierName'] = result['supplierName']
        row['clientId'] = result['clientId']
        row['mspId'] = result['mspId']
        row['dataSource'] = result['dataSource']
        # row['source'] = result['source']
    except BaseException as ex:
        print(ex)
        utility.log_exception_file(ex, file)
    conn.close()
    return row
import utility
import datetime
import config
import dcrgraph
from pymongo import MongoClient

utility.write_to_file(
    config.ConfigManager().LogFile, 'a',
    'Document integer graph from DB!(gen_docintgraph_from_db.py)' +
    str(datetime.datetime.now()))
cl = MongoClient(config.ConfigManager().MongoClient.replace(
    "##host##", config.ConfigManager().mongoDBHost))
db = cl[config.ConfigManager().RatesDB]
mastercoll = db[config.ConfigManager().masterCollection]
ratesConfig = db[config.ConfigManager().RatesConfigCollection]

ratesConfigValues = ratesConfig.find({})
masterDateCreated = ratesConfigValues[0]['masterDateModified']
masterDateCreatedList = []


for doc in mastercoll.find({"dateCreated": {"$gt": masterDateCreated}}):
    try:
        if not doc["signGraph"] == "":
            dcrgraph.generate_document_integer_graph_fromdb(
                doc["signGraph"], doc['doc_id'], 'a',
                config.ConfigManager().masterDocumentIntegerFile)
            masterDateCreatedList.append(doc["dateCreated"])
    except BaseException as ex:
        utility.log_exception_file(str(ex), config.ConfigManager().LogFile)

try:
    if masterDateCreatedList:
        masterDateCreatedLatest = max(masterDateCreatedList)
        ratesConfig.update(
            {"_id": ratesConfigValues[0]['_id']},
            {"$set": {"masterDateModified": masterDateCreatedLatest}})
except BaseException as ex:
    utility.log_exception_file(str(ex), config.ConfigManager().LogFile)
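This script follows a watermark pattern: read the stored masterDateModified, process only documents created after it, then advance the watermark to the newest dateCreated actually processed. A compact sketch of the same pattern with the pymongo 3+ API (find_one/update_one), using the collections defined above:

cfg = ratesConfig.find_one()
watermark = cfg['masterDateModified']
newest = watermark

for doc in mastercoll.find({"dateCreated": {"$gt": watermark}}):
    # ... process doc ...
    newest = max(newest, doc["dateCreated"])

ratesConfig.update_one({"_id": cfg["_id"]},
                       {"$set": {"masterDateModified": newest}})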
Example no. 14
def readstagingdata():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging dataread running' + ' ' + str(datetime.datetime.now()))
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    ratesData = stagingcoll.find({'dateModified': {
        "$gt": ratesDate
    }},
                                 no_cursor_timeout=True)
    doc_id = ratesConfigValues[0]['masterDocId']
    objectid = ratesConfigValues[0]['_id']
    dataList = []
    dateModifiedList = []
    geoCountryQuery = "select distinct iso_alpha2, name, iso_alpha3, fips_code from geo_country order by name"
    geoStateQuery = "select ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid"
    geoCityQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPlaceName, fLatitude, fLongitude from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPostalCode, fLatitude, fLongitude from GeoPostal  order by sPostalCode"
    countryDictList = custom.create_sql_dict_list(
        geoCountryQuery,
        config.ConfigManager().geographicalDataConnstr)
    stateDictList = custom.create_sql_dict_list(
        geoStateQuery,
        config.ConfigManager().geographicalDataConnstr)
    cityDictList = custom.create_sql_dict_list(
        geoCityQuery,
        config.ConfigManager().geographicalDataConnstr)
    zipCodeDictList = custom.create_sql_dict_list(
        geoZipCodeQuery,
        config.ConfigManager().geographicalDataConnstr)
    i = 0
    for row in ratesData:
        try:
            dateModifiedList.append(row['dateModified'])
            i += 1
            del row['_id']
            doc_id += 1
            row['doc_id'] = doc_id

            "Step:1 data scrubbing for email,phone,url and candidate name"
            row = dataclean(row)

            "Step:2 nounphrases generation"
            row = generatenounphrases(row)

            "Step:3 signature generation"
            row = signaturegraph(row)

            "Step:4 rates calculation"
            row = rates_calculation.billratescalculation(row)

            # Put rate value calculation before this check
            "Step:5 verification of rate availability"
            row = rate_available(row)

            # "Step:5 verification of location/city availability"
            # row = location_available(row)

            # "Step:6 get lat long of city"
            # row = get_lat_long_of_city(row)

            # geographical data check and additions
            row['iso_alpha2_value'] = ')(*&^'
            row['admin1_value'] = ')(*&^'
            row['state_name'] = ')(*&^'
            row = custom.geo_data_check(row, countryDictList, 'country')
            row = custom.geo_data_check(row, stateDictList, 'state')
            row = custom.geo_data_check(row, cityDictList, 'city')
            row = custom.geo_data_check(row, zipCodeDictList, 'zipCode')
            del row['iso_alpha2_value']
            del row['admin1_value']
            del row['state_name']

            dataList.append(row)
            if i >= int(config.ConfigManager().StagingMasterTransferStep):
                "Step:4 insert data to db"
                mastercoll.insert(dataList)
                dataList = []
                i = 0
                docid = row['doc_id']
                stagingDateModified = max(dateModifiedList)
                "Step:5 update config collection with doc_id and datetime"
                updateconfigcollection(docid, stagingDateModified, objectid)

        except BaseException as ex:
            print(ex)
            utility.log_exception_file(ex, config.ConfigManager().LogFile)
            # utility.log_exception_file(row, config.ConfigManager().LogFile)

    ratesData.close()
    del ratesData
                "$ne": "Smart Track"
            }
        }]
}).sort([("doc_id", 1)]).limit(20000):
    try:
        allphrases = ''
        phrases = doc['nounPhrases']
        docId = int(doc['doc_id'])
        docIdList.append(docId)
        jobUniqueId = '-' * 3 + str(docId) + '-' * 3
        allphrases += '\n' + jobUniqueId + '\n' + phrases
        print(allphrases, file=jobDescPhrasesFile)
        jcount += 1

        #  Print status
        print('.', end='')
        sys.stdout.flush()
    except BaseException as ex:
        utility.log_exception_file(
            ex,
            dcrconfig.ConfigManager().SemanticGraphLogFile)

# Updating maximum value in order to take delta in next cycle
if docIdList:
    configcol.update({"_id": configdocs[0]['_id']},
                     {"$set": {
                         "dataDbToPhraseDocId": max(docIdList)
                     }})

print("total documents processed: " + str(jcount))
Example no. 16
def staging_data_load(filepath):
    dataList = []
    errorList = []
    errorString = ""
    errorFinalString = ""
    excel_file = open_workbook(filepath)
    sheet = excel_file.sheets()[0]
    header_keys = [
        sheet.cell(0, colindex).value for colindex in range(sheet.ncols)
    ]
    stTypeofServiceRows = [x for x in stTypeofServiceCollection.find({})]
    stLaborCategoryRows = [x for x in stLaborCategoryCollection.find({})]
    industryRows = [x for x in industryCollection.find({})]
    currencyRows = [x for x in currencyCollection.find({})]
    stClientRows = [x for x in stClientsCollection.find({})]
    stMspRows = [x for x in stMspCollection.find({})]
    yesNoRows = [{'yesNo': 'Yes'}, {'yesNo': 'No'}]
    geoCountryQuery = "select distinct name from geo_country order by name"
    geoStateQuery = "select distinct name from geo_admin1 order by name"
    geoCityQuery = "select distinct sPlaceName from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct  sPostalCode from GeoPostal  order by sPostalCode"
    countryList = create_sql_data_list(
        geoCountryQuery,
        config.ConfigManager().geographicalDataConnstr, 'name')
    stateList = create_sql_data_list(
        geoStateQuery,
        config.ConfigManager().geographicalDataConnstr, 'name')
    cityList = create_sql_data_list(
        geoCityQuery,
        config.ConfigManager().geographicalDataConnstr, 'sPlaceName')
    zipCodeList = create_sql_data_list(
        geoZipCodeQuery,
        config.ConfigManager().geographicalDataConnstr, 'sPostalCode')
    for rowindex in range(1, sheet.nrows):
        try:
            errorString = ""
            row_dict = {
                header_keys[colindex]: sheet.cell(rowindex, colindex).value
                for colindex in range(sheet.ncols)
            }
            row_dict = add_fields(row_dict)
            mandatoryFieldsPresent = mandatory_fields_check(
                row_dict, mandatoryFields)
            if mandatoryFieldsPresent:
                mandatoryFieldsValuePresent = mandatory_fields_value_presence_check(
                    row_dict, mandatoryFields)
                if mandatoryFieldsValuePresent:
                    stTypeofServiceValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'typeOfService', stTypeofServiceRows,
                        "VMSTypeofService")
                    stLaborCategoryValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'laborCategory', stLaborCategoryRows,
                        "LaborCategory")
                    industryValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'industry', industryRows, "IndustryName")
                    currencyValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'currency', currencyRows, "currencyCode")
                    clientValueAccuracyCheck = numerical_value_accuracy_check(
                        row_dict, 'clientId', stClientRows, "clientID")
                    mspValueAccuracyCheck = numerical_value_accuracy_check(
                        row_dict, 'mspId', stMspRows, "mspID")
                    rVFlagValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'remoteOrVirtualFlag', yesNoRows, "yesNo")
                    fpTFlagValueAccuracyCheck = value_accuracy_check(
                        row_dict, 'fullTime', yesNoRows, "yesNo")
                    geoCountryAccuracyCheck = geo_data_check(
                        row_dict, countryList, 'country')
                    geoStateAccuracyCheck = geo_data_check(
                        row_dict, stateList, 'state')
                    geoCityAccuracyCheck = geo_data_check(
                        row_dict, cityList, 'city')
                    geoZipCodeAccuracyCheck = geo_data_check(
                        row_dict, zipCodeList, 'zipCode')
                    numericalValidationList = numerical_validation(
                        row_dict, errorString)
                    errorString = numericalValidationList[1]
                    numericalValidation = numericalValidationList[0]
                    dateFormatValidation1 = post_date_format_check(
                        row_dict, errorString, excel_file)
                    dateFormatValidation2 = post_date_format_check_two(
                        row_dict, errorString)
                    if not dateFormatValidation1 and not dateFormatValidation2:
                        dateFormatValidation = False
                        errorString += 'Post date is not in the right format; '
                    else:
                        dateFormatValidation = True

                    if dateFormatValidation1:
                        row_dict['postDate'] = (datetime.datetime(
                            *xlrd.xldate_as_tuple(row_dict['postDate'],
                                                  excel_file.datemode))
                                                ).date().isoformat()
                    if dateFormatValidation2:
                        row_dict['postDate'] = (datetime.datetime.strptime(
                            str(row_dict['postDate']),
                            '%Y-%m-%d')).date().isoformat()
                    if stTypeofServiceValueAccuracyCheck and stLaborCategoryValueAccuracyCheck and industryValueAccuracyCheck and currencyValueAccuracyCheck \
                       and geoCountryAccuracyCheck and geoStateAccuracyCheck and geoCityAccuracyCheck and geoZipCodeAccuracyCheck and clientValueAccuracyCheck\
                       and mspValueAccuracyCheck and numericalValidation and rVFlagValueAccuracyCheck and fpTFlagValueAccuracyCheck and dateFormatValidation:
                        dataList.append(row_dict)
                    else:
                        errorString = master_list_mismatch_message_composition(
                            errorString, stTypeofServiceValueAccuracyCheck,
                            stLaborCategoryValueAccuracyCheck,
                            industryValueAccuracyCheck,
                            currencyValueAccuracyCheck,
                            clientValueAccuracyCheck, mspValueAccuracyCheck,
                            rVFlagValueAccuracyCheck,
                            fpTFlagValueAccuracyCheck, geoCountryAccuracyCheck,
                            geoStateAccuracyCheck, geoCityAccuracyCheck,
                            geoZipCodeAccuracyCheck)

                else:
                    errorString += 'Mandatory fields are empty; '
            else:
                errorString += 'Mandatory fields are absent; '
            errorString = error_string_clean(errorString)
            if not errorString == "":
                errorString = 'Errors in row ' + str(rowindex +
                                                     1) + ' - ' + errorString
                errorList.append(errorString)
        except BaseException as ex:
            errorString = 'Errors in row ' + str(
                rowindex + 1) + ' - ' + 'exception!!; ' + errorString
            errorList.append(errorString)
            utility.log_exception_file(ex, config.ConfigManager().LogFile)

    try:
        # file_back_up_and_removal(filepath)
        pass
    except BaseException as ex:
        utility.log_exception_file(ex, config.ConfigManager().LogFile)

    try:
        if dataList:
            insert_to_db(dataList, stagingCollection)
            if errorList:
                errorList.insert(
                    0,
                    'Data submitted successfully! Please upload a brand new file after correcting the following errors.'
                )
            else:
                errorList.insert(0, 'Data submitted successfully!')
        errorFinalString = '|!@#$%|'.join(errorList)
        print(errorFinalString)
    except BaseException as ex:
        print('Exception during data load!')
        utility.log_exception_file(ex, config.ConfigManager().LogFile)
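staging_data_load() converts Excel post dates with xlrd.xldate_as_tuple, which needs the workbook's datemode to interpret the raw float. A standalone sketch of that conversion with an illustrative serial value and a hypothetical file name:

import datetime

import xlrd

excel_file = xlrd.open_workbook('staging_upload.xls')  # hypothetical file
serial = 43466.0  # an Excel date serial number (illustrative)
post_date = datetime.datetime(
    *xlrd.xldate_as_tuple(serial, excel_file.datemode)).date().isoformat()
print(post_date)  # e.g. '2019-01-01' in the default 1900 date system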
def job_info_analysis(page, filepath, dbrecordcount):
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict

    # Fetching current config parameters
    configdocs = custom.retrieve_data_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().IntelligenceDb,
        config.ConfigManager().ConfigCollection)

    docid_count = int(configdocs[0]['docid_count'])
    dict_object_record_list = []
    for jobinfo in page.findall('record'):
        try:
            # creating dictionary from xml tag contents
            dict_object = utility.xml_to_dict(ET.tostring(jobinfo))
            totaljobsdict = fill_job_by_site(filepath)
            totalrecords += 1

            # outer if checks whether the jobdescription tag is in the xml
            if 'jobdescription' in (dict_object['record']):
                # checking whether the job description is None
                if ((dict_object['record'])['jobdescription'] is not None):
                    # variable to determine if record needs to be
                    # updated in DB
                    incorrectjobdescription = 0

                    # checking if job description is empty
                    if (((dict_object['record'])['jobdescription'])
                            .strip()) == '':
                        write_fileinfo(filepath, dict_object)
                        invalidrecords += 1
                        emptydesc += 1
                        incorrectjobdescription = 1
                        jobsitedict = fill_job_site_data(filepath)

                    # checking if job desc has less than 20 chars
                    if (len(((dict_object['record'])['jobdescription'])) < 20):
                        incorrectjobdescription = 1
                        # eliminating the incomplete desc case
                        if (((dict_object['record'])['jobdescription'])
                                .strip()[-3:]) == '...':
                            print('Do nothing')
                        else:
                            write_fileinfo(filepath, dict_object)
                            invalidrecords += 1
                            smalldesc += 1
                            jobsitedict = fill_job_site_data(filepath)

                    # checking the incomplete desc case
                    if (((dict_object['record'])['jobdescription'])
                            .strip()[-3:]) == '...':
                        incorrectjobdescription = 1
                        write_fileinfo(filepath, dict_object)
                        invalidrecords += 1
                        incompletedesc += 1
                        jobsitedict = fill_job_site_data(filepath)

                    if (incorrectjobdescription == 0):
                        docid_count += 1
                        (dict_object['record'])['doc_id'] = docid_count
                        (dict_object['record'])['description'] = (
                            (dict_object['record'])['jobdescription'])
                        (dict_object['record'])['nounPhrases'] = ""
                        dict_object_record_list.append(dict_object['record'])
                        dbrecordcount += 1

                # checking whether the job description is None
                if (dict_object['record'])['jobdescription'] is None:
                    write_fileinfo(filepath, dict_object)
                    invalidrecords += 1
                    nonedesc += 1
                    jobsitedict = fill_job_site_data(filepath)

            else:
                write_fileinfo(filepath, dict_object)
                invalidrecords += 1
                nodesc += 1
                jobsitedict = fill_job_site_data(filepath)

        except BaseException as ex:
            utility.log_exception_file(
                ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
    # updating doc_id in config table
    UpdateTemplateWhere = utility.clean_dict()
    UpdateTemplateSet = utility.clean_dict()
    UpdateTemplateWhere['_id'] = configdocs[0]['_id']
    UpdateTemplateSet['docid_count'] = docid_count
    DBSet = utility.clean_dict()
    DBSet['$set'] = UpdateTemplateSet
    custom.update_data_to_Db_noupsert(int(config.ConfigManager().MongoDBPort),
                                      config.ConfigManager().IntelligenceDb,
                                      config.ConfigManager().ConfigCollection,
                                      UpdateTemplateWhere, DBSet, connection)
    return dbrecordcount
def readstagingdata():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging dataread running' + ' ' + str(datetime.datetime.now()))
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    # ratesDataDateMax = ((stagingcoll.find().sort([('dateModified', -1)]).limit(1))[0])['dateModified']
    ratesDataCount = (stagingcoll.count({'dateModified': {"$gt": ratesDate}}))

    geoCountryQuery = "select distinct iso_alpha2, name, iso_alpha3, fips_code from geo_country order by name"
    # geoStateQuery = "select ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid"
    geoStateQuery = "select gc.iso_alpha2, ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid inner join geo_country gc on ltrim(rtrim(ga1.code)) like '%'+ ltrim(rtrim(gc.iso_alpha2))+'.' + '%'"
    geoCityQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPlaceName, fLatitude, fLongitude from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPostalCode, fLatitude, fLongitude from GeoPostal  order by sPostalCode"
    # countryDictList = custom.create_sql_dict_list(geoCountryQuery, config.ConfigManager().geographicalDataConnstr)
    # stateDictList = custom.create_sql_dict_list(geoStateQuery, config.ConfigManager().geographicalDataConnstr)
    # cityDictList = custom.create_sql_dict_list(geoCityQuery, config.ConfigManager().geographicalDataConnstr)
    # zipCodeDictList = custom.create_sql_dict_list(geoZipCodeQuery, config.ConfigManager().geographicalDataConnstr)

    geoCountryDict = custom.create_geo_dict(
        geoCountryQuery,
        config.ConfigManager().geographicalDataConnstr, 'Country')
    geoStateDict = custom.create_geo_dict(
        geoStateQuery,
        config.ConfigManager().geographicalDataConnstr, 'State')
    geoCityDict = custom.create_geo_dict(
        geoCityQuery,
        config.ConfigManager().geographicalDataConnstr, 'City')
    geoZipCodeDict = custom.create_geo_dict(
        geoZipCodeQuery,
        config.ConfigManager().geographicalDataConnstr, 'zipCode')

    cleanUpListDict = data_cleanup_lists()

    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    objectid = ratesConfigValues[0]['_id']
    lastDateTime = ratesConfigValues[0]['masterAutomationStartDate']
    oldDate = datetime.datetime.strptime(lastDateTime, '%Y-%m-%d')

    mapping_dict = dcrgraphcompactor.load_node_dict()
    edge_int_dict = dcrgraphcompactor.get_normalized_dictionary_from_int_edges(
    )

    neighborCount = dcrgraph.neighbor_count_for_edge_weight()
    diminition_percent = dcrconfig.ConfigManager().DiminitionPercentage
    # while ratesDate < ratesDataDateMax:
    while ratesDataCount > 0:
        ratesConfigValues = ratesConfig.find({})
        ratesDate = ratesConfigValues[0]['stagingDateModified']
        # countTotalRecords = stagingcoll.count({'dateModified': {"$gt": ratesDate}})
        stepSize = int(config.ConfigManager().StagingMasterTransferStep)

        # if countTotalRecords < stepSize:
        # stepSize = countTotalRecords
        if ratesDataCount < stepSize:
            stepSize = ratesDataCount
        ratesDataCount = ratesDataCount - stepSize

        ratesData = stagingcoll.find(
            {
                'dateModified': {
                    "$gt": ratesDate
                }
            }, no_cursor_timeout=True).sort([
                ('dateModified', 1)
            ]).limit(int(config.ConfigManager().StagingMasterTransferStep))
        doc_id = ratesConfigValues[0]['masterDocId']
        dataList = []
        dateModifiedList = []

        i = 0
        for row in ratesData:
            try:

                dateModifiedList.append(row['dateModified'])
                i += 1
                print(i)
                del row['_id']
                doc_id += 1
                row['doc_id'] = doc_id
                # print('Start data clean ' + str(datetime.datetime.now()))
                # "Step:1 data scrubbing for email,phone,url and candidate name"
                row = dataclean(row, cleanUpListDict)
                # print('Start noun phrase gen ' + str(datetime.datetime.now()))
                # "Step:2 nounphrases generation"
                row = generatenounphrases(row)
                # print('Start signature graph ' + str(datetime.datetime.now()))
                # "Step:3 signature generation"
                row = signaturegraph(row, mapping_dict, edge_int_dict,
                                     neighborCount, diminition_percent)
                # print('Start rates calculation ' + str(datetime.datetime.now()))
                # "Step:4 rates calculation"
                row = rates_calculation.billratescalculation(row)
                # print('Start rate available ' + str(datetime.datetime.now()))
                # Put rate value calculation before this check
                # "Step:5 verification of rate availability"
                row = rate_available(row)
                # print('Start geo verify ' + str(datetime.datetime.now()))
                # geographical data check and additions
                row['iso_alpha2_value'] = ')(*&^'
                row['admin1_value'] = ')(*&^'
                row['state_name'] = ')(*&^'
                row = custom.geo_data_verify(row, geoCountryDict, 'country')
                row = custom.geo_data_verify(row, geoStateDict, 'state')
                row = custom.geo_data_verify(row, geoCityDict, 'city')
                row = custom.geo_data_verify(row, geoZipCodeDict, 'zipCode')
                del row['iso_alpha2_value']
                del row['admin1_value']
                del row['state_name']

                # print('Stop geo verify ' + str(datetime.datetime.now()))
                dataList.append(row)

            except BaseException as ex:
                utility.log_exception_file(ex, config.ConfigManager().LogFile)

        ratesData.close()
        del ratesData

        if dataList:
            # Step:6 insert data to db
            mastercoll.insert(dataList)
            doc_id = row['doc_id']

        todayDate = datetime.date.today()
        todayDate = datetime.datetime.strptime(str(todayDate), '%Y-%m-%d')
        delta = todayDate - oldDate
        days = delta.days

        if dateModifiedList:
            ratesDate = max(dateModifiedList)
        # Step:7 update config collection with doc_id and datetime
        updateconfigcollection(doc_id, ratesDate, objectid)

        if int(days) >= int(5):
            break
def stcandidate_update():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'in stcandidates update currencyCode running!' + ' ' +
        str(datetime.datetime.now()))
    recordnumber = 0
    query = custom.fetch_query(
        config.ConfigManager().STCandidateCurrencyQueryId)
    print(query)
    query = custom.query_variable_replace(
        query,
        config.ConfigManager().STCandidateCurrencyDetails,
        config.ConfigManager().ST)
    print(query)
    cursor = dbmanager.cursor_odbc_connection(config.ConfigManager().STConnStr)
    db_data_dict = dbmanager.cursor_execute(cursor, query)
    db_data = db_data_dict['dbdata']
    db_data_cursorexec = db_data_dict['cursor_exec']
    cursor_description = db_data_cursorexec.description
    column_headers = [column[0] for column in cursor_description]
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    data_dict1 = {}
    req_list = []
    candidateDatesList = []

    for row1 in db_data:
        try:
            print(data_dict1)
            strtimestamp = str(datetime.datetime.now())
            recordnumber += 1
            print(recordnumber)
            data_dict1 = dict(utility.zip_list(column_headers, row1))
            STCandidateCollection.update(
                {
                    "$and": [{
                        "candidateid": data_dict1['candidateid']
                    }, {
                        "requirementRateStatusList": {
                            "$elemMatch": {
                                "requirementId": data_dict1['requirementid']
                            }
                        }
                    }]
                }, {
                    "$set": {
                        "requirementRateStatusList.$.currencyCode":
                        data_dict1['currencycode'],
                        "requirementRateStatusList.$.SupplierCurrencyCode":
                        data_dict1['SupplierCurrencyCode'],
                        "requirementRateStatusList.$.supplierRegBillRateEX":
                        str(data_dict1['supplierRegBillRateEX'])
                    }
                })
            candidateDatesList.append(data_dict1['dateCreated'])
        except BaseException as ex:
            print(ex)
            utility.log_exception_file(ex, config.ConfigManager().LogFile)
    if 'dateCreated' in data_dict1:
        maxCandDate = max(candidateDatesList)
        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet = utility.clean_dict()
        UpdateTemplateWhere['_id'] = configdocs[0]['_id']
        print(maxCandDate)
        print(str(maxCandDate))
        UpdateTemplateSet['STCandidateCurrencyCodeLastDate'] = str(maxCandDate)
        DBSet = utility.clean_dict()
        DBSet['$set'] = UpdateTemplateSet
        custom.update_data_to_Db_noupsert(
            int(config.ConfigManager().MongoDBPort),
            config.ConfigManager().DataCollectionDB,
            config.ConfigManager().ConfigCollection, UpdateTemplateWhere,
            DBSet, connection)
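stcandidate_update() combines $elemMatch in the filter with the positional "$" operator in the update path, so the matched entry of requirementRateStatusList is the one rewritten. A minimal sketch of that pattern with illustrative values and the pymongo 3+ update_one call, assuming the same STCandidateCollection:

STCandidateCollection.update_one(
    {
        "candidateid": 12345,  # illustrative id
        "requirementRateStatusList": {
            "$elemMatch": {"requirementId": 678}  # illustrative id
        },
    },
    {
        # "$" refers to the array element matched by $elemMatch above
        "$set": {"requirementRateStatusList.$.currencyCode": "USD"}
    },
)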