def generate_search_nodes(text):
    """Print the phrase graph and the integer nodes derived from ``text``.

    Noun phrases are extracted from ``text``, a graph is built from them via
    ``dcrgraph.create_graph_distant_neighbors`` and printed, and the graph is
    then mapped to integer nodes using the loaded node dictionary. The nodes
    are written to stdout as a space-separated list; nothing is returned.
    """
    phrases = dcrnlp.extract_nounphrases_sentences(text)
    node_lookup = dcrgraphcompactor.load_node_dict()

    phrase_graph = dcrgraph.create_graph_distant_neighbors(phrases)
    dcrgraph.print_graph(phrase_graph)

    int_nodes = dcrgraph.generate_document_integer_nodes(
        node_lookup, phrase_graph)
    rendered = ' '.join(map(str, int_nodes))
    print('\ninput.nodes = ' + rendered)
def signaturegraph(row):
    """Attach a signature graph to ``row`` and mark it as generated.

    Reads ``row['nounPhrases']``, loads the node dictionary and the
    normalized edge dictionary, and stores the result of
    ``generate_document_graphs_from_dict_list_savetodb`` under
    ``row['signGraph']``. ``row['signGraphFlag']`` is set to 1.

    Returns the same ``row`` object, mutated in place.
    """
    # Read the phrases first so a row without them fails before any
    # dictionary is loaded.
    phrases = row['nounPhrases']
    node_lookup = dcrgraphcompactor.load_node_dict()
    edge_lookup = dcrgraphcompactor.get_normalized_dictionary()

    row['signGraph'] = (
        dcrgraphcompactor.generate_document_graphs_from_dict_list_savetodb(
            node_lookup, edge_lookup, phrases))
    row['signGraphFlag'] = 1
    return row
def integer_edges_from_dict(graph):
    """Return the edges of ``graph`` as ``[low_id, high_id, weight]`` lists.

    Each endpoint is translated through the loaded node dictionary and the
    two ids are ordered so the smaller one comes first. ``weight`` is the
    edge's ``'weight'`` attribute converted to ``int``.
    """
    node_ids = dcrgraphcompactor.load_node_dict()
    int_edges = []
    for src, dst, attrs in graph.edges_iter(data=True):
        edge_weight = int(attrs['weight'])
        low, high = node_ids[src], node_ids[dst]
        if low > high:
            # Keep the smaller id first.
            low, high = high, low
        int_edges.append([low, high, edge_weight])
    return int_edges
#!/usr/bin/python3.4
# Generates integer graphs for documents from a phrase file
# Reads a graph from a predefined file and optimizes
# by converting it into integer nodes
import networkx as nx
import dcrgraphcompactor
import dcrconfig
import utility
import datetime

# Script entry point: log the step, load the node and edge dictionaries,
# then write the integer document graphs.
if __name__ == "__main__":
    log_path = dcrconfig.ConfigManager().SemanticGraphLogFile
    log_line = (
        'Semantic graph Generation Step 10..! (dcrdocumentintgraphgenerator.py) '
        + str(datetime.datetime.now()))
    utility.write_to_file(log_path, 'a', log_line)

    mapping_dict = dcrgraphcompactor.load_node_dict()
    # The edge dictionary is built from the integer edges here, not via
    # dcrgraphcompactor.get_normalized_dictionary().
    edge_int_dict = (
        dcrgraphcompactor.get_normalized_dictionary_from_int_edges())

    print('Saving Integer Document Graphs...')
    dcrgraphcompactor.generate_document_graphs(mapping_dict, edge_int_dict)
    print("Successfully Completed.!")
def _load_geo_lookup_dicts():
    """Build the country/state/city/zip lookup dicts for geo verification.

    Returns a list of ``(lookup_dict, level)`` pairs in the order rows are
    verified; ``level`` is the third argument later passed to
    ``custom.geo_data_verify``.
    """
    conn_str = config.ConfigManager().geographicalDataConnstr

    country_query = (
        "select distinct iso_alpha2, name, iso_alpha3, fips_code "
        "from geo_country order by name")
    state_query = (
        "select gc.iso_alpha2, ga1.code, ga1.name, gn.admin1, gn.latitude, "
        "gn.longitude from geo_admin1 ga1 "
        "inner join geo_name gn on ga1.geonameid = gn.geonameid "
        "inner join geo_country gc on ltrim(rtrim(ga1.code)) "
        "like '%'+ ltrim(rtrim(gc.iso_alpha2))+'.' + '%'")
    city_query = (
        "select distinct sAdminName1, sAdminCode1, sCountryCode, sPlaceName, "
        "fLatitude, fLongitude from GeoPostal order by sPlaceName")
    zip_query = (
        "select distinct sAdminName1, sAdminCode1, sCountryCode, sPostalCode, "
        "fLatitude, fLongitude from GeoPostal order by sPostalCode")

    # (query, kind given to create_geo_dict, level given to geo_data_verify)
    sources = [
        (country_query, 'Country', 'country'),
        (state_query, 'State', 'state'),
        (city_query, 'City', 'city'),
        (zip_query, 'zipCode', 'zipCode'),
    ]
    return [
        (custom.create_geo_dict(query, conn_str, kind), level)
        for query, kind, level in sources
    ]


def _enrich_staging_row(row, cleanup_lists, graph_args, geo_lookups):
    """Run one staging row through cleaning, graph, rate and geo steps.

    ``graph_args`` are the extra positional arguments for ``signaturegraph``
    and ``geo_lookups`` is the list returned by ``_load_geo_lookup_dicts``.
    Returns the processed row; any exception propagates to the caller.
    """
    # Step 1: data scrubbing for email, phone, url and candidate name
    row = dataclean(row, cleanup_lists)
    # Step 2: noun phrase generation
    row = generatenounphrases(row)
    # Step 3: signature generation
    row = signaturegraph(row, *graph_args)
    # Step 4: rates calculation (must run before the availability check)
    row = rates_calculation.billratescalculation(row)
    # Step 5: verification of rate availability
    row = rate_available(row)

    # Geographical data check and additions. The three keys below are set to
    # a placeholder before verification and removed afterwards.
    # NOTE(review): presumably geo_data_verify uses them as scratch values
    # carried from one level to the next - confirm against custom.
    scratch_keys = ('iso_alpha2_value', 'admin1_value', 'state_name')
    for key in scratch_keys:
        row[key] = ')(*&^'
    for lookup, level in geo_lookups:
        row = custom.geo_data_verify(row, lookup, level)
    for key in scratch_keys:
        del row[key]
    return row


def readstagingdata():
    """Transfer new staging rows to the master collection in batches.

    Rows of ``stagingcoll`` whose ``dateModified`` is newer than the
    ``stagingDateModified`` stored in ``ratesConfig`` are read in ascending
    ``dateModified`` order, at most ``StagingMasterTransferStep`` per batch.
    Each row gets a sequential ``doc_id``, is enriched by
    ``_enrich_staging_row`` and the batch is inserted into ``mastercoll``.
    After every batch the config collection is updated with the last
    ``doc_id`` and the newest ``dateModified`` seen.

    A row that raises an ordinary exception is logged and skipped; its
    ``doc_id`` is still consumed. The loop stops when the initially counted
    rows are used up, or after a batch if ``masterAutomationStartDate`` is
    five or more days in the past.
    """
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging dataread running' + ' ' + str(datetime.datetime.now()))

    # Fetch the config document once per use site, not once per field.
    config_doc = ratesConfig.find({})[0]
    ratesDataCount = stagingcoll.count(
        {'dateModified': {"$gt": config_doc['stagingDateModified']}})

    geo_lookups = _load_geo_lookup_dicts()
    cleanUpListDict = data_cleanup_lists()

    # Read again after the lookup tables are built, at the same point as
    # before, so this snapshot is not older than necessary.
    config_doc = ratesConfig.find({})[0]
    objectid = config_doc['_id']
    start_date = datetime.datetime.strptime(
        config_doc['masterAutomationStartDate'], '%Y-%m-%d').date()

    # Loaded once and shared by every row's signature-graph step.
    graph_args = (
        dcrgraphcompactor.load_node_dict(),
        dcrgraphcompactor.get_normalized_dictionary_from_int_edges(),
        dcrgraph.neighbor_count_for_edge_weight(),
        dcrconfig.ConfigManager().DiminitionPercentage,
    )

    while ratesDataCount > 0:
        config_doc = ratesConfig.find({})[0]
        ratesDate = config_doc['stagingDateModified']
        doc_id = config_doc['masterDocId']

        transfer_step = int(config.ConfigManager().StagingMasterTransferStep)
        ratesDataCount -= min(transfer_step, ratesDataCount)

        # NOTE(review): the next batch starts strictly after the newest
        # dateModified of this one, so rows sharing that exact timestamp but
        # cut off by the limit would be skipped - confirm this is acceptable.
        ratesData = stagingcoll.find(
            {'dateModified': {"$gt": ratesDate}},
            no_cursor_timeout=True
        ).sort([('dateModified', 1)]).limit(transfer_step)

        dataList = []
        dateModifiedList = []
        try:
            for row in ratesData:
                try:
                    dateModifiedList.append(row['dateModified'])
                    # Progress output: number of rows seen in this batch.
                    print(len(dateModifiedList))
                    del row['_id']
                    doc_id += 1
                    row['doc_id'] = doc_id
                    dataList.append(_enrich_staging_row(
                        row, cleanUpListDict, graph_args, geo_lookups))
                except Exception as ex:
                    # Best effort per row: log and move on. Exception, not
                    # BaseException, so Ctrl-C and SystemExit still stop
                    # the job.
                    utility.log_exception_file(
                        ex, config.ConfigManager().LogFile)
        finally:
            # The cursor has no server-side timeout, so always close it,
            # including when iteration itself fails.
            ratesData.close()

        if dataList:
            # Insert the processed batch into the master collection.
            # NOTE(review): if mastercoll is a PyMongo collection, insert()
            # is deprecated in PyMongo 3 and removed in 4 - consider
            # insert_many() once the driver version is confirmed.
            mastercoll.insert(dataList)

        days_running = (datetime.date.today() - start_date).days
        if dateModifiedList:
            ratesDate = max(dateModifiedList)

        # Persist progress. doc_id is the running counter, so this no longer
        # depends on the last iterated row still carrying a 'doc_id' key.
        updateconfigcollection(doc_id, ratesDate, objectid)

        if days_running >= 5:
            break