def merge_int_edges(edge_count_max):
    """Merge the freshly generated edge file into the existing integer edges.

    For every edge in ``int_edges_new`` (format ``<id> <node1> <node2> <weight>``):
      * if the (node1, node2) pair already exists in the current
        ``int_edges`` file, its existing id is kept and only the weight is
        replaced with the new one;
      * otherwise the edge is assigned a fresh id (``edge_count_max`` + 1, ...).

    The merged lines are written to ``int_edges_temp`` in the order the new
    edges appear.

    edge_count_max -- highest edge id already used in the existing file.

    NOTE(review): the original re-opened and scanned the whole existing edge
    file once per new line (O(n*m)).  The existing edges are now loaded into
    a dict once, which preserves the output for well-formed files (one line
    per distinct edge pair).  A dead local dict that was never returned has
    been removed.
    """
    int_edges_path = dcrconfig.ConfigManager().IntegerEdegesFile
    # Load the existing edges once: (node1, node2) -> edge id.
    existing_edges = {}
    with open(int_edges_path, 'r') as edge_file:
        for each_line in edge_file:
            parts = each_line.split(' ')
            existing_edges[(int(parts[1]), int(parts[2]))] = int(parts[0])

    temp_path = int_edges_path.replace('int_edges', 'int_edges_temp')
    new_path = int_edges_path.replace('int_edges', 'int_edges_new')
    with open(temp_path, 'w') as edge_file_temp, \
            open(new_path, 'r') as edge_file_new:
        for each_line_new in edge_file_new:
            parts = each_line_new.split(' ')
            edge1_new = int(parts[1])
            edge2_new = int(parts[2])
            edge_weight_new = int(parts[3])
            key = (edge1_new, edge2_new)
            if key in existing_edges:
                # Known edge: keep its id, refresh the weight.
                print('%d %d %d %d' % (existing_edges[key], edge1_new,
                                       edge2_new, edge_weight_new),
                      file=edge_file_temp)
            else:
                # Unknown edge: assign the next free id.
                edge_count_max += 1
                print('%d %d %d %d' % (edge_count_max, edge1_new,
                                       edge2_new, edge_weight_new),
                      file=edge_file_temp)
def nounphrase_generate():
    """Queue Intelligence documents that still need noun phrases extracted.

    Finds every document in the Intelligence collection whose ``nounPhrases``
    field is empty and puts a work item (description, ids and a shared Mongo
    connection) on the module-level queue ``q`` for worker threads.
    """
    c = MongoClient(dcrconfig.ConfigManager().Datadb)
    db = c[config.ConfigManager().IntelligenceDb]
    col = db[config.ConfigManager().IntelligenceDataCollection]
    # Project only the fields the workers actually need.
    docs = col.find({'nounPhrases': ""},
                    {"description": 1, "doc_id": 1, "_id": 1})
    mongoport = int(config.ConfigManager().MongoDBPort)
    connection = dbmanager.mongoDB_connection(mongoport)
    for doc in docs:
        try:
            data = {}
            data['desc'] = doc['description']
            data['_id'] = doc['_id']
            data['doc_id'] = doc['doc_id']
            data['connection'] = connection
            q.put(data)
        except BaseException as ex:
            # Bug fix: the timestamp used to be a discarded bare expression
            # statement, so it never made it into the log message.
            exception_message = '\n' + 'Exception:' + '\n'
            exception_message += str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(
                dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
                exception_message)
def generate_document_graphs(dict, edge_dict):
    """Emit an integer graph per document from the distinct phrase file.

    dict:      phrase -> integer node id mapping (NOTE(review): parameter
               shadows the builtin ``dict``; kept for caller compatibility).
    edge_dict: (node1, node2) -> integer edge id mapping.

    NOTE(review): phrase lines occurring before the first ``--`` marker
    would hit an unbound ``doc`` — presumably the file always starts with a
    marker; confirm against the phrase-file writer.
    """
    # Loop thru all phrase files and generate the integer graph
    phrase_file = open(dcrconfig.ConfigManager().DistinctPhraseFile, 'r')
    jdcount = 0
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    for line in phrase_file:
        line = line.strip()
        if (line.startswith('--')):
            # If the line starts with -- then it is job descriptin begenning
            # So print a dot indicate the progress
            print('.', end='')
            sys.stdout.flush()
            # Remember the current document marker for the integer graph.
            doc = line.strip()
        if not (line.startswith('--') or len(line.strip()) < 1):
            graph = dcrgraph.create_graph_distant_neighbors(line,
                                                            graph_weight)
            graph = dcrgraph.generate_document_integer_graph(dict, graph,
                                                             doc, edge_dict)
            jdcount += 1
            # Progress counter every 10 documents.
            if jdcount % 10 == 0:
                print('%d' % jdcount)
def create_document_graph_distant_neighbors(phrase_string, neighborCount,
                                            diminition_percent,
                                            edge_weight=1):
    """Build a phrase graph whose edge weights decay with phrase distance.

    phrase_string      -- '.'-separated sentences of '|'-separated phrases.
    neighborCount      -- max phrase distance that still gets an edge.
    diminition_percent -- per-hop decay applied to the configured weight.
    edge_weight        -- NOTE(review): this parameter is overwritten from
                          config before first use, so the passed value is
                          effectively ignored; kept for caller compatibility.

    Returns the base co-occurrence graph unioned with the distance-weighted
    neighbor graph.
    """
    phrase_sentences = phrase_string.split('.')
    base_graph = create_graph(phrase_string.replace('.', ''))
    neighbor_sensitive_graph = nx.Graph()
    for sent in phrase_sentences:
        ph = sent.split('|')
        # Drop tiny fragments (<= 2 chars) produced by splitting.
        phrases = [s for s in ph if len(s) > 2]
        neighbor_sensitive_graph.add_nodes_from(phrases)
        # All ordered pairs; only i < j pairs are used below.
        neighborPhrasesList = list(product(enumerate(phrases), repeat=2))
        edge_weight = dcrconfig.ConfigManager().GraphEdgeWeight
        for neighbor in neighborPhrasesList:
            if (neighbor[0])[0] < (neighbor[1])[0]:
                neighborDistance = (neighbor[1])[0] - (neighbor[0])[0]
                if neighborDistance <= neighborCount:
                    if neighborDistance == 1:
                        # Adjacent phrases get the full configured weight.
                        edge_weight = dcrconfig.ConfigManager().GraphEdgeWeight
                    else:
                        # Geometric decay per extra hop of distance.
                        edge_weight = math.floor(
                            dcrconfig.ConfigManager().GraphEdgeWeight *
                            (diminition_percent / 100)**(neighborDistance - 1))
                    neighbor_sensitive_graph.add_edge((neighbor[0])[1],
                                                      (neighbor[1])[1],
                                                      weight=edge_weight)
    union_graph(base_graph, neighbor_sensitive_graph)
    return base_graph
def st_create_graph_distant_neighbors(phrase_string, edge_weight=1):
    """SmartTrack variant: phrase graph with distance-decayed edge weights.

    Splits *phrase_string* into '.'-separated sentences of '|'-separated
    phrases, connects every phrase pair within a sentence, and decays the
    weight geometrically with phrase distance until it drops below 1.
    The ``edge_weight`` argument is reset from config per phrase and is
    kept only for caller compatibility.
    """
    whole_graph = create_graph(phrase_string.replace('.', ''))
    distance_graph = nx.Graph()
    decay_percent = dcrconfig.ConfigManager().STDiminitionPercentage
    for sentence in phrase_string.split('.'):
        # Ignore tiny fragments (<= 2 chars) produced by the split.
        tokens = [p for p in sentence.split('|') if len(p) > 2]
        distance_graph.add_nodes_from(tokens)
        token_count = len(tokens)
        for left in range(token_count - 1):
            # Fresh configured weight for each anchor phrase.
            weight = dcrconfig.ConfigManager().STGraphEdgeWeight
            for right in range(left + 1, token_count):
                distance_graph.add_edge(tokens[left], tokens[right],
                                        weight=weight)
                # Decay the weight by the configured percentage per hop.
                weight = math.floor(weight * decay_percent / 100)
                # Nothing useful left to add once the weight hits zero.
                if weight < 1:
                    break
    union_graph(whole_graph, distance_graph)
    return whole_graph
def neighbor_count_for_edge_weight(edge_weight=None, diminition_percent=None):
    """Return how many decay steps an edge weight survives before reaching 0.

    Starting from ``edge_weight``, the weight is repeatedly multiplied by
    ``diminition_percent / 100`` and floored; counting stops after the step
    on which it drops below 1 (that step is included in the count).

    Both arguments default to the configured GraphEdgeWeight /
    DiminitionPercentage values, so existing no-argument callers are
    unaffected; passing them explicitly makes the function reusable and
    unit-testable.
    """
    if edge_weight is None:
        edge_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    if diminition_percent is None:
        diminition_percent = dcrconfig.ConfigManager().DiminitionPercentage
    neighborCount = 0
    while True:
        neighborCount += 1
        edge_weight = math.floor(edge_weight * diminition_percent / 100)
        if edge_weight < 1:
            break
    return neighborCount
def save_node_dict():
    """Rebuild and persist the phrase -> integer-id node dictionary.

    Loads the semantic graph, extends the previously saved node dictionary
    with any new graph nodes (keeping existing ids stable) and pickles the
    result to the configured node file.

    Bug fix: the pickle file handle used to be left open
    (``pickle.dump(…, open(…, 'wb'))``); a with-block now guarantees it is
    flushed and closed.
    """
    import pickle
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)
    # create an integer mapping for each of the phrases
    # mapping_dict = dcrgraph.create_node_dictionary(graph.nodes())
    old_node_dict = load_node_dict()
    mapping_dict = dcrgraph.append_node_dictionary(graph.nodes(),
                                                   old_node_dict)
    with open(dcrconfig.ConfigManager().NodeFile, 'wb') as node_file:
        pickle.dump(mapping_dict, node_file)
    print('Saving nodes completed')
def automate_processes():
    """Run the full knowledge-build pipeline end to end.

    Each stage is a stand-alone script executed in this interpreter via
    ``exec(open(...).read(), globals())`` so all stages share this module's
    globals.  Stage order matters: phrase extraction -> n-gram filtering ->
    dedup -> graph generation -> compaction -> node save, first for the main
    corpus and then again for the SmartTrack (st*) corpus, followed by the
    file transfer.  Any failure is logged and aborts the remaining stages.

    NOTE(review): ``exec`` here runs trusted local scripts only; do not point
    it at untrusted paths.
    """
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Knowledge build automation running..! ' +
        str(datetime.datetime.now()))
    try:
        # Copies files from the previous cycle
        exec(open('filecopy.py').read(), globals())
        # Copy the noun phrase text from Mongo DB
        exec(open('dbtophrasefile.py').read(), globals())
        # Remove ngram anything above 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Checks if there is an existing semantic graph, if yes load and update
        # with new documents else create a new semantic graph and store.
        # Normally, this is run after n gram removal and duplicate
        # find and removal.
        exec(open('dcrgraphgenerator.py').read(), globals())
        # Read the semantic graph which is saved using dcrgraphgenerator.py
        # and read the document phrase file and create optimized integer
        # semantic edge file.
        exec(open('dcrgraphcompactor.py').read(), globals())
        # Save the node dictionary using pickle to file. This will be used by
        # above programs for finding node ids
        exec(open('savenodes.py').read(), globals())
        # Generate document integer graph and store. This will be used for
        # searching the documents.
        # exec(open('dcrdocumentintgraphgenerator.py').read(), globals())
        # Copy the noun phrase text from Mongo DB (Intelligence collection)
        exec(open('stdbtophrasefile.py').read(), globals())
        # Remove ngram anything above 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Checks if there is an existing semantic graph, if yes load and update
        # with new documents else create a new semantic graph and store.
        # Normally, this is run after n gram removal and duplicate
        # find and removal.
        exec(open('stdcrgraphgenerator.py').read(), globals())
        # Read the semantic graph which is saved using dcrgraphgenerator.py
        # and read the document phrase file and create optimized integer
        # semantic edge file.
        exec(open('stdcrgraphcompactor.py').read(), globals())
        # Save the node dictionary using pickle to file. This will be used by
        # above programs for finding node ids
        exec(open('savenodes.py').read(), globals())
        # Transfer generated intelligence files
        exec(open('filetransfer.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(
            ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
def generate_nodes(edge_file_path=None, node_file_path=None):
    """Extract the distinct integer node ids from the semantic edge file.

    Every edge-file line has the format ``<edge_id> <node1> <node2> <weight>``
    (as written by merge_int_edges_with_dict / append_edges_file); the two
    node columns are collected into a set and written one per line.

    Both paths default to the configured files so existing ``generate_nodes()``
    callers are unaffected.

    Bug fix: the original collected columns 0 and 1 — the *edge id* and the
    first node — instead of the two node columns 1 and 2, so edge ids leaked
    into the node file.  Files are also now closed via with-blocks.
    """
    if edge_file_path is None:
        edge_file_path = dcrconfig.ConfigManager().IntegerEdegesFile
    if node_file_path is None:
        node_file_path = dcrconfig.ConfigManager().IntegerNodesFile
    nodes = set()
    with open(edge_file_path, 'r') as semantic_edge_file:
        for line in semantic_edge_file:
            words = line.split()
            nodes.add(words[1])
            nodes.add(words[2])
    print('Saving integer nodes to file ...')
    with open(node_file_path, 'w') as node_file:
        for node in nodes:
            print("%s" % node, file=node_file)
def generate_document_graph_images():
    """Render an image per document from the document edge file.

    Accumulates the edge rows of each document (delimited by ``--`` marker
    lines) and hands them to generate_graph_image(); stops after 50
    documents and periodically closes matplotlib figures to cap memory.

    NOTE(review): statement nesting reconstructed from collapsed source —
    confirm that the jdcount/doc updates belong to the marker branch.
    """
    # Loop thru all phrase files and generate the integer graph
    phrase_file = open(dcrconfig.ConfigManager().DocumentsEdgesIntegerFile,
                       'r')
    jdcount = 0
    doc = ''
    node_collection = []
    for line in phrase_file:
        line = line.strip()
        if (line.startswith('--')):
            # If the line starts with -- then it is job descriptin begenning
            # So print a dot indicate the progress
            print('.', end='')
            sys.stdout.flush()
            # Flush the previous document's rows before starting a new one.
            if node_collection:
                generate_graph_image(node_collection, doc)
                node_collection = []
            jdcount += 1
            doc = line.strip()
        if not (line.startswith('--') or len(line.strip()) < 1):
            node_collection.append(line.split(' '))
        if jdcount > 50:
            # Hard cap: only render the first 50 documents.
            break
        elif jdcount % 20 == 0:
            # Free matplotlib figures periodically to limit memory use.
            plt.close('all')
def generate_req_candidate_file_selected_req(req_list):
    """(Re)create the candidate file for each distinct requirement id.

    Fetches all candidates (id + requirement list) once, then for every
    distinct requirement in *req_list* truncates its file under the
    SmartTrack directory and writes the matching candidates' edge data.
    """
    all_candidates = list(
        candidates.find({}, {
            "candidateid": 1,
            "requirementIDList": 1
        }))
    base_dir = dcrconfig.ConfigManager().SmartTrackDirectory
    # Remove the duplicates if it is coming from another list
    for req in set(req_list):
        # File name is the requirement Id with the path from config
        target_path = base_dir + str(req)
        # Clear the file. This can be changed to append if only the new
        # candidates are picked in the candidate list.
        open(target_path, 'w').close()
        # Find candidates for the requirement and generate the
        # req candidate file.
        matching = find_candidates(all_candidates, req)
        generate_req_candidate_file_edge_dict_from_file(target_path,
                                                        matching)
def generate_document_integer_graph(integer_dict, document_graph, document,
                                    edge_dict, file_operation='a',
                                    edge_file_path=''):
    """Convert a phrase graph into integer form and write it to an edge file.

    integer_dict   -- phrase -> integer node id mapping.
    document_graph -- phrase graph for one document.
    document       -- marker string written before the edge lines.
    edge_dict      -- (node1, node2) -> integer edge id; edges missing from
                      it are skipped.
    file_operation -- 'a' appends to the shared document edge file (default).
    edge_file_path -- override target file; empty selects the configured one.

    Returns the integerized graph.
    """
    # Drop phrases without an integer id, then relabel the remainder.
    pruned_graph = remove_missing_nodes(integer_dict, document_graph)
    int_graph = relabel_nodes(pruned_graph, integer_dict)
    if edge_file_path == '':
        edge_file_path = dcrconfig.ConfigManager().DocumentsEdgesIntegerFile
    with open(edge_file_path, file_operation) as edge_file:
        print(document, file=edge_file)
        for node_a, node_b, attrs in int_graph.edges(data=True):
            # Normalize so the smaller node id always comes first; this is
            # the key convention used by edge_dict.
            lo, hi = (node_a, node_b) if node_a <= node_b else (node_b,
                                                                node_a)
            if (lo, hi) in edge_dict:
                print('%d %d %d %d' % (edge_dict[(lo, hi)], lo, hi,
                                       attrs['weight']), file=edge_file)
    return int_graph
def generate_document_signature_graph(dict, edge_dict, noun_phrases,
                                      neighborCount, diminition_percent):
    """Build the signature graph for one document's noun phrases.

    dict / edge_dict -- node-id and edge-id mappings used for signing.
    noun_phrases     -- '|'-separated phrase string for the document.
    neighborCount, diminition_percent -- currently unused; kept so existing
    callers do not break.
    """
    weight = dcrconfig.ConfigManager().GraphEdgeWeight
    phrase_graph = dcrgraph.create_graph_distant_neighbors_with_generator(
        noun_phrases, weight)
    return dcrgraph.graph_to_signature_graph(dict, phrase_graph, edge_dict)
def job_info_analysis(page, filepath, dbrecordcount):
    """Validate the job records in one crawled XML page and store the good ones.

    page          -- parsed XML element containing <record> children.
    filepath      -- source file path (currently only used by commented code).
    dbrecordcount -- running count of stored records; returned incremented.

    A record is accepted only if its job description exists, is non-empty,
    is at least 20 characters and is not truncated (does not end in '...').
    Accepted records are stamped with audit fields and bulk-inserted.
    """
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    dict_object_record_list = []
    for jobinfo in page.findall('record'):
        try:
            # creating dictionary from xml tag contents
            dict_object = utility.xml_to_dict(ET.tostring(jobinfo))
            # totaljobsdict = fill_job_by_site(filepath)
            # totalrecords += 1
            # outer if check is jobdescription tag is in the xml
            if 'jobdescription' in (dict_object['record']):
                # checking if job description is none
                if ((dict_object['record'])['jobdescription'] is not None):
                    incorrectjobdescription = 0
                    # Reject blank descriptions.
                    if (((dict_object['record'])['jobdescription']).strip()
                        ) == '':
                        incorrectjobdescription = 1
                    # Reject descriptions shorter than 20 characters.
                    if (len(((dict_object['record'])['jobdescription'])) <
                            20):
                        incorrectjobdescription = 1
                    # Reject truncated descriptions ending in '...'.
                    if (((dict_object['record'])['jobdescription']
                         ).strip()[-3:]) == '...':
                        incorrectjobdescription = 1
                    if (incorrectjobdescription == 0):
                        # Stamp audit fields before insertion.
                        (dict_object['record']
                         )['dateCreated'] = datetime.datetime.now()
                        (dict_object['record']
                         )['dateModified'] = datetime.datetime.now()
                        (dict_object['record'])['createdUser'] = '******'
                        (dict_object['record'])['modifiedUser'] = '******'
                        (dict_object['record'])['source'] = 'PromptCloud'
                        #(dict_object['record'])['Url'] = page['pageurl']
                        dict_object_record_list.append(dict_object['record'])
                        dbrecordcount += 1
        except BaseException as ex:
            utility.log_exception_file(
                ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    # Bulk insert everything that passed validation.
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
        # updating doc_id in config table
    return dbrecordcount
def create_edges_file_with_dict():
    """Recompute the integer edge file from the current semantic graph.

    Loads the previously saved integer edges, relabels the semantic graph's
    phrase nodes to stable integer ids, renumbers its edges, and merges both
    sets via merge_int_edges_with_dict() before regenerating the node file.

    Fixes over the original: the edge file is read via a with-block (it was
    previously closed twice, the second close being a no-op on an
    already-closed handle), and an unused ``edge_weight`` local was removed.
    """
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)
    # Load existing integer edges: (node1, node2) -> [edge id, weight].
    edge_int_dict_old = {}
    last_line = '0 '
    with open(dcrconfig.ConfigManager().IntegerEdegesFile, 'r') as edge_file:
        for each_line in edge_file:
            last_line = each_line
            parts = each_line.split(' ')
            edge_int_dict_old[(int(parts[1]), int(parts[2]))] = [
                int(parts[0]), int(parts[3])
            ]
    # Ids are written in ascending order, so the last line holds the max id.
    edge_count_old_max = int((last_line.split(' '))[0])
    # create an integer mapping for each of the phrases
    # mapping_dict = dcrgraph.create_node_dictionary(graph.nodes())
    old_node_dict = load_node_dict()
    mapping_dict = dcrgraph.append_node_dictionary(graph.nodes(),
                                                   old_node_dict)
    new_graph = nx.relabel_nodes(graph, mapping_dict)
    edge_int_dict = {}
    edge_count = 0
    # Loop thru the edges. Compare the nodes order the first node be greater
    # than the second one. This will help in compressing the graph
    for edge in new_graph.edges(data=True):
        edge1, edge2 = edge[0], edge[1]
        if edge1 > edge2:
            edge1, edge2 = edge2, edge1
        edge_count += 1
        edge_int_dict[(edge1, edge2)] = [edge_count, int(edge[2]['weight'])]
    merge_int_edges_with_dict(edge_count_old_max, edge_int_dict_old,
                              edge_int_dict)
    generate_nodes()
def get_normalized_dictionary_from_int_edges(edge_file_path=None):
    """Load the integer edge file into a {(node1, node2): edge_id} dict.

    Each line has the format ``<edge_id> <node1> <node2> <weight>``; the
    weight column is ignored here.  The path defaults to the configured
    IntegerEdegesFile so existing no-argument callers are unaffected.

    Bug fix: the file handle was never closed; a with-block now guarantees
    it is released.
    """
    if edge_file_path is None:
        edge_file_path = dcrconfig.ConfigManager().IntegerEdegesFile
    edge_int_dict = {}
    with open(edge_file_path, 'r') as edge_file:
        for each_line in edge_file:
            parts = each_line.split(' ')
            edge_int_dict[(int(parts[1]), int(parts[2]))] = int(parts[0])
    return edge_int_dict
def update_graph():
    '''Load the existing graph and update with new set of job description
    from predefined locations based on the application.ini file'''
    semantic_graph = load_graph()
    phrase_file = open(dcrconfig.ConfigManager().DistinctPhraseFile, 'r')
    '''Get the config values'''
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    graph_filter_weight = dcrconfig.ConfigManager().FilterGraphEdgeWeight
    print("weight:%d filter weight: %d" % (graph_weight,
                                           graph_filter_weight))
    # graph_collection = []
    jdcount = 0
    for line in phrase_file:
        try:
            line = line.strip()
            # Phrase lines (neither marker nor blank) extend the graph.
            if not (line.startswith('--') or len(line.strip()) < 1):
                graph = dcrgraph.create_graph_distant_neighbors(
                    line, graph_weight)
                dcrgraph.union_graph(semantic_graph, graph, graph_weight)
                jdcount += 1
            elif (line.startswith('--')):
                ''' If the line starts with -- then it is job descriptin
                begenning So print a dot indicate the progress '''
                print('.', end='')
                if jdcount % 1000 == 0:
                    print('%d' % jdcount)
                sys.stdout.flush()
        except BaseException as ex:
            utility.log_exception_file(
                ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    # Collect the weights above the filter threshold, for the stats print.
    # NOTE(review): edges_iter() is the networkx 1.x API (removed in 2.x),
    # and max() below raises ValueError if no edge exceeds the threshold —
    # confirm the pinned networkx version before touching this.
    count = list((d['weight'])
                 for u, v, d in semantic_graph.edges_iter(data=True)
                 if d['weight'] > graph_filter_weight)
    ''' nx.write_gexf(semantic_graph,
    dcrconfig.ConfigManager().SemanticGraphFile)'''
    mx = max(d for d in count)
    print('mx : %d, total jd processed : %d ' % (mx, jdcount))
    print('Semantic Graph Info: %s' % nx.info(semantic_graph))
    return semantic_graph
def job_info_analysis_storage(page_dict_object, filepath, dbrecordcount):
    """Validate one already-parsed crawled page and store its record.

    page_dict_object -- dict with a 'page' entry holding the record dict
                        and its 'pageurl'.
    filepath         -- source file; stored on the record with the
                        PromptCloud folder prefix stripped.
    dbrecordcount    -- running count of stored records; returned incremented.

    Same acceptance rules as job_info_analysis(): description present,
    non-empty, >= 20 chars and not ending in '...'.
    """
    global totalrecords
    global invalidrecords
    global emptydesc
    global incompletedesc
    global smalldesc
    global nonedesc
    global nodesc
    global totaljobsdict
    global jobsitedict
    dict_object_record_list = []
    try:
        dict_object = page_dict_object['page']
        # outer if check is jobdescription tag is in the xml
        if 'jobdescription' in (dict_object['record']):
            # checking if job description is none
            if ((dict_object['record'])['jobdescription'] is not None):
                incorrectjobdescription = 0
                # Reject blank descriptions.
                if (((dict_object['record'])['jobdescription']).strip()
                    ) == '':
                    incorrectjobdescription = 1
                # Reject descriptions shorter than 20 characters.
                if (len(((dict_object['record'])['jobdescription'])) < 20):
                    incorrectjobdescription = 1
                # Reject truncated descriptions ending in '...'.
                if (((dict_object['record'])['jobdescription']
                     ).strip()[-3:]) == '...':
                    incorrectjobdescription = 1
                if (incorrectjobdescription == 0):
                    # Stamp audit fields before insertion.
                    (dict_object['record']
                     )['dateCreated'] = datetime.datetime.now()
                    (dict_object['record']
                     )['dateModified'] = datetime.datetime.now()
                    (dict_object['record'])['createdUser'] = '******'
                    (dict_object['record'])['modifiedUser'] = '******'
                    (dict_object['record'])['source'] = 'PromptCloud'
                    (dict_object['record'])['Url'] = dict_object['pageurl']
                    # Store the source file relative to the crawl folder.
                    (dict_object['record'])['fileName'] = filepath.replace(
                        config.ConfigManager().PCFileFolder + '/', '')
                    dict_object_record_list.append(dict_object['record'])
                    dbrecordcount += 1
    except BaseException as ex:
        utility.log_exception_file(
            ex, dcrconfig.ConfigManager().SemanticGraphLogFile)
    # Bulk insert everything that passed validation.
    if dict_object_record_list:
        insert_to_db(dict_object_record_list)
        # updating doc_id in config table
    return dbrecordcount
def load_graph():
    '''Load semantic graph if it is already present in the system'''
    # Return a fresh empty graph when no saved file exists yet.
    graph_path = dcrconfig.ConfigManager().SemanticGraphFile
    if not os.path.isfile(graph_path):
        print('No existing semantic graph found')
        return nx.Graph()
    print("File found")
    semantic_graph = nx.read_gexf(graph_path)
    print('Semantic Graph Info: %s' % nx.info(semantic_graph))
    return semantic_graph
def remove_ngram_from_allphrasefile():
    """Filter long n-grams out of the raw phrase file.

    Copies PhraseFile into NGramFilteredPhraseFile line by line: document
    marker lines (starting with ``--``) are copied through unchanged, every
    phrase line is passed through remove_ngram() first.

    Bug fix: both file handles were previously never closed; with-blocks now
    guarantee the filtered output is flushed to disk.
    """
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Semantic graph Generation Step 5..! (ngramremoval.py) ' +
        str(datetime.datetime.now()))
    # Loop thru all phrase files and generate the integer graph
    with open(dcrconfig.ConfigManager().PhraseFile, 'r') as phrase_file, \
            open(dcrconfig.ConfigManager().NGramFilteredPhraseFile,
                 'w') as ng_phrase_file:
        for line in phrase_file:
            line = line.strip()
            if (line.startswith('--')):
                # If the line starts with -- then it is job descriptin
                # beginning. So print a dot indicate the progress.
                print('.', end='')
                sys.stdout.flush()
                print(line, file=ng_phrase_file)
            # If the line doesn't start with -- or is not empty space
            if not (line.startswith('--') or len(line.strip()) < 1):
                print(remove_ngram(line), file=ng_phrase_file)
def merge_int_edges_with_dict(edge_count_max, edge_int_dict_old,
                              edge_int_dict, edge_file_path=None):
    """Merge old and new edge dicts and rewrite the integer edge file.

    edge_count_max    -- highest edge id already in use.
    edge_int_dict_old -- existing edges: (node1, node2) -> [edge id, weight].
    edge_int_dict     -- new edges, same shape.  Edges already known keep
                         their old id but take the new weight; unknown edges
                         get fresh ids.  Both input dicts are cleared after
                         merging (memory release for large graphs) — callers
                         must not rely on them afterwards.
    edge_file_path    -- output file; defaults to the configured
                         IntegerEdegesFile, keeping existing callers intact.

    Lines are written sorted by edge id as ``<id> <node1> <node2> <weight>``.

    Bug fix: the output handle was previously not closed via a context
    manager; a with-block now guarantees the rewrite is flushed.
    """
    if edge_file_path is None:
        edge_file_path = dcrconfig.ConfigManager().IntegerEdegesFile
    edge_int_dict_new = edge_int_dict_old.copy()
    for key in edge_int_dict:
        if key in edge_int_dict_old:
            # Known edge: keep the stable old id, adopt the new weight.
            edge_int_dict_new[key] = [(edge_int_dict_old[key])[0],
                                      (edge_int_dict[key])[1]]
        else:
            # New edge: assign the next free id.
            edge_count_max += 1
            edge_int_dict_new[key] = [edge_count_max,
                                      (edge_int_dict[key])[1]]
    edge_int_dict.clear()
    edge_int_dict_old.clear()
    with open(edge_file_path, 'w') as edge_file_new:
        for key in sorted(edge_int_dict_new.keys(),
                          key=lambda k: edge_int_dict_new[k][0]):
            print('%d %d %d %d' % ((edge_int_dict_new[key])[0], int(key[0]),
                                   int(key[1]), (edge_int_dict_new[key])[1]),
                  file=edge_file_new)
def generate_document_graphs_from_dict_list_savetodb(dict, edge_dict,
                                                     noun_phrases):
    """Build the integer graph for one document and persist it to the DB.

    dict         -- phrase -> integer node id mapping (shadows the builtin;
                    name kept for caller compatibility).
    edge_dict    -- (node1, node2) -> integer edge id mapping.
    noun_phrases -- the document's '|'-separated noun phrase string.

    Returns the integerized graph.

    Dead code removed: a ``jdcount`` counter that could never exceed 1 and
    its no-op ``jdcount % 10`` progress branch, left over from the batch
    variant of this function.
    """
    graph_weight = dcrconfig.ConfigManager().GraphEdgeWeight
    graph = dcrgraph.create_graph_distant_neighbors(noun_phrases,
                                                    graph_weight)
    graph = dcrgraph.generate_document_integer_graph_savetodb(dict, graph,
                                                              edge_dict)
    return graph
def append_edges_file():
    """Append the semantic graph's new integer edges to the edge file.

    Relabels only the *new* graph nodes (delta dictionary), reads the
    current max edge id from the file's last line, appends one line per new
    integer edge, then rewrites the file via a temp copy to strip stray
    blank lines and finally regenerates the node file.

    NOTE(review): the isinstance(int) filter skips any edge whose endpoints
    were not relabeled to integers (i.e. pre-existing nodes) — confirm this
    delta behavior is intended.
    """
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)
    # create an integer mapping for each of the phrases
    # mapping_dict = dcrgraph.create_node_dictionary(graph.nodes())
    old_node_dict = load_node_dict()
    mapping_dict = dcrgraph.delta_node_dictionary(graph.nodes(),
                                                  old_node_dict)
    # writer = csv.writer(open('/mnt/nlpdata/nodedict.csv', 'w'))
    # for key, value in mapping_dict.items():
    #     writer.writerow([key, value])
    new_graph = nx.relabel_nodes(graph, mapping_dict)
    print('Saving integer Semantic graph...')
    edge_file = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'r')
    print(dcrconfig.ConfigManager().IntegerEdegesFile)
    # Scan to the last line: ids are ascending, so it holds the max id.
    last_line = '0 '
    for each_line in edge_file:
        last_line = each_line
        # last_line = each_line
    edge_int_dict = {}
    edge_count = int((last_line.split(' '))[0])
    print(last_line)
    print(edge_count)
    edge_file.close()
    edge_file_append = open(dcrconfig.ConfigManager().IntegerEdegesFile, 'a')
    # Separate the appended block from the existing content.
    if new_graph.edges(data=True):
        print('%s' % (''), file=edge_file_append)
    # Loop thru the edges. Compare the nodes order the first node be greater
    # than the second one. This will help in compressing the graph
    for edge in new_graph.edges(data=True):
        if (isinstance(edge[0], int) and isinstance(edge[1], int)):
            edge1 = edge[0]
            edge2 = edge[1]
            if edge1 > edge2:
                edge1 = edge[1]
                edge2 = edge[0]
            edge_count += 1
            print(edge1, edge2)
            print(edge_count, edge1, edge2, edge[2]['weight'])
            print('%d %d %d %d' % (edge_count, edge1, edge2,
                                   edge[2]['weight']),
                  file=edge_file_append)
            edge_int_dict[(edge1, edge2)] = edge_count
    edge_file_append.close()
    # Round-trip through the temp file to strip blank/stray lines.
    rewrite_strip_edges_file(
        dcrconfig.ConfigManager().IntegerEdegesFile,
        (dcrconfig.ConfigManager().IntegerEdegesFile).replace(
            'int_edges', 'int_edges_temp'))
    rewrite_strip_edges_file(
        (dcrconfig.ConfigManager().IntegerEdegesFile).replace(
            'int_edges', 'int_edges_temp'),
        dcrconfig.ConfigManager().IntegerEdegesFile)
    generate_nodes()
def generate_document_graphs_from_dict_list(dict, edge_dict, list,
                                            directory):
    """Write one integer-graph file per document in *list*.

    dict / edge_dict -- node-id and edge-id mappings (parameter names shadow
    builtins; kept for caller compatibility).
    list      -- documents, each a mapping with 'doc_id' and 'nounPhrases'.
    directory -- target directory; each file is named '---<doc_id>---'.
    """
    weight = dcrconfig.ConfigManager().GraphEdgeWeight
    processed = 0
    for item in list:
        marker = '---' + str(item['doc_id']) + '---'
        out_path = directory + '/' + str(marker)
        phrase_graph = dcrgraph.create_graph_distant_neighbors(
            item['nounPhrases'], weight)
        phrase_graph = dcrgraph.generate_document_integer_graph(
            dict, phrase_graph, marker, edge_dict, 'w', out_path)
        processed += 1
        # Progress counter every 10 documents.
        if processed % 10 == 0:
            print('%d' % processed)
def generate_nounphrase_insert_into_db(data):
    """Extract noun phrases for one queued document and write them back.

    data -- work item dict with 'desc', '_id', 'doc_id' and a shared Mongo
    'connection' (as queued by nounphrase_generate).  Progress timestamps
    are accumulated into a status line printed at the end.
    """
    global count
    try:
        status = "{:<8}".format(str(count)) + " :"
        status += str(datetime.datetime.now())
        count += 1
        mongoport = int(config.ConfigManager().MongoDBPort)
        col = config.ConfigManager().IntelligenceDataCollection
        desc = data['desc']
        noun_phrases = dcrnlp.extract_nounphrases_sentences(desc)
        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet = utility.clean_dict()
        DBSet = utility.clean_dict()
        UpdateTemplateWhere['_id'] = data['_id']
        UpdateTemplateSet['nounPhrases'] = noun_phrases
        UpdateTemplateSet['description'] = desc
        DBSet['$set'] = UpdateTemplateSet
        status += " |" + str(datetime.datetime.now())
        custom.update_data_to_Db_con(mongoport,
                                     config.ConfigManager().IntelligenceDb,
                                     col, UpdateTemplateWhere, DBSet,
                                     data['connection'])
        status += " |" + str(datetime.datetime.now())
        status += " :" + "{:<9}".format(str(data['doc_id']))
        print(status)
    except BaseException as ex:
        # Bug fix: the timestamp used to be a discarded bare expression
        # statement, so it never made it into the log message.
        exception_message = '\n' + 'Exception:' + '\n'
        exception_message += str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(dcrconfig.ConfigManager().SemanticGraphLogFile,
                              'a', exception_message)
def get_normalized_dictionary():
    """Rebuild the {(node1, node2): edge_id} map from the semantic graph.

    Relabels the graph's phrase nodes with their persisted integer ids and
    numbers the edges sequentially, always keying on (smaller, larger) node
    id so (a, b) and (b, a) map to the same entry.
    """
    print('Reading Semantic Graph...')
    graph = nx.read_gexf(dcrconfig.ConfigManager().SemanticGraphFile)
    # create an integer mapping for each of the phrases
    id_graph = nx.relabel_nodes(graph, load_node_dict())
    edge_ids = {}
    next_id = 0
    for node_a, node_b, _data in id_graph.edges(data=True):
        lo, hi = (node_a, node_b) if node_a <= node_b else (node_b, node_a)
        next_id += 1
        edge_ids[(lo, hi)] = next_id
    return edge_ids
def load_document_edges(edge_file_path=None):
    """Parse the document edge file into per-document edge-id lists.

    File layout: a marker line like ``---123---`` starts each document,
    followed by its edge lines whose first column is the integer edge id.
    Returns a list of ``{'id': doc_id, 'edges': [edge_id, ...]}`` dicts.

    The path defaults to the configured DocumentsEdgesIntegerFile so
    existing no-argument callers are unaffected.

    Bug fixes: the final document used to be dropped (its edges were only
    flushed when the *next* marker appeared), and the file handle was never
    closed.  An unused ``jdcount`` counter was removed.
    """
    if edge_file_path is None:
        edge_file_path = dcrconfig.ConfigManager().DocumentsEdgesIntegerFile
    docs = []
    doc_edges = []
    doc_id = 0
    with open(edge_file_path, 'r') as edge_file:
        for line in edge_file:
            line = line.strip()
            if line.startswith('--'):
                # New document marker: flush the previous document first.
                if len(doc_edges) > 0:
                    docs.append({'id': doc_id, 'edges': doc_edges})
                    doc_edges = []
                doc_id = int(line.strip('-'))
            if not (line.startswith('--') or len(line.strip()) < 1):
                doc_edges.append(int(line.split(' ')[0]))
    # Flush the trailing document (previously lost).
    if doc_edges:
        docs.append({'id': doc_id, 'edges': doc_edges})
    return docs
def generate_document_graphs_from_list(dict, edge_dict, candidates,
                                       req_cand_file):
    """Append an integer graph per candidate to the requirement file.

    dict / edge_dict -- node-id and edge-id mappings (first name shadows the
    builtin; kept for caller compatibility).
    candidates    -- mappings with 'id' and 'phrases' per candidate.
    req_cand_file -- target file; each candidate's graph is appended under a
                     '---<id>---' marker.
    """
    weight = dcrconfig.ConfigManager().GraphEdgeWeight
    done = 0
    for cand in candidates:
        marker = '---' + str(cand["id"]) + '---'
        print("writing %s" % req_cand_file)
        cand_graph = dcrgraph.create_graph_distant_neighbors(cand["phrases"],
                                                             weight)
        cand_graph = dcrgraph.generate_document_integer_graph(
            dict, cand_graph, marker, edge_dict, 'a', req_cand_file)
        done += 1
        # Progress counter every 10 candidates.
        if done % 10 == 0:
            print('%d' % done)
#!/usr/bin/python3.4 # File transfer. # Runs shell script file to transfer files import subprocess import os import config import utility import dcrconfig if __name__ == "__main__": fileTransferDestination = config.ConfigManager( ).webServerIp + ':' + config.ConfigManager().mountDirectory semanticGraph = dcrconfig.ConfigManager().SemanticGraphFile.replace( config.ConfigManager().mountDirectory + '/', '') intEdges = dcrconfig.ConfigManager().IntegerEdegesFile.replace( config.ConfigManager().mountDirectory + '/', '') nodeDict = dcrconfig.ConfigManager().NodeFile.replace( config.ConfigManager().mountDirectory + '/', '') # Transferring knowledge files before spark server reboot subprocess.call([ config.ConfigManager().knowledgeFilesTransferScript, config.ConfigManager().webServerPassword, semanticGraph, intEdges, nodeDict, fileTransferDestination, config.ConfigManager().knowledgeFilesBackup ])
#!/usr/bin/python3.4 # Generates integer graphs for documents from a phrase file # Reads a graph from a predefined file and optimizes # by converting it into integer nodes import networkx as nx import dcrgraphcompactor import dcrconfig import utility import datetime # main function entry if __name__ == "__main__": utility.write_to_file( dcrconfig.ConfigManager().SemanticGraphLogFile, 'a', 'Semantic graph Generation Step 10..! (dcrdocumentintgraphgenerator.py) ' + str(datetime.datetime.now())) mapping_dict = dcrgraphcompactor.load_node_dict() # edge_int_dict = dcrgraphcompactor.get_normalized_dictionary() edge_int_dict = dcrgraphcompactor.get_normalized_dictionary_from_int_edges( ) print('Saving Integer Document Graphs...') dcrgraphcompactor.generate_document_graphs(mapping_dict, edge_int_dict) print("Successfully Completed.!")