def generate_search_nodes(text):
    # Extract noun phrases from the text, build a distant-neighbor graph,
    # and print the document's integer node list from the shared node dictionary.
    noun_phrases = dcrnlp.extract_nounphrases_sentences(text)
    integer_dict = dcrgraphcompactor.load_node_dict()
    graph = dcrgraph.create_graph_distant_neighbors(noun_phrases)
    dcrgraph.print_graph(graph)
    nodes = dcrgraph.generate_document_integer_nodes(integer_dict, graph)
    print('\ninput.nodes = ' + ' '.join(str(x) for x in nodes))
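
A minimal call sketch, assuming dcrnlp, dcrgraph, and dcrgraphcompactor are importable and the node dictionary has already been produced by the earlier compaction step (the input text is purely illustrative):

# hypothetical usage; prints the graph and the integer node list for the text
generate_search_nodes('senior java developer with apache spark experience')
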
Example #2
def signaturegraph(row):
    # Build the document's signature graph from its noun phrases and flag the row.
    noun_phrases = row['nounPhrases']
    mapping_dict = dcrgraphcompactor.load_node_dict()
    edge_int_dict = dcrgraphcompactor.get_normalized_dictionary()
    graph = dcrgraphcompactor.generate_document_graphs_from_dict_list_savetodb(
        mapping_dict, edge_int_dict, noun_phrases)
    row['signGraph'] = graph
    row['signGraphFlag'] = 1
    return row
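
A sketch of how the row transform might be applied, assuming the row is a plain dict whose 'nounPhrases' field was filled by the noun-phrase step, and that the compactor's dictionaries (and the database connection implied by the savetodb helper) are available; the example values are hypothetical:

# hypothetical row; signaturegraph only reads 'nounPhrases' and adds the two fields
row = {'doc_id': 1, 'nounPhrases': 'java developer, apache spark, data pipeline'}
row = signaturegraph(row)
print(row['signGraphFlag'])   # 1 once 'signGraph' has been set
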
def integer_edges_from_dict(graph):
    # Map each edge's endpoint phrases to their integer ids and emit
    # [node1, node2, weight] triples with node1 <= node2.
    int_dict = dcrgraphcompactor.load_node_dict()
    edges = []
    # edges_iter is the NetworkX 1.x API; on 2.x use graph.edges(data=True)
    for v1, v2, w in graph.edges_iter(data=True):
        weight = int(w['weight'])
        n1 = int_dict[v1]
        n2 = int_dict[v2]
        if n1 > n2:
            n1, n2 = n2, n1
        edges.append([n1, n2, weight])
    return edges
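
A small input sketch, assuming NetworkX 1.x (the snippet relies on the 1.x edges_iter API) and that both phrases already appear as keys in the dictionary returned by load_node_dict:

import networkx as nx

# hypothetical two-node graph; the phrases must exist in the node dictionary
g = nx.Graph()
g.add_edge('data scientist', 'machine learning', weight=3)
print(integer_edges_from_dict(g))   # e.g. [[n1, n2, 3]] with n1 <= n2
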
Example #4
#!/usr/bin/python3.4
#   Generates integer graphs for documents from a phrase file
#   Reads a graph from a predefined file and optimizes it
#   by converting its nodes into integers

import networkx as nx
import dcrgraphcompactor
import dcrconfig
import utility
import datetime

#   main function entry
if __name__ == "__main__":
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Semantic graph Generation Step 10..! (dcrdocumentintgraphgenerator.py) '
        + str(datetime.datetime.now()))
    mapping_dict = dcrgraphcompactor.load_node_dict()
    # edge_int_dict = dcrgraphcompactor.get_normalized_dictionary()
    edge_int_dict = dcrgraphcompactor.get_normalized_dictionary_from_int_edges()
    print('Saving Integer Document Graphs...')
    dcrgraphcompactor.generate_document_graphs(mapping_dict, edge_int_dict)
    print("Successfully Completed.!")
def readstagingdata():
    # Read newly modified staging records in batches, enrich them, and
    # insert them into the master collection.
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging data read running' + ' ' + str(datetime.datetime.now()))
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    # ratesDataDateMax = ((stagingcoll.find().sort([('dateModified', -1)]).limit(1))[0])['dateModified']
    ratesDataCount = (stagingcoll.count({'dateModified': {"$gt": ratesDate}}))

    geoCountryQuery = "select distinct iso_alpha2, name, iso_alpha3, fips_code from geo_country order by name"
    # geoStateQuery = "select ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid"
    geoStateQuery = "select gc.iso_alpha2, ga1.code, ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid inner join geo_country gc on ltrim(rtrim(ga1.code)) like '%'+ ltrim(rtrim(gc.iso_alpha2))+'.' + '%'"
    geoCityQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPlaceName, fLatitude, fLongitude from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct sAdminName1, sAdminCode1, sCountryCode, sPostalCode, fLatitude, fLongitude from GeoPostal  order by sPostalCode"
    # countryDictList = custom.create_sql_dict_list(geoCountryQuery, config.ConfigManager().geographicalDataConnstr)
    # stateDictList = custom.create_sql_dict_list(geoStateQuery, config.ConfigManager().geographicalDataConnstr)
    # cityDictList = custom.create_sql_dict_list(geoCityQuery, config.ConfigManager().geographicalDataConnstr)
    # zipCodeDictList = custom.create_sql_dict_list(geoZipCodeQuery, config.ConfigManager().geographicalDataConnstr)

    geoCountryDict = custom.create_geo_dict(
        geoCountryQuery,
        config.ConfigManager().geographicalDataConnstr, 'Country')
    geoStateDict = custom.create_geo_dict(
        geoStateQuery,
        config.ConfigManager().geographicalDataConnstr, 'State')
    geoCityDict = custom.create_geo_dict(
        geoCityQuery,
        config.ConfigManager().geographicalDataConnstr, 'City')
    geoZipCodeDict = custom.create_geo_dict(
        geoZipCodeQuery,
        config.ConfigManager().geographicalDataConnstr, 'zipCode')

    cleanUpListDict = data_cleanup_lists()

    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    objectid = ratesConfigValues[0]['_id']
    lastDateTime = ratesConfigValues[0]['masterAutomationStartDate']
    oldDate = datetime.datetime.strptime(lastDateTime, '%Y-%m-%d')

    mapping_dict = dcrgraphcompactor.load_node_dict()
    edge_int_dict = dcrgraphcompactor.get_normalized_dictionary_from_int_edges()

    neighborCount = dcrgraph.neighbor_count_for_edge_weight()
    diminition_percent = dcrconfig.ConfigManager().DiminitionPercentage
    # while ratesDate < ratesDataDateMax:
    while ratesDataCount > 0:
        ratesConfigValues = ratesConfig.find({})
        ratesDate = ratesConfigValues[0]['stagingDateModified']
        # countTotalRecords = stagingcoll.count({'dateModified': {"$gt": ratesDate}})
        stepSize = int(config.ConfigManager().StagingMasterTransferStep)

        # if countTotalRecords < stepSize:
        # stepSize = countTotalRecords
        if ratesDataCount < stepSize:
            stepSize = ratesDataCount
        ratesDataCount = ratesDataCount - stepSize

        ratesData = stagingcoll.find(
            {
                'dateModified': {
                    "$gt": ratesDate
                }
            }, no_cursor_timeout=True).sort([
                ('dateModified', 1)
            ]).limit(int(config.ConfigManager().StagingMasterTransferStep))
        doc_id = ratesConfigValues[0]['masterDocId']
        dataList = []
        dateModifiedList = []

        i = 0
        for row in ratesData:
            try:

                dateModifiedList.append(row['dateModified'])
                i += 1
                print(i)
                del row['_id']
                doc_id += 1
                row['doc_id'] = doc_id
                # print('Start data clean ' + str(datetime.datetime.now()))
                # "Step:1 data scrubbing for email,phone,url and candidate name"
                row = dataclean(row, cleanUpListDict)
                # print('Start noun phrase gen ' + str(datetime.datetime.now()))
                # "Step:2 nounphrases generation"
                row = generatenounphrases(row)
                # print('Start signature graph ' + str(datetime.datetime.now()))
                # "Step:3 signature generation"
                row = signaturegraph(row, mapping_dict, edge_int_dict,
                                     neighborCount, diminition_percent)
                # print('Start rates calculation ' + str(datetime.datetime.now()))
                # "Step:4 rates calculation"
                row = rates_calculation.billratescalculation(row)
                # print('Start rate available ' + str(datetime.datetime.now()))
                # Put rate value calculation before this check
                # "Step:5 verification of rate availability"
                row = rate_available(row)
                # print('Start geo verify ' + str(datetime.datetime.now()))
                # geographical data check and additions
                # placeholder values used during geo verification and removed below
                row['iso_alpha2_value'] = ')(*&^'
                row['admin1_value'] = ')(*&^'
                row['state_name'] = ')(*&^'
                row = custom.geo_data_verify(row, geoCountryDict, 'country')
                row = custom.geo_data_verify(row, geoStateDict, 'state')
                row = custom.geo_data_verify(row, geoCityDict, 'city')
                row = custom.geo_data_verify(row, geoZipCodeDict, 'zipCode')
                del row['iso_alpha2_value']
                del row['admin1_value']
                del row['state_name']

                # print('Stop geo verify ' + str(datetime.datetime.now()))
                dataList.append(row)

            except BaseException as ex:
                utility.log_exception_file(ex, config.ConfigManager().LogFile)

        ratesData.close()
        del ratesData

        if dataList:
            # Step:6 insert data to db
            mastercoll.insert(dataList)
            doc_id = row['doc_id']

        todayDate = datetime.date.today()
        todayDate = datetime.datetime.strptime(str(todayDate), '%Y-%m-%d')
        delta = todayDate - oldDate
        days = delta.days

        if dateModifiedList:
            ratesDate = max(dateModifiedList)
        # Step:7 update config collection with doc_id and datetime
        updateconfigcollection(doc_id, ratesDate, objectid)

        if days >= 5:
            break
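
The control flow above follows a watermark pattern: read records newer than a stored dateModified value in fixed-size, date-sorted batches and advance the watermark past each batch. A simplified sketch of just that loop, assuming stagingcoll and config are set up as above and watermark holds the stored stagingDateModified value (the full function also re-reads the watermark from the config collection each pass):

# simplified sketch of the staging watermark loop (not the full pipeline)
step_size = int(config.ConfigManager().StagingMasterTransferStep)
remaining = stagingcoll.count({'dateModified': {'$gt': watermark}})
while remaining > 0:
    batch = stagingcoll.find(
        {'dateModified': {'$gt': watermark}},
        no_cursor_timeout=True).sort([('dateModified', 1)]).limit(step_size)
    seen_dates = [row['dateModified'] for row in batch]
    batch.close()
    if seen_dates:
        watermark = max(seen_dates)   # advance the watermark past this batch
    remaining -= min(step_size, remaining)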