Code Example #1
def nounphrase_generate():
    c = MongoClient(dcrconfig.ConfigManager().Datadb)
    db = c[config.ConfigManager().IntelligenceDb]
    col = db[config.ConfigManager().IntelligenceDataCollection]
    docs = col.find({'nounPhrases': ""}, {
        "description": 1,
        "doc_id": 1,
        "_id": 1
    })

    mongoport = int(config.ConfigManager().MongoDBPort)
    connection = dbmanager.mongoDB_connection(mongoport)

    for doc in docs:
        try:
            data = {}
            data['desc'] = doc['description']
            data['_id'] = doc['_id']
            data['doc_id'] = doc['doc_id']
            data['connection'] = connection
            q.put(data)

        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(
                dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
                exception_message)
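The snippet above logs through a utility.write_to_file helper that takes the file path, the open mode, and the text to write. The utility module itself is not shown; a minimal sketch of such a helper, inferred only from the call sites here and not part of the original project:

def write_to_file(file_path, mode, text):
    # Open the file in the given mode (typically 'a' for append) and write the text.
    with open(file_path, mode) as f:
        f.write(text)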
Code Example #2
def nounphrase_generate():
    docs = custom.retrieve_rowdata_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().DataCollectionDBCollection,
        dictionaries.DBWhereConditon)
    connection = dbmanager.mongoDB_connection(
        int(config.ConfigManager().MongoDBPort))
    description = ''
    for doc in docs:
        try:
            description = doc['description']
            noun_phrases = dcrnlp.extract_nounphrases_sentences(description)
            dictionaries.UpdateTemplateSet['nounPhrases'] = noun_phrases
            dictionaries.UpdateTemplateWhere['_id'] = doc['_id']
            dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
            custom.update_data_to_Db_con(
                int(config.ConfigManager().MongoDBPort),
                config.ConfigManager().DataCollectionDB,
                config.ConfigManager().DataCollectionDBCollection,
                dictionaries.UpdateTemplateWhere, dictionaries.DBSet,
                connection)
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)
Code Example #3
def route_compfileread(filepaths):
    for filepath in filepaths:
        try:
            # extracting data from .gz file.
            gzipfile = gzip.GzipFile(filepath, 'rb')
            gzipdata = gzipfile.read()
            gzipfile.close()

            # getting complete file name of the .gz file
            compfilename = utility.filename_from_filepath(filepath)
            # extracting the original file name
            filename = compfilename.split('.gz')[0]
            print(filename)

            # creating file and writing data
            uncompfile = open(
                config.ConfigManager().PCFileFolder + '/' + filename, 'wb')
            uncompfile.write(gzipdata)
            uncompfile.close()

        except BaseException as ex:
            utility.log_exception_with_filepath(ex, filepath)
            # writing to file the names of files that cannot be extracted
            # using gzip (derive the name from filepath, since compfilename
            # may not be set if the extraction failed early)
            utility.write_to_file(
                config.ConfigManager().PCDataAnalysisResultsFile, 'a',
                utility.filename_from_filepath(filepath) + '  cannot be extracted')
        os.remove(filepath)
Code Example #4
def main():
    """load the feature and cluster them by linkage.
    Record all the cluster tree and dump it.
    """
    words_set = load_model(GOOGLE_WORD_FEATURE)

    features = []
    words = []
    count = 0

    for word in words_set:
        v = words_set[word]
        if v is not None:
            features.append(v)
            words.append(word)
            count += 1

    clusters, maxcluster = hierarchical_cluster(features, words)

    # cannot use json.dump; serialize via str() and swap quotes instead
    txt = str(clusters[maxcluster])
    txt = txt.replace("'", '"')
    write_to_file(GOOGLE_CLUSTER_PATH,
                  txt.encode(encoding='utf_8', errors='strict'),
                  mode='wb+')
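This and several later snippets call a different write_to_file variant in which the data comes second, the mode is a keyword argument, and the payload is often bytes. A minimal sketch of that variant, again inferred from the call sites rather than taken from the original projects:

def write_to_file(file_path, data, mode='a+'):
    # Write str or bytes data to the file using the given mode
    # (e.g. 'wb+' to truncate and write bytes, 'a+' to append text).
    with open(file_path, mode) as f:
        f.write(data)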
Code Example #5
def automate_processes():
    utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                          'PromptCloudautomationscript running')
    try:
        # download files (xml format) into PCCompData within mnt/nlpdata
        exec(open('pc_download_crawldata_threading.py').read(), globals())
        # compress the PCCompdata folder
        exec(open('compress.py').read(), globals())
        # unzip files created in PCData folder time stored in dataloadconfig..
        exec(open('pc_unzip_gz.py').read(), globals())
        # download data into pcdataanalysisresults.ods
        exec(open('analyze_crawldata.py').read(), globals())
        # for automatically sending emails
        # exec(open('mailsend.py').read(), globals())
        # store analysis file in s3 backup
        exec(open('pcdataanalysisbackup.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        # .encode('utf8'))
        utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                              exception_message)
Code Example #6
File: FDF.py  Project: oerpli/FirstDosesFirst
    def print_table(self):
        tmp = (self.efficacy.iloc[:50:3] * 100).astype(int)
        tmp.columns = [x + " (%)" for x in tmp.columns]
        write_to_file(ANALYSIS_NOTES, "EfficacyTable", tmp.to_markdown())
        tmp.plot()
        imgPath = OUT_FOLDER / "img" / "Efficacy.png"
        plt.savefig(imgPath)
        caption = "Estimated efficacy after n days"
        write_img_to_file(ANALYSIS_NOTES, "EfficacyFigure", imgPath, caption)
Code Example #7
def valid_records():
    global totaljobsdict
    global jobsitedict

    # subtracting dictionary key values to get valid records per site
    validjobsdict = {key: totaljobsdict[key] - jobsitedict.get(key, 0)
                     for key in totaljobsdict.keys()}
    utility.write_to_file(config.ConfigManager().PCDataAnalysisResultsFile,
                          'a', 'Total valid records per site: ')
    utility.write_to_file(config.ConfigManager().PCDataAnalysisResultsFile,
                          'a', str(validjobsdict))
Code Example #8
def automate_processes():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Knowledge build automation running..! ' +
        str(datetime.datetime.now()))
    try:
        # Copies files from the previous cycle
        exec(open('filecopy.py').read(), globals())
        # Copy the noun phrase text from Mongo DB
        exec(open('dbtophrasefile.py').read(), globals())
        # Remove any ngram of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Checks if there is an existing semantic graph, if yes load and update
        # with new documents else create a new semantic graph and store.
        # Normally, this is run after n gram removal and duplicate
        # find and removal.
        exec(open('dcrgraphgenerator.py').read(), globals())
        # Read the semantic graph which is saved using dcrgraphgenerator.py
        # and read the document phrase file and create optimized integer
        # semantic edge file.
        exec(open('dcrgraphcompactor.py').read(), globals())
        # Save the node dictionary using pickle to file. This will be used by
        # above programs for finding node ids
        exec(open('savenodes.py').read(), globals())
        # Generate document integer graph and store. This will be used for
        # searching the documents.
        # exec(open('dcrdocumentintgraphgenerator.py').read(), globals())
        # Copy the noun phrase text from Mongo DB (Intelligence collection)
        exec(open('stdbtophrasefile.py').read(), globals())
        # Remove any ngram of 3 or more words.
        exec(open('ngramremoval.py').read(), globals())
        # Remove duplicates and save it in new distinct phrase file.
        exec(open('duplicatefinder.py').read(), globals())
        # Checks if there is an existing semantic graph, if yes load and update
        # with new documents else create a new semantic graph and store.
        # Normally, this is run after n gram removal and duplicate
        # find and removal.
        exec(open('stdcrgraphgenerator.py').read(), globals())
        # Read the semantic graph which is saved using dcrgraphgenerator.py
        # and read the document phrase file and create optimized integer
        # semantic edge file.
        exec(open('stdcrgraphcompactor.py').read(), globals())
        # Save the node dictionary using pickle to file. This will be used by
        # above programs for finding node ids
        exec(open('savenodes.py').read(), globals())
        # Transfer generated intelligence files
        exec(open('filetransfer.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(
            ex,
            dcrconfig.ConfigManager().SemanticGraphLogFile)
Code Example #9
def display_scores(vectorizer, tfidf_result, savesubfolder):
    # http://stackoverflow.com/questions/16078015/

    write_to_file(
        os.path.join(STORAGE,
                     '{}/webpage.vocabulary.txt'.format(savesubfolder)), b'',
        'wb+')
    for fea_name in vectorizer.get_feature_names():
        fea_name = fea_name + '\n'
        write_to_file(
            os.path.join(STORAGE,
                         '{}/webpage.vocabulary.txt'.format(savesubfolder)),
            fea_name.encode('utf-8'))

    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())

    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

    index = 0
    write_to_file(
        os.path.join(STORAGE,
                     '{}/webpage.vocabulary_top.txt'.format(savesubfolder)),
        b'', 'wb+')
    for item in sorted_scores:
        index += 1
        txt = "{0} {1:50} Score: {2}\n".format(index, repr(item[0]), item[1])
        write_to_file(
            os.path.join(
                STORAGE,
                '{}/webpage.vocabulary_top.txt'.format(savesubfolder)),
            txt.encode('utf-8'))
Code Example #10
def test_json_to_txt():

    folder = os.path.join(STORAGE, 'test_json')
    save_apicalls = os.path.join(STORAGE, 'test_apicall.txt')
    save_rtvalue = os.path.join(STORAGE, 'test_rtvalue.txt')

    write_to_file(save_apicalls, '', mode='w+')
    write_to_file(save_rtvalue, '', mode='w+')

    for subfolder in os.listdir(folder):
        fullsubfolder = os.path.join(folder, subfolder)
        for fname in os.listdir(fullsubfolder):
            fullname = os.path.join(fullsubfolder, fname)
            with open(fullname) as f:
                jsondata = json.load(f)
                common = [str(jsondata['file_id'])]
                apis = []
                rtns = []
                for tid in jsondata['threads']:

                    api_calls = jsondata['threads'][tid]['api_calls']
                    apis += [apicall[1] for apicall in api_calls]
                    rtns += [apicall[2] for apicall in api_calls]

                    apis += ['.']
                    rtns += ['.']

                txt = ' '.join(apis)
                txt = ','.join(common + [txt])
                write_to_file(save_apicalls, txt + '\n', mode='a+')

                txt = ' '.join(rtns)
                txt = ','.join(common + [txt])

                write_to_file(save_rtvalue, txt + '\n', mode='a+')
Code Example #11
def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'stautomationscript running')
    try:
        # Reading requirement and candidate data from ST
        exec(open('stdataread.py').read(), globals())
        # Extracting candidate resumes
        exec(open('resume_extract.py').read(), globals())
        # Read extracted resumes and update to 'resumeText' field
        exec(open('resumeread.py').read(), globals())
        # Appending 'resumeText' to description field
        exec(open('resume_append.py').read(), globals())
        # Generate nounphrases for candidate table
        exec(open('stnounphrase_generate.py').read(), globals())
        # Update requirements and rates for candidates
        exec(open('requirement_update_fastest.py').read(), globals())
        # Update candidate statuses which changed
        exec(open('submission_status_update.py').read(), globals())
        # Extracting requirement description files
        exec(open('req_desc_file_extract.py').read(), globals())
        # Read extracted description files and update to 'reqFileDesc' field
        exec(open('req_desc_file_read.py').read(), globals())
        # Appending 'reqFileDesc' to description field
        exec(open('req_desc_file_append.py').read(), globals())
        # Generate nounPhrases for requirement tables
        exec(open('streqnounphrase_generate.py').read(), globals())
        # Get supplier info
        exec(open('stsupplierdataread.py').read(), globals())
        # Candidate resume screening
        exec(open('contactinfodetect.py').read(), globals())
        # Client master list load
        # exec(open('stclientsdataread.py').read(), globals())
        # # Currency master list load
        # exec(open('currencydataread.py').read(), globals())
        # # Industry master list load
        # exec(open('industrydataread.py').read(), globals())
        # # MSP master list load
        # exec(open('stmspdataread.py').read(), globals())
        # currency code update
        exec(open('stcandidateCurrency_update_fastest.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        # .encode('utf8'))
        utility.write_to_file(config.ConfigManager().LogFile, 'a',
                              exception_message)
Code Example #12
    def create_account(self, account, card_file, initial_balance):
        # Check initial balance to be greater or equal to 10
        if Decimal(initial_balance) < 10:
            raise SystemExit(255)

        # Check if card_file exists.
        if os.path.isfile(card_file):
            raise SystemExit(255)

        card_secret = generate_random_secret()

        self.send_request('create_account', account, card_secret,
                          initial_balance)

        # Save new card_file
        write_to_file(card_file, card_secret)
Code Example #13
def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'pcanalysisautomationscript running')
    try:
        exec(open('download_crawldata_threading.py').read(), globals())
        exec(open('unzip_gz.py').read(), globals())
        exec(open('analyze_crawldata.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        # .encode('utf8'))
        utility.write_to_file(config.ConfigManager().LogFile, 'a',
                              exception_message)
Code Example #14
def readstagingdata():
    utility.write_to_file(
        config.ConfigManager().LogFile, 'a',
        'Staging dataread running' + ' ' + str(datetime.datetime.now()))
    ratesConfigValues = ratesConfig.find({})
    ratesDate = ratesConfigValues[0]['stagingDateModified']
    ratesData = stagingcoll.find({'dateModified': {"$gt": ratesDate}},
                                 no_cursor_timeout=True)
    doc_id = ratesConfigValues[0]['masterDocId']
    objectid = ratesConfigValues[0]['_id']
    dateModifiedList = []
    geoCountryQuery = "select distinct name,iso_alpha3, fips_code from geo_country order by name"
    geoStateQuery = "select ga1.name, gn.admin1, gn.latitude, gn.longitude from geo_admin1 ga1 inner join geo_name gn on ga1.geonameid = gn.geonameid"
    geoCityQuery = "select distinct sPlaceName, fLatitude, fLongitude from GeoPostal order by sPlaceName"
    geoZipCodeQuery = "select distinct sPostalCode, fLatitude, fLongitude from GeoPostal  order by sPostalCode"
    countryDictList = custom.create_sql_dict_list(
        geoCountryQuery,
        config.ConfigManager().geographicalDataConnstr)
    stateDictList = custom.create_sql_dict_list(
        geoStateQuery,
        config.ConfigManager().geographicalDataConnstr)
    cityDictList = custom.create_sql_dict_list(
        geoCityQuery,
        config.ConfigManager().geographicalDataConnstr)
    zipCodeDictList = custom.create_sql_dict_list(
        geoZipCodeQuery,
        config.ConfigManager().geographicalDataConnstr)
    i = 0
    for row in ratesData:
        dateModifiedList.append(row['dateModified'])
        i += 1
        del row['_id']
        doc_id += 1
        row['doc_id'] = doc_id
        row['stagingDateModified'] = max(dateModifiedList)
        row['i'] = i
        row['objectid'] = objectid
        row['countryDictList'] = countryDictList
        row['stateDictList'] = stateDictList
        row['cityDictList'] = cityDictList
        row['zipCodeDictList'] = zipCodeDictList
        q.put(row)

    ratesData.close()
    del ratesData
Code Example #15
    def __init__(self, ip, port, auth_file):
        # Store all accounts data in memory
        self._accounts = {}

        # Check if auth_file already exists.
        if os.path.isfile(auth_file):
            raise SystemExit(255)

        # Create server using TCP/IP socket
        self._server = socket.socket()
        self._server.bind((ip, port))
        self._server.listen(3)

        # Generate new auth token and write to file
        self._auth_token = generate_random_secret()
        write_to_file(auth_file, self._auth_token)

        print('created', flush=1)
Code Example #16
def model_build_main(storage, datasetpath, featureheaders, targetheaders):

    name, clf, modelinfo = model_build(datasetpath, featureheaders,
                                       targetheaders)

    summarypath = os.path.join(storage, 'model/lightgbm.model.esimate')
    modelsavepath = os.path.join(storage, 'model/lightgbm.model')
    modelinfosavepath = os.path.join(storage, 'model/lightgbm.modelinfo')

    txt = json.dumps(modelinfo, indent=4)

    write_to_file(modelinfosavepath, txt.encode('utf-8'), mode='wb+')

    save_model(clf, modelsavepath)

    print('model summary:')

    print('save model summary->', summarypath)
    write_to_file(summarypath, txt.encode('utf-8'), mode='wb+')
Code Example #17
def requirement_update():
    (dictionaries.DBWhereCondition1)['documentType'] = 'candidate details'
    (dictionaries.DBWhereCondition1)['dataSource'] = 'Smart Track'
    docs = custom.retrieve_rowdata_from_DB_notimeout(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().DataCollectionDBCollection,
        dictionaries.DBWhereCondition1)
    recordnumber = 0
    for doc in docs:
        recordnumber += 1
        requirementIDList = []
        query = custom.fetch_query(
            config.ConfigManager().STCandidateSubmissionsQueryId)
        cursor = dbmanager.cursor_odbc_connection(
            config.ConfigManager().STConnStr)
        db_data_dict = dbmanager.cursor_execute(cursor, query)
        db_data = db_data_dict['dbdata']
        db_data_cursorexec = db_data_dict['cursor_exec']
        cursor_description = db_data_cursorexec.description
        column_headers = [column[0] for column in cursor_description]
        for row in db_data:
            try:
                data_dict = dict(utility.zip_list(column_headers, row))
                if (data_dict['CandidateID'] == doc['candidateid']):
                    requirementIDList.append(data_dict['requirementID'])
            except BaseException as ex:
                exception_message = '\n' + 'Exception:' + \
                    str(datetime.datetime.now()) + '\n'
                exception_message += 'File: ' + '\n'
                exception_message += '\n' + str(ex) + '\n'
                exception_message += '-' * 100
                utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                      exception_message)
        dictionaries.UpdateTemplateSet['requirementIDList'] = requirementIDList
        dictionaries.UpdateTemplateWhere['candidateid'] = doc['candidateid']
        dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
        print(recordnumber, doc['candidateid'], requirementIDList)
        custom.update_data_to_Db_noupsert(
            int(config.ConfigManager().MongoDBPort),
            config.ConfigManager().DataCollectionDB,
            config.ConfigManager().DataCollectionDBCollection,
            dictionaries.UpdateTemplateWhere, dictionaries.DBSet)
Code Example #18
File: analysis.py  Project: scorpio2017/Word2Tree
def dump_flat_result(oidpath, flatsavepath):

    write_to_file(flatsavepath,
                  ''.encode(encoding='utf_8', errors='strict'),
                  mode='wb+')
    for filename in os.listdir(oidpath):
        fullname = os.path.join(oidpath, filename)
        if filename.endswith('.json'):
            print(fullname)
            with open(fullname, encoding='utf-8') as f:
                data = json.load(f)
                v = []
                get_children(data, v)
                txt = ' '.join(v)
                txt += '\n'
                txt = filename + "\t" + str(len(v)) + "\t" + txt
                write_to_file(flatsavepath,
                              txt.encode(encoding='utf_8', errors='strict'))
                fullname = fullname.replace('.json', '')
                draw_cluster_tree(v, fullname + '.pdf')
Code Example #19
def automate_processes():
    utility.write_to_file(config.ConfigManager().PromptcloudLogFile, 'a',
                          'PromptCloudautomationscript running')
    try:
        # download files (xml format) into PCCompData within mnt/nlpdata
        exec(
            open('rates_pc_download_crawldata_threading.py').read(), globals())
        # compress the PCCompdata folder
        exec(open('compress.py').read(), globals())
        # unzip files created in PCData folder time stored in dataloadconfig..
        exec(open('pc_rates_unzip_gz.py').read(), globals())
        # download data into pcdataanalysisresults.ods
        exec(open('pc_rates_dataload.py').read(), globals())
        # for automatically sending emails
        # exec(open('mailsend.py').read(), globals())
        # store analysis file in s3 backup
        # exec(open('pcdataanalysisbackup.py').read(), globals())
    except BaseException as ex:
        utility.log_exception_file(ex,
                                   config.ConfigManager().PromptcloudLogFile)
Code Example #20
def train_file_split():
    'Split the train file into many files by file id. Rows with the same file id should be saved in one file.'
    count = 0
    with open(trainpath, 'r') as f:
        for line in f:
            count += 1
            if count == 1:
                continue

            splits = line.split(',')
            if len(splits) >= 2:
                label = splits[1]
                fileid = splits[0]
                subfolder = label

                savefolder = os.path.join(STORAGE, 'train_flat', subfolder)
                os.makedirs(savefolder, exist_ok=True)
                savefile = os.path.join(savefolder, str(fileid) + '.txt')
                write_to_file(savefile, line)
                if count % 10000 == 0:
                    print(count)
Code Example #21
def remove_ngram_from_allphrasefile():
    utility.write_to_file(
        dcrconfig.ConfigManager().SemanticGraphLogFile, 'a',
        'Semantic graph Generation Step 5..! (ngramremoval.py) ' +
        str(datetime.datetime.now()))
    # Loop through all phrase files and generate the integer graph
    phrase_file = open(dcrconfig.ConfigManager().PhraseFile, 'r')
    ng_phrase_file = open(dcrconfig.ConfigManager().NGramFilteredPhraseFile,
                          'w')

    for line in phrase_file:
        line = line.strip()
        if (line.startswith('--')):
            #  If the line starts with -- then it is a job description beginning,
            #  so print a dot to indicate progress
            print('.', end='')
            sys.stdout.flush()
            print(line, file=ng_phrase_file)
            # If the line doesn't start with -- and is not just whitespace
        if not (line.startswith('--') or len(line.strip()) < 1):
            print(remove_ngram(line), file=ng_phrase_file)
Code Example #22
def folder_to_json(folder, fileparser, save_folder):
    stime = time.time()
    count = 0
    for subfolder in os.listdir(folder):
        fullsubfolder = os.path.join(folder, subfolder)
        save_subfolder = os.path.join(save_folder, subfolder)
        os.makedirs(save_subfolder, exist_ok=True)
        for fname in os.listdir(fullsubfolder):
            try:
                # parse the source file and write its JSON form under save_subfolder
                fullname = os.path.join(fullsubfolder, fname)
                document = fileparser(fullname)
                txt = json.dumps(document)
                savename = os.path.join(save_subfolder, fname)
                write_to_file(savename, txt, mode='a+')
                count += 1
                if count % 1000 == 0:
                    print(count)
            except Exception as e:
                traceback.print_exc()
                print(fullname)

    print(time.time() - stime, count)
Code Example #23
def lda_similarity_main(dictionary, lda_save_path, corpora_path, index_path):

    questions = get_question()

    corpus = load_corpora(corpora_path)

    ldamodel = load_lad_model(lda_save_path)

    if os.path.exists(index_path):
        index_sim = similarities.MatrixSimilarity.load(index_path)
    else:
        index_sim = similarities.MatrixSimilarity(ldamodel[corpus])
        index_sim.save(index_path)

    write_to_file('../data/query.lda.txt', ''.encode('utf-8'), mode='wb+')
    for i in range(1000):
        querydoc = questions[i]
        vec_bow = dictionary.doc2bow(jieba.lcut(querydoc))
        vec_lda = ldamodel[vec_bow]

        sims = index_sim[vec_lda]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        for sim in sims[:10]:
            index = sim[0]
            distance = sim[1]
            txt = '{} {} {} {} {}\n'.format(i, querydoc, index,
                                            questions[index], distance)
            write_to_file('../data/query.lda.txt', txt.encode('utf-8'))
        write_to_file('../data/query.lda.txt', '\n'.encode('utf-8'))
Code Example #24
def nounphrase_generate():
    (dictionaries.DBWhereConditon)['documentType'] = 'candidate details'
    (dictionaries.DBWhereConditon)['dataSource'] = 'Smart Track'
    docs = custom.retrieve_rowdata_from_DB(
        int(config.ConfigManager().MongoDBPort),
        config.ConfigManager().DataCollectionDB,
        config.ConfigManager().DataCollectionDBCollection,
        dictionaries.DBWhereConditon)
    description = ''
    for doc in docs:
        try:
            if doc['descriptionOld'] is not None:
                print('Inside if')
                description = doc['descriptionOld'] + '. ' + doc['resumeText']
                noun_phrases = dcrnlp.extract_nounphrases_sentences(
                    description)
                dictionaries.UpdateTemplateSet['nounPhrases'] = noun_phrases
                dictionaries.UpdateTemplateSet['description'] = description
            else:
                print('Inside else')
                description = doc['resumeText']
                noun_phrases = dcrnlp.extract_nounphrases_sentences(
                    description)
                dictionaries.UpdateTemplateSet['description'] = description
                dictionaries.UpdateTemplateSet['nounPhrases'] = noun_phrases
            dictionaries.UpdateTemplateWhere['_id'] = doc['_id']
            dictionaries.DBSet['$set'] = dictionaries.UpdateTemplateSet
            custom.update_data_to_Db(
                int(config.ConfigManager().MongoDBPort),
                config.ConfigManager().DataCollectionDB,
                config.ConfigManager().DataCollectionDBCollection,
                dictionaries.UpdateTemplateWhere, dictionaries.DBSet)
        except BaseException as ex:
            exception_message = '\n' + 'Exception:' + \
                str(datetime.datetime.now()) + '\n'
            exception_message += 'File: ' + '\n'
            exception_message += '\n' + str(ex) + '\n'
            exception_message += '-' * 100
            utility.write_to_file(config.ConfigManager().LogFile, 'a',
                                  exception_message)
Code Example #25
def rp_similarity_main(dictionary, tfidf_save_path, rp_save_path, corpora_path,
                       index_path):

    questions = get_question()

    corpus = load_corpora(corpora_path)

    rpmodel = load_rp_model(rp_save_path)
    tfidfmodel = load_tfidf_model(tfidf_save_path)

    if os.path.exists(index_path):
        index_sim = similarities.MatrixSimilarity.load(index_path)
    else:
        print('build matrix similarity')
        corpus_tfidf = tfidfmodel[corpus]
        index_sim = similarities.MatrixSimilarity(rpmodel[corpus_tfidf])
        index_sim.save(index_path)

    write_to_file('../data/query.rp.txt', ''.encode('utf-8'), mode='wb+')
    for i in range(1000):
        querydoc = questions[i]
        vec_bow = dictionary.doc2bow(jieba.lcut(querydoc))
        vectfidf = tfidfmodel[vec_bow]
        vec_rp = rpmodel[vectfidf]

        sims = index_sim[vec_rp]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])

        for sim in sims[:10]:
            index = sim[0]
            distance = sim[1]
            txt = '{} {} {} {} {}\n'.format(i, querydoc, index,
                                            questions[index], distance)
            write_to_file('../data/query.rp.txt', txt.encode('utf-8'))
        write_to_file('../data/query.rp.txt', '\n'.encode('utf-8'))
Code Example #26
def test_file_split(mode, left):
    'Split the test file into many files by file id. Rows with the same file id should be saved in one file.'
    count = 0

    with open(testpath, 'r') as f:
        for line in f:
            count += 1
            if count == 1:
                continue

            splits = line.split(',')
            if len(splits) >= 2:
                fileid = splits[0]
                moderesult = int(fileid) % mode
                if moderesult == left:
                    savefolder = os.path.join(STORAGE, 'test_flat')
                    savefolder = os.path.join(savefolder, str(left))
                    os.makedirs(savefolder, exist_ok=True)
                    savefile = os.path.join(savefolder, str(fileid) + '.txt')
                    write_to_file(savefile, line, mode='a+')
                    if count % 10000 == 0:
                        print(count, fileid, mode, left)
Code Example #27
def generate_nounphrase_insert_into_db(data):
    global count
    try:
        status = "{:<8}".format(str(count)) + " :"
        status += str(datetime.datetime.now())
        count += 1
        mongoport = int(config.ConfigManager().MongoDBPort)
        col = config.ConfigManager().IntelligenceDataCollection
        desc = data['desc']

        noun_phrases = dcrnlp.extract_nounphrases_sentences(desc)

        UpdateTemplateWhere = utility.clean_dict()
        UpdateTemplateSet = utility.clean_dict()
        DBSet = utility.clean_dict()
        UpdateTemplateWhere['_id'] = data['_id']
        UpdateTemplateSet['nounPhrases'] = noun_phrases
        UpdateTemplateSet['description'] = desc
        DBSet['$set'] = UpdateTemplateSet

        status += " |" + str(datetime.datetime.now())
        custom.update_data_to_Db_con(mongoport,
                                     config.ConfigManager().IntelligenceDb,
                                     col, UpdateTemplateWhere, DBSet,
                                     data['connection'])

        status += " |" + str(datetime.datetime.now())
        status += " :" + "{:<9}".format(str(data['doc_id']))
        print(status)

    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        utility.write_to_file(dcrconfig.ConfigManager().SemanticGraphLogFile,
                              'a', exception_message)
Code Example #28
File: analysis.py  Project: scorpio2017/Word2Tree
def split_tree(root, clsid):

    if type(root) is not dict:
        return

    count = root['count']

    if clsid in knowoidgroup:

        txt = json.dumps(root, indent=4)

        write_to_file(os.path.join(CHECKED_OID_SAVE_FOLDER,
                                   '{}.json'.format(clsid)),
                      txt.encode(encoding='utf_8', errors='strict'),
                      mode='wb+')
        return

    if count < OID_NODE_MAX_COUNT:
        #dump the tree to file if count < OID_NODE_MAX_COUNT

        if count < 4:
            return

        txt = json.dumps(root, indent=4)

        write_to_file(os.path.join(OID_SAVE_FOLDER, '{}.json'.format(clsid)),
                      txt.encode(encoding='utf_8', errors='strict'),
                      mode='wb+')

        return

    left, right = root['children']

    oid = '{}1'.format(clsid)
    split_tree(left, oid)

    oid = '{}2'.format(clsid)
    split_tree(right, oid)
Code Example #29
def folder_to_basic_feature(folder, feature_save_path):

    featureheader = [
        'file_id', 'label', 'threadnum', 'totalapicall', 'maxapicall',
        'minapicall', 'meanapicallperthread'
    ]
    stime = time.time()
    count = 0
    write_to_file(feature_save_path, '', mode='w+')
    for subfolder in os.listdir(folder):
        fullsubfolder = os.path.join(folder, subfolder)
        for fname in os.listdir(fullsubfolder):
            fullname = os.path.join(fullsubfolder, fname)
            try:
                feature = basic_feature(fullname)
                attr = []
                for head in featureheader:
                    attr.append(str(feature.get(head, 0)))
                txt = ','.join(attr)
                write_to_file(feature_save_path, txt + '\n', mode='a+')

            except Exception as e:
                traceback.print_exc()
Code Example #30
def automate_processes():
    utility.write_to_file(config.ConfigManager().LogFile, 'a',
                          'staging automationscript running')
    try:
        # industry data read
        exec(open('industrydataread.py').read(), globals())
        # currency data read
        exec(open('currencydataread.py').read(), globals())
        # ST msp users data read
        exec(open('stmspdataread.py').read(), globals())
        # ST clients data read
        exec(open('stclientsdataread.py').read(), globals())
        # data move staging to master
        exec(open('stagingdataread.py').read(), globals())
    except BaseException as ex:
        exception_message = '\n' + 'Exception:' + \
            str(datetime.datetime.now()) + '\n'
        exception_message += 'File: ' + '\n'
        exception_message += '\n' + str(ex) + '\n'
        exception_message += '-' * 100
        # .encode('utf8'))
        utility.write_to_file(config.ConfigManager().LogFile, 'a',
                              exception_message)