コード例 #1
0
def run():
    """Load the frozen list of normalized noun phrases and extract features.

    The phrase list is read from a file that was written once rather than
    being rebuilt on every call, so repeated runs see identical input.
    """
    # The file was produced a single time from sorted(getAllPhrases()) with
    # empty entries removed; it is deliberately not regenerated here in order
    # to keep the data consistent between runs.
    normalized_path = '../../outputs/np_extract_all_normalized.txt'
    phrases = fileHandler.getwords(normalized_path, split=False)
    extractFeatures(phrases)
コード例 #2
0
def anotherrun(repeat=False):
    """Load the known-phrase scores, optionally re-running flagged entries.

    Args:
        repeat: When true, the entries reported by ``checkzeroscores`` are
            handed to ``secondrun`` repeatedly (at most 10 rounds, stopping
            early once none remain) and the resulting scores are written
            back to the score file.

    Returns:
        The scores as read from the file, or as updated by the re-runs.
    """
    scores_path = '../../outputs/knownphrase/knowphrase_all_v2.txt'
    rawscores = filehandler.getwords(scores_path, split=False)
    # run_dict {idx, score}: the entries that still need another attempt.
    run_dict = checkzeroscores(rawscores)

    if not repeat:
        return rawscores

    # Refill the 0 values that were caused by Google blocking requests
    # (per the original author's note).
    for _ in range(10):
        # The tolerated share of remaining entries is 0.0, so this only
        # stops early once nothing is left to re-run.
        if len(run_dict) <= round(0.0 * len(rawscores)):
            break
        rawscores = secondrun(run_dict, rawscores)
        run_dict = checkzeroscores(rawscores)

    filehandler.writeListToFile(rawscores, scores_path)
    return rawscores
コード例 #3
0
def printHighQPhrases(debug=False):
    """Write every phrase whose score equals 4 to ``../../tmp/kp4.txt``.

    Phrases come from the keys of ``getPhrases()`` with
    ``removePosFromWord`` applied to each space-separated token. Scores are
    read from the known-phrase score file and matched to phrases by position.

    Args:
        debug: When true, also print the full phrase list and its length.
    """
    phrases = [
        ' '.join(removePosFromWord(token) for token in tagged.split(' '))
        for tagged in getPhrases().keys()
    ]
    scores = filehandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)

    # Scores and phrases are aligned by index; a score list longer than the
    # phrase list still raises IndexError, exactly as before.
    high_quality = [
        phrases[idx] for idx, score in enumerate(scores) if int(score) == 4
    ]

    print("len of high quality phrase: ", len(high_quality))
    if debug:
        print(phrases)
        print("length of total phrases: ", len(phrases))
    filehandler.writeListToFile(high_quality, '../../tmp/kp4.txt')
コード例 #4
0
    def removePosTagFromDict(self, all_phrases_raw_pos):
        """Strip ``%TAG`` suffixes from phrase keys and collect their scores.

        Args:
            all_phrases_raw_pos: Mapping keyed by phrases whose words carry a
                ``%`` + uppercase-letters suffix. Its iteration order is
                paired positionally with the lines of the score file.

        Returns:
            A ``(phrases, scores)`` tuple. ``phrases`` maps each stripped key
            to the value of the last original key that collapsed onto it;
            ``scores`` maps each stripped key to the ``max`` of the scores of
            all original keys that collapsed onto it.
        """
        pos_suffix = re.compile(r'%[A-Z]+\b')
        scoresraw = fileHandler.getwords(
            '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
        print("all_phrases_raw_pos: ", all_phrases_raw_pos)

        all_phrases_raw = {}
        scoresdict = {}
        for idx, tagged_key in enumerate(all_phrases_raw_pos):
            plain_key = pos_suffix.sub('', tagged_key)
            all_phrases_raw[plain_key] = all_phrases_raw_pos[tagged_key]
            if plain_key in scoresdict:
                # NOTE(review): if getwords returns strings, max() compares
                # them lexicographically rather than numerically - confirm.
                scoresdict[plain_key] = max(scoresdict[plain_key],
                                            scoresraw[idx])
            else:
                scoresdict[plain_key] = scoresraw[idx]

        return all_phrases_raw, scoresdict
コード例 #5
0
        print(ex)


def partition_worker(words, pid):
    """Fetch a two-sentence Wikipedia summary for each word and save them.

    Summaries are written one per line to
    ``../../tmp/wiki_quality_sentences_<pid>.txt``. The words whose summary
    was fetched and written successfully are then saved to
    ``../../outputs/wiki_work_<pid>.txt``.

    Args:
        words: Iterable of page titles / search terms to look up.
        pid: Partition identifier used to build both output file names.
    """
    work = []
    # Summaries routinely contain non-ASCII text. Without an explicit
    # encoding, a non-UTF-8 locale makes f.write raise UnicodeEncodeError,
    # which the handler below would swallow, silently dropping the word.
    with open('../../tmp/wiki_quality_sentences_{}.txt'.format(pid), 'w',
              encoding='utf-8') as f:
        for word in tqdm(words):
            try:
                sent = wikipedia.summary(word, sentences=2, auto_suggest=True)
                # Keep each summary on a single line of the output file.
                sent = sent.replace('\n', ' ')
                f.write("%s\n" % sent)
                # Only record the word once its summary has been written.
                work.append(word)
            except Exception as ex:
                # Best effort: report the failure and move on to the next word.
                print(ex)

    filehandler.writeListToFile(work,
                                "../../outputs/wiki_work_{}.txt".format(pid))


if __name__ == '__main__':
    # Script entry point: read the full word list, then fetch summaries for
    # one hand-picked slice of it as partition 1.
    words = filehandler.getwords('../../input/wiki_quality.txt', split=False)

    partition_worker(words[3401:3500], pid=1)
コード例 #6
0
def getAllPhrases():
    """Return the distinct phrases found across the three extraction outputs.

    Returns:
        A list holding the union of the entries of ``np_extract_r1.txt``,
        ``np_extract_r2.txt`` and ``np_extract_r3.txt``. The list is built
        from a set, so its order is unspecified.
    """
    round_files = (
        '../../outputs/np_extract_r1.txt',
        '../../outputs/np_extract_r2.txt',
        '../../outputs/np_extract_r3.txt',
    )
    first, second, third = (
        set(fileHandler.getwords(path, split=False)) for path in round_files
    )
    return list(first | second | third)
コード例 #7
0
def integratelist():
    """Concatenate the 13 known-phrase score shards into one score file.

    Shards ``knowphrase_0.txt`` through ``knowphrase_12.txt`` are read in
    numeric order, joined end to end, printed together with their total
    length, and written to ``knowphrase_all_v2.txt``.
    """
    shard_template = '../../outputs/knownphrase/knowphrase_{}.txt'
    scores = []
    for shard_idx in range(13):
        scores.extend(
            filehandler.getwords(shard_template.format(shard_idx),
                                 split=False))

    print(scores)
    print("len of scores: ", len(scores))
    # Zero-score items in the merged file are picked out and re-run later.
    filehandler.writeListToFile(
        scores, '../../outputs/knownphrase/knowphrase_all_v2.txt')
コード例 #8
0
def writePhrasesWithoutDuplicates():
    """Write the unique multi-word phrases from ``kp4.txt`` to a new file.

    Duplicates are dropped while keeping first-occurrence order, and entries
    that contain no space (single words) are discarded. The result is saved
    to ``../../outputs/is_known_phrase_nodup.txt``.
    """
    raw_phrases = filehandler.getwords("../../tmp/kp4.txt", split=False)
    # dict.fromkeys keeps the first occurrence of each phrase, in order; an
    # entry splits into more than one part on ' ' exactly when it has a space.
    multiword_unique = [
        phrase for phrase in dict.fromkeys(raw_phrases) if ' ' in phrase
    ]
    filehandler.writeListToFile(multiword_unique,
                                "../../outputs/is_known_phrase_nodup.txt")