Example #1
    def checkingKnownPhrases(self, repeat=False):

        rawscores = fileHandler.getwords(
            '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
        run_dict = self.checkzeroscores(rawscores)
        cnt = 0
        # this step refills the 0 scores left over when Google blocked the requests
        if repeat:
            # retry until no zero scores remain (the threshold is 0), at most 10 passes
            while len(run_dict) > 0 and cnt < 10:
                rawscores = knownphrase.secondrun(run_dict, rawscores)
                run_dict = self.checkzeroscores(rawscores)
                cnt += 1
            fileHandler.writeListToFile(
                rawscores, '../../outputs/knownphrase/knowphrase_all.txt')

        # update all the patterns with rawscore
        # ppp = []
        # for i in range(len(self.patterns)):
        #     self.patterns[i].is_know_phrase = rawscores[i]
        #     ppp.append(self.patterns[i].phrase)
        #     fileHandler.writeListToFile(ppp, '../../tmp/phrase_check_fm.txt')

        # turn the scores into a dict
        ## attention: this step was migrated into removePosFromDict()
        # print("len of scores dict: ", len(self.scoresdict))
        # print("len of patterns: ", len(self.patterns))

        assert len(self.scoresdict) == len(self.patterns)
        for i in range(len(self.patterns)):
            phrase = self.patterns[i].phrase
            self.patterns[i].is_know_phrase = self.scoresdict[phrase]

        return rawscores
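checkzeroscores is not shown on this page; a minimal stand-in consistent with how it is used here (collecting the indices whose score is still 0 so they can be retried) might be:

# hypothetical stand-in for the checkzeroscores helper used above;
# assumes rawscores holds numeric strings, as read by getwords
def checkzeroscores(rawscores):
    return {i: s for i, s in enumerate(rawscores) if int(s) == 0}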
def simplerun():
    phrases = list(getPhrases().keys())
    st1 = 4633
    end1 = 5633
    tmp = phrases[st1:end1]
    # print("tmp: ", tmp)
    output = [checkgoogle(t) for t in tmp]
    filehandler.writeListToFile(
        output,
        "../../outputs/knownphrase/knowphrase_{}.txt".format(5),
        append=False)
def partition_worker(phrases, pid):
    # score one partition of the phrase list and write the results for this pid
    work = []
    for phrase in tqdm(phrases):
        try:
            score = checkgoogle(phrase)
            work.append(score)
        except Exception as ex:
            print(ex)

    filehandler.writeListToFile(work, "../../outputs/knownphrases/work_{}.txt".format(pid))
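Since partition_worker takes a pid, the phrase list is presumably split across worker processes; a minimal driver sketch (run_partitions, the worker count, and the even chunking are assumptions, not part of the original code) could be:

# hypothetical driver fanning partition_worker out over several processes
from multiprocessing import Process

def run_partitions(phrases, n_workers=4):
    chunk = (len(phrases) + n_workers - 1) // n_workers
    procs = [Process(target=partition_worker,
                     args=(phrases[pid * chunk:(pid + 1) * chunk], pid))
             for pid in range(n_workers)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()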
Example #4
def partition_worker(words, pid):
    work = []
    with open('../../tmp/wiki_quality_sentences_{}.txt'.format(pid), 'w') as f:
        for word in tqdm(words):
            try:
                sent = wikipedia.summary(word, sentences=2, auto_suggest=True)
                sent = sent.replace('\n', ' ')
                f.write("%s\n" % sent)
                work.append(word)
            except Exception as ex:
                print(ex)

    filehandler.writeListToFile(work, "../../outputs/wiki_work_{}.txt".format(pid))
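filehandler.writeListToFile itself is not included in these snippets; judging from the call sites (a list of items, an output path, an optional append flag), a minimal stand-in could look like:

# hypothetical stand-in matching how filehandler.writeListToFile is called here
def writeListToFile(items, path, append=False):
    mode = 'a' if append else 'w'
    with open(path, mode) as f:
        for item in items:
            f.write("%s\n" % item)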
def run():
    t5_list = filehandler.getwords('../input/t5.txt')
    t6_list = filehandler.getwords('../input/t6.txt')
    # list_diff = [item for item in t5_list if item not in t6_list]
    list_diff = list(set(t5_list) - set(t6_list))
    filehandler.writeListToFile(list_diff, "t4Dt5.txt")

    t5_dict = filehandler.getwordswithscore('../input/t5.txt')
    list_diff_score = [(t5_dict[t] + '\t' + t) for t in list_diff]

    # sort numerically on the score column, not lexicographically
    list_diff_score = sorted(list_diff_score,
                             key=lambda t: float(t.split('\t')[0]),
                             reverse=True)
    print(list_diff_score)

    filehandler.writeListToFile(list_diff_score, "t4Dt5_score.txt")
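getwordswithscore is also external; assuming each line of t5.txt holds a score and a word separated by a tab (which matches the score + '\t' + word lines written above), a sketch could be:

# hypothetical stand-in: parse "score<TAB>word" lines into {word: score}
def getwordswithscore(path):
    scores = {}
    with open(path) as f:
        for line in f:
            score, word = line.rstrip('\n').split('\t', 1)
            scores[word] = score
    return scores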
def anotherrun(repeat=False):
    # run_dict: {idx: score}
    rawscores = filehandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
    run_dict = checkzeroscores(rawscores)
    cnt = 0
    # this step refills the 0 scores left over when Google blocked the requests
    if repeat:
        # retry until no zero scores remain (the threshold is 0), at most 10 passes
        while len(run_dict) > 0 and cnt < 10:
            rawscores = secondrun(run_dict, rawscores)
            run_dict = checkzeroscores(rawscores)
            cnt += 1
        filehandler.writeListToFile(
            rawscores, '../../outputs/knownphrase/knowphrase_all_v2.txt')

    # update all the patterns with rawscore
    return rawscores
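secondrun is where the blocked queries get retried; a sketch consistent with its use (re-score only the indices in run_dict and leave the rest untouched, relying on the same phrase ordering that printHighQPhrases assumes) might be:

# hypothetical stand-in: re-query Google only for the still-zero indices
def secondrun(run_dict, rawscores):
    phrases = list(getPhrases().keys())
    for i in run_dict:
        try:
            rawscores[i] = checkgoogle(phrases[i])
        except Exception as ex:
            print(ex)
    return rawscores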
def printHighQPhrases(debug=False):
    phrases = list(getPhrases().keys())
    phrases = [
        ' '.join([removePosFromWord(t) for t in phrase.split(' ')])
        for phrase in phrases
    ]
    scores = filehandler.getwords(
        '../../outputs/knownphrase/knowphrase_all_v2.txt', split=False)
    output = []
    for i in range(len(scores)):
        if int(scores[i]) == 4:
            output.append(phrases[i])
    print("len of high quality phrase: ", len(output))
    if debug:
        print(phrases)
        print("length of total phrases: ", len(phrases))
    filehandler.writeListToFile(output, '../../tmp/kp4.txt')
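removePosFromWord strips the POS tag that the phrase keys evidently carry; assuming a slash-delimited word/TAG form (the tag convention is not shown anywhere on this page), a minimal sketch:

# hypothetical stand-in: drop a trailing POS tag, e.g. "network/NN" -> "network"
def removePosFromWord(word):
    return word.rsplit('/', 1)[0]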
def run(n):
    phrases = list(getPhrases().keys())

    ed1 = 5150
    for i in range(n):
        try:
            st1 = ed1
            ed1 = st1 + 700
            tmp = phrases[st1:ed1]
            # print("tmp: ", tmp)
            output = [checkgoogle(t) for t in tmp]
            filehandler.writeListToFile(
                output,
                "../../outputs/knownphrase/knowphrase_{}.txt".format(i + 12),
                append=False)
        except Exception as e:
            print(e)
            continue
def integratelist():

    # concatenate the 13 partial score files knowphrase_0.txt .. knowphrase_12.txt
    scores = []
    for i in range(13):
        scores += filehandler.getwords(
            '../../outputs/knownphrase/knowphrase_{}.txt'.format(i), split=False)
    print(scores)
    print("len of scores: ", len(scores))
    # pick out the zero-score items and run again
    filehandler.writeListToFile(
        scores, '../../outputs/knownphrase/knowphrase_all_v2.txt')
Example #10
    def clustering(self):
        label_dict = getLabels(method='ward')

        # split the patterns into the five ward clusters
        # (for the 6- or 7-cluster variants, extend range(5))
        groups = [[
            self.patterns[i].phrase for i in label_dict if label_dict[i] == k
        ] for k in range(5)]

        print("clustering=====")
        for k, group in enumerate(groups, start=1):
            print("group{} length: ".format(k), len(group))
            fileHandler.writeListToFile(
                group, "../../outputs/features_group{}_part.txt".format(k))
Example #11
    # way 3: use trees
    all_phrases_dict = run(freq=True, all_phrase_dict=all_phrases_dict)

    # remove the empty keys
    all_phrases_dict = {
        k: v
        for k, v in all_phrases_dict.items() if v is not None
    }

    print(all_phrases_dict)
    list1 = list(all_phrases_dict.keys())
    list2 = list(all_phrases_dict.values())
    all_phrases_freq_list = ['\t'.join(map(str, i)) for i in zip(list2, list1)]

    # sort the list
    all_phrases_freq_list = sorted(all_phrases_freq_list,
                                   key=lambda x: int(x.split('\t')[0]),
                                   reverse=True)
    all_phrases_freq_list2 = sorted(all_phrases_freq_list,
                                    key=lambda x: x.split('\t')[1],
                                    reverse=False)

    # write output to file
    writeListToFile(all_phrases_freq_list,
                    '../../outputs/np_extract_with_freq.txt')
    writeListToFile(all_phrases_freq_list2,
                    '../../outputs/np_extract_with_freq_alpha.txt')

    # also pickle the file for future use
    pickle.dump(dict(all_phrases_dict), open('../../tmp/phrases_freq', 'wb'))
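Later runs can then reload the pickled frequency dict instead of re-extracting:

# reload the phrase-frequency dict written above
import pickle
with open('../../tmp/phrases_freq', 'rb') as f:
    phrases_freq = pickle.load(f)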
Example #12
        """
    chunker = nltk.RegexpParser(grammar)
    toks = nltk.regexp_tokenize(text, sentence_re)
    postoks = nltk.tag.pos_tag(toks)
    tree = chunker.parse(postoks)
    terms = get_terms(tree, freq=freq)
    output = []
    for term in terms:
        name_entity = ' '.join(term)
        name_entity = Normalizer().cleanPhrase(name_entity)
        output.append(name_entity)
        if freq:
            all_phrase_dict[name_entity] += 1
    if freq:
        return all_phrase_dict
    return output
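get_terms is not included in the snippet; a common version of this NLTK chunking recipe walks the parse tree and yields the word leaves of every NP subtree, e.g.:

# hypothetical stand-in: yield the word leaves of each NP chunk in the parse
# tree (freq is accepted only to match the call above)
def get_terms(tree, freq=False):
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP'):
        yield [word for word, tag in subtree.leaves()]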


if __name__ == '__main__':

    # nltk.download('wordnet')

    output = run()
    print(output)
    print("length of output: " , len(output))
    writeListToFile(output, '../../outputs/np_extract_r3.txt')

    # test = run(freq=True, all_phrase_dict=defaultdict(lambda:0))
    # print(test)

def get_continuous_chunks(text, chunk_func=ne_chunk):
    # NOTE: the start of this function is missing from the snippet. This head is
    # an assumed reconstruction of the standard NLTK recipe implied by the calls
    # below; it assumes `from nltk import ne_chunk, pos_tag, word_tokenize, Tree`.
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []
    for subtree in chunked:
        if isinstance(subtree, Tree):
            current_chunk.append(" ".join([
                token for token, pos in subtree.leaves()
            ]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            named_entity = Normalizer().cleanPhrase(named_entity)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
                current_chunk = []
        else:
            continue

    return continuous_chunk


if __name__ == '__main__':
    # Defining a grammar & Parser
    NP = "NP: {(<V\w+>|<NN\w?>)+.*<NN\w?>}"
    chunker = RegexpParser(NP)

    nz = Normalizer()

    # apply to our file
    sent = getsent('/Users/beidan/RASHIP/PDFs-TextExtract/output/section.txt')
    print(sent)
    # way 1 : use ner chunker
    chunks = get_continuous_chunks(sent)
    # way 2 :use custom chunker
    #chunks = get_continuous_chunks(sent, chunker.parse)
    print(chunks)
    writeListToFile(chunks, '../../outputs/np_extract_r1.txt')
def writePhrasesWithoutDuplicates():
    phrases = filehandler.getwords("../../tmp/kp4.txt", split=False)
    phrases = list(dict.fromkeys(phrases))
    phrases = [t for t in phrases if len(t.split(' ')) > 1]
    filehandler.writeListToFile(phrases,
                                "../../outputs/is_known_phrase_nodup.txt")