Code example #1
File: test_phrases.py Project: lopusz/gensim
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        seen_scores = set()

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        min_count = float(bigram.min_count)
        len_vocab = float(len(bigram.vocab))
        graph = float(bigram.vocab[b"graph"])
        data = float(bigram.vocab[b"data"])
        data_and_graph = float(bigram.vocab[b"data_and_graph"])
        human = float(bigram.vocab[b"human"])
        interface = float(bigram.vocab[b"interface"])
        human_interface = float(bigram.vocab[b"human_interface"])

        assert seen_scores == set([
            # score for data and graph
            round((data_and_graph - min_count) / data / graph * len_vocab, 3),
            # score for human interface
            round((human_interface - min_count) / human / interface * len_vocab, 3),
        ])
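For reference, the score asserted above is the Mikolov et al. (2013) formula: (bigram_count - min_count) / (count(a) * count(b)) * len_vocab. A minimal standalone sketch of such a scorer, assuming gensim's custom-scorer calling convention (six count arguments); illustrative, not gensim's exact source:

def mikolov_default_scorer(worda_count, wordb_count, bigram_count,
                           len_vocab, min_count, corpus_word_count):
    # (count(a b) - min_count) / (count(a) * count(b)) * |vocab|
    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab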
Code example #2
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         common_terms=self.common_terms)

        seen_scores = set()

        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        min_count = float(bigram.min_count)
        len_vocab = float(len(bigram.vocab))
        graph = float(bigram.vocab[b"graph"])
        data = float(bigram.vocab[b"data"])
        data_and_graph = float(bigram.vocab[b"data_and_graph"])
        human = float(bigram.vocab[b"human"])
        interface = float(bigram.vocab[b"interface"])
        human_interface = float(bigram.vocab[b"human_interface"])

        assert seen_scores == set([
            # score for data and graph
            round((data_and_graph - min_count) / data / graph * len_vocab, 3),
            # score for human interface
            round(
                (human_interface - min_count) / human / interface * len_vocab,
                3),
        ])
Code example #3
def extract_phrases(filename, min_count):
    rst = build_input(filename)
    gen = list(itertools.chain.from_iterable(rst))
    bigram = Phrases(gen, threshold=5, min_count=min_count)
    trigram = Phrases(bigram[gen], threshold=2, min_count=2)
    # write
    with open('data/phrases_%d_%s' % (min_count, os.path.basename(filename)),
              'wb') as fout:
        ph_dic = {}
        for phrase, score in bigram.export_phrases(gen):
            ph_dic[phrase] = score
        for phrase, score in trigram.export_phrases(bigram[gen]):
            ph_dic[phrase] = score
        for phrase, score in ph_dic.items():
            if re.search(rb'\d+', phrase):  # skip phrases containing digits (phrase is bytes)
                continue
            phrase = b"_".join(phrase.split(b' '))
            fout.write(phrase + b'\n')
Code example #4
    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        seen_bigrams = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == {b'graph minors', b'human interface'}
Code example #5
File: test_phrases.py Project: lopusz/gensim
    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        seen_bigrams = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == {b'graph minors', b'human interface'}
Code example #6
File: test_phrases.py Project: rmalouf/gensim
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        seen_bigrams = set()

        for phrase, score in bigram.export_phrases(sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == {b'response time', b'graph minors', b'human interface'}
Code example #7
def extract_phrases(reviews_sents, reviews_docs, save=False):
    logging.info("Extracting phrases...")
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)
    if save:
        with open('../data/phrase/phrases_%d_%s' % (3, 'app_review'), 'wb') as fout:
            ph_dic = {}
            for phrase, score in bigram.export_phrases(reviews_sents):
                ph_dic[phrase] = score
            for phrase, score in trigram.export_phrases(bigram[reviews_sents]):
                ph_dic[phrase] = score
            for phrase, score in ph_dic.items():
                if re.search(rb'\d+', phrase):  # skip phrases containing digits (phrase is bytes)
                    continue
                phrase = b"_".join(phrase.split(b' '))
                fout.write(phrase + b'\n')
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

    return trigram[bigram[reviews_docs]]
Code example #8
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        seen_bigrams = set()

        for phrase, score in bigram.export_phrases(sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == set(
            [b'response time', b'graph minors', b'human interface'])
Code example #9
File: test_phrases.py Project: lopusz/gensim
    def testCustomScorer(self):
        """ test using a custom scoring function """

        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)

        seen_scores = []
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
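dumb_scorer is defined elsewhere in test_phrases.py; a plausible minimal sketch, assuming gensim's custom-scorer contract (a callable taking six count arguments and returning a number):

def dumb_scorer(worda_count, wordb_count, bigram_count,
                len_vocab, min_count, corpus_word_count):
    # Rate every candidate bigram as 1, so only `threshold` decides what is kept.
    return 1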
Code example #10
File: test_phrases.py Project: lopusz/gensim
    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        seen_bigrams = set()
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_bigrams.add(phrase)
        assert seen_bigrams == set([
            b'data and graph',
            b'human interface',
        ])
Code example #11
def show_phrases(corpus, threshold=1000, shown=1000):
    # Training the multi-word expression detector
    tokenized_sentences = tokenize_sentences(corpus)
    phrases = Phrases(tokenized_sentences, threshold=threshold)
    i = 0
    for phrase, score in phrases.export_phrases(tokenized_sentences):
        if i > shown:
            break
        else:
            print("Expression : {0}, score = {1}".format(
                phrase.decode('utf-8'), score))
        i = i + 1
Code example #12
def generating_bigrams(final_df):
    eligibility_criteria = final_df['features']
    bigrams_input = [each_row.split() for each_row in eligibility_criteria]
    bigram_transformer = Phrases(bigrams_input, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=4)

    fd = open("bigrams.txt", 'a')
    for phrase, score in bigram_transformer.export_phrases(bigrams_input):
        fd.write(u'{0}   {1}'.format(phrase, score))
    fd.close()

    return bigram_transformer
Code example #13
    def test_create_and_decode_phrases(self):
        df = pd.read_csv('text_analytics/tests/NYT.Corruption')
        phrases = Phrases(
            sentences=read_clean(df),
            min_count=100,
            threshold=0.70,
            scoring="npmi",
            max_vocab_size=100000000,
            delimiter="_",
        )
        exported = phrases.export_phrases()
        return exported
Code example #14
File: test_phrases.py Project: lopusz/gensim
    def testCustomScorer(self):
        """ test using a custom scoring function """

        bigram = Phrases(self.sentences, min_count=1, threshold=.001,
                         scoring=dumb_scorer, common_terms=self.common_terms)

        seen_scores = []
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
Code example #15
File: test_phrases.py Project: lopusz/gensim
    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')

        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == {
            .882,  # score for graph minors
            .714  # score for human interface
        }
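The 'npmi' scorer normalizes pointwise mutual information into [-1, 1]: npmi(a, b) = ln(p(a, b) / (p(a) * p(b))) / -ln(p(a, b)). A hedged sketch of that computation, again using gensim's scorer calling convention (illustrative, not gensim's exact implementation):

from math import log

def npmi_sketch(worda_count, wordb_count, bigram_count,
                len_vocab, min_count, corpus_word_count):
    # Normalized PMI, bounded in [-1, 1]; higher means a stronger collocation.
    pa = worda_count / corpus_word_count
    pb = wordb_count / corpus_word_count
    pab = bigram_count / corpus_word_count
    return log(pab / (pa * pb)) / -log(pab)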
Code example #16
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        seen_scores = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == {
            5.167,  # score for graph minors
            3.444  # score for human interface
        }
Code example #17
File: test_phrases.py Project: lopusz/gensim
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        seen_scores = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == {
            5.167,  # score for graph minors
            3.444  # score for human interface
        }
Code example #18
File: test_phrases.py Project: zwytop/gensim
    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')

        seen_scores = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            .882,  # score for graph minors
            .714  # score for human interface
        ])
Code example #19
File: test_phrases.py Project: lopusz/gensim
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        seen_bigrams = set()

        for phrase, score in bigram.export_phrases(self.sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == set([
            b'human interface',
            b'graph of trees',
            b'data and graph',
            b'lack of interest',
        ])
Code example #20
File: test_phrases.py Project: yujuyeon0511/gensim
    def test_export_phrases(self):
        """Test Phrases bigram and trigram export phrases."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         delimiter=' ')
        trigram = Phrases(bigram[self.sentences],
                          min_count=1,
                          threshold=1,
                          delimiter=' ')
        seen_bigrams = set(bigram.export_phrases().keys())
        seen_trigrams = set(trigram.export_phrases().keys())

        assert seen_bigrams == set([
            'human interface',
            'response time',
            'graph minors',
            'minors survey',
        ])

        assert seen_trigrams == set([
            'human interface',
            'graph minors survey',
        ])
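Note the API difference: most examples here call export_phrases(sentences) and receive (bytes phrase, score) pairs (older gensim), while this example's no-argument export_phrases() returns a {phrase: score} dict with str keys (gensim 4+). A small compatibility helper, assuming only those two behaviors:

def phrase_scores(model, sentences=None):
    try:
        # gensim 4+: export_phrases() takes no corpus and returns {str: float}.
        return dict(model.export_phrases())
    except TypeError:
        # Older gensim: export_phrases(sentences) yields (bytes, float) pairs.
        return {p.decode('utf-8'): s for p, s in model.export_phrases(sentences)}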
Code example #21
File: test_phrases.py Project: lopusz/gensim
    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(self.sentences, min_count=1, threshold=.5,
                         scoring='npmi', common_terms=self.common_terms)

        seen_scores = set()

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            .74,  # score for data and graph
            .894  # score for human interface
        ])
Code example #22
File: phrases.py Project: XsongyangX/ift6285-hw4
def salient_bigrams(phrases: Phrases):
    """Finds the most salient bigrams

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    for slice in read_corpus():
        phrases.add_vocab(read_slice(slice))

        # evaluate all previous corpus slices
        found = set()
        total_bigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in phrases.export_phrases(
                    read_slice(previous_slice)):
                found.add((phrase, score))
                total_bigrams_encountered += 1
            if previous_slice == slice:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no bigrams found?
        if len(found) == 0:
            output(slice, "")

        # log the top ten bigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase,
                                                     score=score))

        # log the total counts
        output(
            slice, """
Total bigrams: {total}
Unique bigrams: {unique}
Median score:{median}
Max score:{max}
Min score:{min}
""".format(total=total_bigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2][1] if len(found) != 0 else 0,
           max=found[0][1] if len(found) != 0 else 0,
           min=found[-1][1] if len(found) != 0 else 0))

        # will log a time if command line args were enabled
        Timer.try_to_time()
Code example #23
    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         common_terms=self.common_terms)

        seen_bigrams = set()
        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_bigrams.add(phrase)
        assert seen_bigrams == set([
            b'data and graph',
            b'human interface',
        ])
Code example #24
File: test_phrases.py Project: yujuyeon0511/gensim
    def test_export_phrases(self):
        """Test Phrases bigram export phrases."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         delimiter=' ')
        seen_bigrams = set(bigram.export_phrases().keys())
        assert seen_bigrams == set([
            'and graph',
            'data and',
            'graph of',
            'graph survey',
            'human interface',
            'lack of',
            'of interest',
            'of trees',
        ])
Code example #25
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         common_terms=self.common_terms)

        seen_bigrams = set()

        for phrase, score in bigram.export_phrases(self.sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == set([
            b'human interface',
            b'graph of trees',
            b'data and graph',
            b'lack of interest',
        ])
Code example #26
File: test_phrases.py Project: YantianZha/Distr2Vec
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        # with this setting we should get response_time and graph_minors
        bigram1_seen = False
        bigram2_seen = False

        for phrase, score in bigram.export_phrases(sentences):
            if not bigram1_seen and b'response time' == phrase:
                bigram1_seen = True
            elif not bigram2_seen and b'graph minors' == phrase:
                bigram2_seen = True
            if bigram1_seen and bigram2_seen:
                break

        self.assertTrue(bigram1_seen)
        self.assertTrue(bigram2_seen)
Code example #27
    def testCustomScorer(self):
        """ test using a custom scoring function """

        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=.001,
                         scoring=dumb_scorer,
                         common_terms=self.common_terms)

        seen_scores = []
        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
Code example #28
File: test_phrases.py Project: ArifAhmed1995/gensim
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        # with this setting we should get response_time and graph_minors
        bigram1_seen = False
        bigram2_seen = False

        for phrase, score in bigram.export_phrases(sentences):
            if not bigram1_seen and b'response time' == phrase:
                bigram1_seen = True
            elif not bigram2_seen and b'graph minors' == phrase:
                bigram2_seen = True
            if bigram1_seen and bigram2_seen:
                break

        self.assertTrue(bigram1_seen)
        self.assertTrue(bigram2_seen)
Code example #29
    def testCustomScorer(self):
        """ test using a custom scoring function """

        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=.001,
                         scoring=dumb_scorer)

        seen_scores = []
        test_sentences = [[
            'graph', 'minors', 'survey', 'human', 'interface', 'system'
        ]]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(
            seen_scores
        ) == 3  # 'graph minors' and 'survey human' and 'interface system'
Code example #30
    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=.5,
                         scoring='npmi',
                         common_terms=self.common_terms)

        seen_scores = set()

        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            .74,  # score for data and graph
            .894  # score for human interface
        ])
Code example #31
    tri_phrase = Phrases(bi_sentances_tokens,
                         min_count=3,
                         threshold=10,
                         scoring='default')
    tri_gram = Phraser(tri_phrase)
    tri_phrase.save(trigram_model_save_path)
    print('trigram model saved at {}'.format(trigram_model_save_path))

    #%%
    print('run one test:')
    sent = list(jieba.cut("我们今天来谈一谈区块链和人工智能的结合"))
    print('origin split: \n {}'.format(sent))
    print('updated split: \n {}'.format(tri_gram[bi_grapm[sent]]))
    #%%
    ## export all phrases to excel for review
    bi_phrases_b = bi_phrase.export_phrases(sentances_tokens)
    tri_phrases_b = tri_phrase.export_phrases(bi_sentances_tokens)
    detected_bi_phrases = list(
        set([(p.decode('utf-8'), v) for (p, v) in bi_phrases_b]))
    detected_tri_phrases = list(
        set([(p.decode('utf-8'), v) for (p, v) in tri_phrases_b]))
    detected_tri_phrases = [(p.replace('_', ' '), v)
                            for (p, v) in detected_tri_phrases
                            if "_" in p]  ## only keep real trigrams
    #%%
    ## increase in vocabulary
    bi_phrases_df = pd.DataFrame(detected_bi_phrases,
                                 columns=['phrases', 'score'])
    tri_phrases_df = pd.DataFrame(detected_tri_phrases,
                                  columns=['phrases', 'score'])
    bi_phrases_df.sort_values(by='score', inplace=True, ascending=False)
        + 'threshold: ' + str(threshold) + '\t' \
        + 'top_ngram: ' + str(top_ngram)
    print(log_str)
Code example #32
# ---------------------------- Training


# Train bigram model.
if (int(args.order) == 2 and args.save == True) or (int(args.order) >= 2 and args.save == False):
    start = time.time()
    bigram_phrases = Phrases(
        sentences, min_count=args.mincount[0], threshold=args.threshold[0], scoring='npmi')
    bigram = Phraser(bigram_phrases)
    if args.save == True:
        bigram_phrases.save('result/model_' + filename + '_m_' +
                            mincount + '_t_' + threshold + '_bigram_model.pkl')
    score_bigram = sorted(list(set(bigram_phrases.export_phrases(
        sentences))), key=lambda x: x[1], reverse=True)
    train_time = time.time() - start
    ngram = score_bigram
    log_train('bigram', len(sentences), train_time,
              len(ngram), ngram[0][1], ngram[-1][1], mincount, threshold, ngram[:10])

# Train trigram model.
if (int(args.order) == 3 and args.save == True) or (int(args.order) >= 3 and args.save == False):
    start = time.time()
    if args.save == True:
        bigram_phrases = Phrases.load(
            'result/model_' + filename + '_m_' +
            mincount + '_t_' + threshold + '_bigram_model.pkl')
        bigram = Phraser(bigram_phrases)

    trigram_phrases = Phrases(
Code example #33
def process_and_save_worker(
        infile,
        threshold=10,
        min_count=50,
        min_len=5,
        delete_orig=False,
        num_phrasing_rounds=2,
):
    """
    Single threaded worker for the text preprocessing and saving of files.
    Called by process_and_save().

    :param infile: str
        Path to the .txt.bz2 file to be processed.
    :param threshold: float
        The threshold kwarg of Gensim's Phrases() object.
    :param min_count: int
        The min_count kwarg of Gensim's Phrases() object.
    :param min_len: int
        minimum number of words in a post to keep it.  Posts
        shorter than min_len words AFTER PROCESSING are discarded.
    :param delete_orig: bool
        True to delete the original .bz2 file (from Archive.org) after processing,
        False to keep it.  Deleting it can save disk space.
    :param num_phrasing_rounds: int
        Number of phrase-detection passes to run over the processed corpus.
    :return:
    """
    # Grab two temporary files, so we can shunt data between them,
    # processing it while it's in memory.
    suff = infile.replace("\\", "/").split("/")[-1][:-8]
    S = f"{suff:<20s}"
    raw_out = infile\
        .replace("\\", "/") \
        .replace(".txt.bz2", "_raw.txt.bz2") \
        .replace("By Subreddit/FINAL/", "Processed Files/")
    processed_out = infile \
        .replace("\\", "/") \
        .replace(".txt.bz2", "_processed.txt.bz2") \
        .replace("By Subreddit/FINAL/", "Processed Files/")
    working_file = processed_out.replace("_processed.txt.bz2", "_working.txt.bz2")

    # Create a total count variable that we'll use to update tqdm appropriately.
    total = 0
    with bz2.open(infile, "rt", encoding="utf8") as I, \
            bz2.open(raw_out, "wt", encoding="utf8") as R, \
            bz2.open(processed_out, "wt", encoding="utf8") as P:
        for i in tqdm(I, desc=f"{S}: Preprocessing", mininterval=5, position=1):
            # Preprocess, and skip if length is too low.
            text = process_string(literal_eval(i))
            if len(text) < min_len: continue
            text = " ".join(text)
            # write repr() for raw files to ensure one line per post;
            # write the ID as a fixed length string.
            R.write(f"{i.strip()}\n")
            P.write(f"{text}\n")
            total += 1

    # Stream the processed files through a Gensim Phrases() object.
    # If there are any phrases to be found, stream through a Phraser()
    # object and into a temp file.  Then overwrite the original
    # processed file and repeat.
    for i in range(num_phrasing_rounds):
        with bz2.open(processed_out, "rt", encoding="utf8") as IN, \
                bz2.open(working_file, "wt", encoding="utf8") as OUT:
            p = Phrases(
                (i.strip().split()
                 for i in tqdm(
                    IN,
                    total=total,
                    desc=f"{S} Phrase-finding {i+1}",
                    mininterval=5,
                    position=1
                )),
                threshold=threshold,
                min_count=min_count,
                # for some reason I get errors if the delimiter isn't a bytestring
                delimiter=b'_'
            )
            IN.seek(0)
            # See if there were any phrases found.  If not, abort phrasing early.
            try:
                next(p.export_phrases(i.strip().split() for i in IN))
            except StopIteration:
                break
            pp = Phraser(p)
            IN.seek(0)
            for line in tqdm(IN, total=total, desc=f"{S} Applying phraser {i+1}", mininterval=5, position=1):
                OUT.write(f"{' '.join(pp[line.strip().split()])}\n")
        move(working_file, processed_out)

    # Now, do a final pass to filter posts by length again.  This second pass
    # is because the length of a file may have changed considerably after
    # processing.  As before, stream to a temporary working file, then
    # overwrite the original when done.
    raw_working = raw_out.replace(".txt.bz2", "_working.txt.bz2")
    processed_working = processed_out.replace(".txt.bz2", "_working.txt.bz2")
    with bz2.open(raw_out, "rt", encoding="utf8") as RAW_IN, \
        bz2.open(processed_out, "rt", encoding="utf8") as PROC_IN, \
        bz2.open(raw_working, "wt", encoding="utf8") as RAW_OUT,  \
        bz2.open(processed_working, "wt", encoding="utf8") as PROC_OUT:

        for i in zip(PROC_IN, RAW_IN):
            if len(i[0].split()) >= min_len:
                PROC_OUT.write(i[0])
                RAW_OUT.write(i[1])

    move(processed_working, processed_out)
    move(raw_working, raw_out)


    if delete_orig == True:
        os.remove(infile)

    return 0
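A hypothetical invocation of the worker above (the path is illustrative; process_and_save() is the dispatcher the docstring refers to):

process_and_save_worker(
    "By Subreddit/FINAL/AskReddit.txt.bz2",  # hypothetical input path
    threshold=10,
    min_count=50,
    delete_orig=False,
)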
Code example #34
    def _generate_phrase(self, pd_data, load_model=False, section='phrase'):
        """
        (Private) Generate phrase using the gensim Phrase detection module.

        Inputs:
            pd_data: (pd.Series) Data which will be used to generate phase.
            section: (str, optional) Section name of the .ini file.

        Returns:
            pd_data: (pd.Series) Input data but using phrases.
        """
        if not self.configparser.getbool('generate_phrase', section):
            log.info('Skipping phrase generation...')
            return pd_data

        if load_model:
            model_filepath = self.configparser.getstr('phrase_model', section)
            model = Phraser.load(model_filepath)

            # apply phrase model
            log.info('Applying loaded phrase model...')
            pd_data = pd_data.apply(lambda x: model[x], convert_dtype=False)
        else:
            log.info('Generating new phrases...')

            # this is our training data
            sentences = pd_data.tolist()

            # detect phrases using the configuration
            model = Phrases(
                sentences,
                min_count=self.configparser.getint('min_count', section),
                threshold=self.configparser.getfloat('threshold', section),
                max_vocab_size=self.configparser.getint(
                    'max_vocab_size', section),
                progress_per=self.configparser.getint('progress_per', section),
                scoring=self.configparser.getstr('scoring', section))

            # apply trained model to generate phrase
            log.info('Applying phrase model...')
            pd_data = pd_data.apply(lambda x: model[x], convert_dtype=False)

            # save phrase model
            model_filepath = self.configparser.getstr('phrase_model', section)

            log.info('Saving phrase model to \'%s\'...', model_filepath)
            model.save(model_filepath)

            # dump phrase and its score as text
            phrase_score_list = []
            for phrase, score in model.export_phrases(sentences):
                phrase_score_list.append([phrase.decode('utf-8'), score])

            pd_phrase_score = pd.DataFrame(phrase_score_list,
                                           columns=['phrase', 'score'])
            pd_phrase_score.drop_duplicates(subset='phrase', inplace=True)

            export_filepath = self.configparser.getstr('phrase_dump_filename',
                                                       section)

            log.info('Dumping phrases to \'%s\'...', export_filepath)
            pd_phrase_score.to_csv(export_filepath, sep='\t', index=False)

        return pd_data
Code example #35
#     print(bigram[i])


result = []
for i in test:
    result.append(bigram[i])

pickle.dump(result, open('phrases_res_nost.pkl','wb'))


### based on bigram phrases to detect trigram and 4-grams
bg = pickle.load(open('phrases_res_nost.pkl', 'rb'))
phrases2 = Phrases(bg, min_count=10, threshold=0.1, scoring = 'npmi')

phr_score = {}
for phrase, score in phrases2.export_phrases(bg):
#     print(phrase.decode(), score)
    phr_score[phrase.decode()] = score


print('The npmi score of phrases:',
      sorted(phr_score.items(), key=lambda d: d[1], reverse=True))


trigram = Phraser(phrases2)
# for i in bg[:100]:
#     print(trigram[i])
bi_tri_res = []
for i in bg:
    bi_tri_res.append(trigram[i])
Code example #36
number_points = 15

sentences_taken = []
bigramme_taken  = []
time_taken      = []

for i in range(1,number_points+1):
    print(i)
    
    start = time.time()
    
    sentences = list_sentences[0: int(i * (len_total_sentences/number_points))]
    
    bigram_phrases = Phrases(sentences, min_count=1, threshold=10)
    score_bigram = sorted(list(set(bigram_phrases.export_phrases(
    sentences))), key=lambda x: x[1], reverse=True)

    end = time.time()
    time_taken.append(end-start)
    sentences_taken.append(len(sentences))
    bigramme_taken.append(len(score_bigram))

with open("10mil_text_sentences_bigramme_time.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerows(zip(sentences_taken, bigramme_taken, time_taken))

# ===============================================================
# Information for the trigrams

sents = PathLineSentences(PATH_TRAINING,limit = 160000) 
list_sentences = list(sents)
Code example #37
    all_phrases = {}
    phrases = Phrases(DOCUMENTS, **param)
    bi_gram = Phraser(phrases)

    Bi_PHRASES = []

    for doc in DOCUMENTS:
        bi_grams = bi_gram[doc]
        Bi_PHRASES.append(bi_grams)

    # {(10, 10), (15, 20), (20, 10)}
    tri_phrases = Phrases(Bi_PHRASES)

    for phrase, score in tri_phrases.export_phrases(Bi_PHRASES):
        phrase = phrase.decode("utf-8").replace("_", " ")
        if len(phrase.split()) > 2:
            all_phrases[phrase] = score

    results = {
        k: v
        for k, v in sorted(
            all_phrases.items(), key=lambda item: item[1], reverse=True)
    }

    print(f"Model Dumping {index}")
    with open(
            f"models/phrases_ahsan_{param['min_count']}_{param['threshold']}.json",
            "w") as out_json:
        json.dump(results, out_json, ensure_ascii=False, indent=4)
Code example #38
File: phrases.py Project: XsongyangX/ift6285-hw4
def salient_trigrams(phrases: Phrases):
    """Finds the most salient trigrams

    Args:
        phrases (Phrases): Phrases class set up for bigram search
    """
    trigram = Phrases()

    for slice in read_corpus():
        # prepare the bigram
        for previous_slice in read_corpus():
            phrases.add_vocab(read_slice(previous_slice))
            if previous_slice == slice:
                break

        # transform sentences into possible bigrams
        bigram_phraser = Phraser(phrases)

        def bigrammed(slice: str):
            for sent in read_slice(slice):
                yield bigram_phraser[sent]

        trigram.add_vocab(bigrammed(slice))

        # evaluate all previous corpus slices
        found = set()
        total_trigrams_encountered = 0
        for previous_slice in read_corpus():
            for phrase, score in trigram.export_phrases(
                    bigrammed(previous_slice)):
                if phrase.count(b'_') == 2:
                    found.add((phrase, score))
                    total_trigrams_encountered += 1
                elif b'_' in phrase:
                    print(phrase)
            if previous_slice == slice:
                break

        found = sorted(found, key=lambda element: element[1], reverse=True)

        # no trigrams found?
        if len(found) == 0:
            output(slice, "")

        # log the top ten trigrams
        for phrase, score in found[:10]:
            output(slice, "{phrase}, {score}".format(phrase=phrase,
                                                     score=score))

        # log the total counts
        output(
            slice, """
Total trigrams: {total}
Unique trigrams: {unique}
Median score:{median}
Max score:{max}
Min score:{min}
""".format(total=total_trigrams_encountered,
           unique=len(found),
           median=found[len(found) // 2][1] if len(found) != 0 else 0,
           max=found[0][1] if len(found) != 0 else 0,
           min=found[-1][1] if len(found) != 0 else 0))

        # will log a time if command line args were enabled
        Timer.try_to_time()