Code example #1
def main(args):
    sentences = TextNormalizer(LineSentence(args.infile), 
                               args.keep_mixedcase, args.keep_digits, args.keep_punc)

    # build initial bigram phrase model
    model = Phrases(sentences, min_count=5, threshold=10)
    model.save("%sphrase.model" % (args.outdir))
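A minimal sketch of how a model saved this way is loaded and applied later, assuming gensim's Phrases/Phraser API (the path and sample tokens are illustrative):

from gensim.models.phrases import Phrases, Phraser

model = Phrases.load("out/phrase.model")  # hypothetical path
phraser = Phraser(model)                  # smaller, read-only, faster to query
print(phraser[["machine", "learning", "is", "fun"]])  # e.g. ['machine_learning', 'is', 'fun']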
Code example #2
    def build_trigram_model(self, sentences, bigram):
        print("In Trigram Model")
        # run the bigram-transformed stream through Phrases again to learn trigrams
        trigram = Phrases(bigram[sentences])
        dest = self.models + 'trigram_model'
        trigram.save(dest)

        return trigram
Code example #3
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        try:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            bigram_loaded = Phrases.load(
                "test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            seen_scores = []
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'

        finally:
            if os.path.exists(
                    "test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
                os.remove(
                    "test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
Code example #4
def load_shit(file_paths, save_path):
    for i, path in enumerate(file_paths):
        # first iteration
        if i == 0:
            print('[info] initializing phrase model')
            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # initialize phrase model
                phrases = Phrases(reader, delimiter=b" ")

        # every other iteration
        else:
            if i % 1000 == 0:  # report progress every 1000 files
                progress = (i / len(file_paths)) * 100
                print('[info] processed {}% files'.format(round(progress, 1)))

            with open(path) as f:
                reader = ndjson.reader(f)
                reader = extract_text(reader)
                # show the model new data
                phrases.add_vocab(reader)

    # save model after iterations are done; Phrases.save writes the file itself,
    # so no separate open() is needed
    phrases.save(save_path)
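A hedged guess at the extract_text helper used above, which is not shown in this snippet; the 'text' field name is an assumption:

def extract_text(reader):
    # hypothetical stand-in: pull a 'text' field from each ndjson record and tokenize it
    for record in reader:
        yield record.get('text', '').split()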
Code example #5
    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phrases"""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            del (bigram.scoring)
            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            bigram_loaded = Phrases.load(
                "test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            seen_scores = set()
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists(
                    "test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
Code example #6
def build_phrase_model():
    global review_df
    ### Trigram phrase model.  Fed back into the phrases for MWETokenizer
    bigram = Phrases(review_df.review_pp1, min_count=1, threshold=1)
    bigram_phraser = Phraser(bigram)
    trigram = Phrases(bigram_phraser[review_df.review_pp1])
    trigram.save(PHRASE_MODEL_LOC)
    return trigram
Code example #7
File: step1.3_phrase_SO.py  Project: DunZhang/SEDict
def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """

    :param g_DataQueue:全局变量存放数据库中的数据
    :param g_FinishRead:是否读取完数据库的标志
    :param savePath:短语学习器保存的位置
    :param priorPhrasePath:前一个学习器保存的位置
    :return:
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if (priorPhrasePath is None):
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)
        words = []
        reSub0 = re.compile(
            "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
        )  # URL
    reSub1 = re.compile(
        "[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")  # replace with " "
    reSub2 = re.compile(
        "'[.?;!]")  # replace with "."; mainly handles possessives and the tricky cases around single quotes
    reSplit1 = re.compile(r"\.[^a-z0-9]|[?!;]")
    # extract words
        for t in data:
            if (t[0] is not None):
                st = re.sub(reSub0, " ", t[0].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
            if (t[1] is not None):
                st = re.sub(reSub0, " ", t[1].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
        del data
        gc.collect()
        # train phrases
        if (priorPhraser is None):  # first training pass
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        # print(len(phrase.vocab))
        gc.collect()
    phrase.save(savePath)
Code example #8
def extract_phrases(app_files, bigram_min, trigram_min):
    rst = build_input(app_files)
    gen = list(itertools.chain.from_iterable(rst))  # flatten the list

    bigram = Phrases(gen, threshold=6, min_count=bigram_min)
    trigram = Phrases(bigram[gen], threshold=4, min_count=trigram_min)

    # `app` is not defined in this snippet; presumably a module-level global
    bigram.save('model/%s_bigram_model.pkl' % (app))
    trigram.save('model/%s_trigram_model.pkl' % (app))
Code example #9
def train_phraser(sentence_stream, stopword_list, threshold, model_path,
                  save_prefix):
    phrases_model = Phrases(sentence_stream,
                            common_terms=stopword_list,
                            threshold=threshold)
    phrases_model.save(
        os.path.join(model_path, '{}_phrases.bin'.format(save_prefix)))
    phraser_model = Phraser(phrases_model)
    phraser_model.save(
        os.path.join(model_path, '{}_phraser.bin'.format(save_prefix)))
    return phraser_model
Code example #10
File: extract_phrase.py  Project: yttty/tour
def extract_phrases(app_files, bigram_min, trigram_min):
    bigram_fp = os.path.join("model", "bigram.model")
    trigram_fp = os.path.join("model", "trigram.model")

    rst = build_input(app_files)
    gen = list(itertools.chain.from_iterable(rst))  # flatten
    bigram = Phrases(gen, threshold=5, min_count=bigram_min)
    trigram = Phrases(bigram[gen], threshold=3, min_count=trigram_min)
    # write
    bigram.save(bigram_fp)
    trigram.save(trigram_fp)
Code example #11
    def test_save_load_with_connector_words(self):
        """Test saving and loading a Phrases object."""
        connector_words = frozenset({'of'})
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         connector_words=connector_words)
        with temporary_file("test.pkl") as fpath:
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

        assert bigram_loaded.connector_words == connector_words
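For context, a hedged sketch of what connector_words changes: connector tokens such as 'of' may occur inside a detected phrase without breaking it (gensim 4 also ships a ready-made ENGLISH_CONNECTOR_WORDS set). The toy corpus and thresholds below are made up:

from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

toy = [["bank", "of", "america"]] * 20
model = Phrases(toy, min_count=1, threshold=0.1,
                connector_words=ENGLISH_CONNECTOR_WORDS)
print(model[["bank", "of", "america"]])  # e.g. ['bank_of_america']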
Code example #12
def generating_bigrams(final_df):
    eligibility_criteria = final_df['features']
    bigrams_input = [each_row.split() for each_row in eligibility_criteria]
    bigram_transformer = Phrases(bigrams_input, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=4)

    fd = open("bigrams.txt", 'a')
    for phrase, score in bigram_transformer.export_phrases(bigrams_input):
        fd.write(u'{0}   {1}'.format(phrase, score))
    fd.close()

    return bigram_transformer
Code example #13
def make_phraser(infile):
    """
    Train the phraser object and save it.
    :param infile: path to xml file with the wikipedia dump
    :return:
    """
    p = Phrases(
        tqdm((i.split() for i in file_yielder(infile)), desc="Phrase-finding"))
    p = Phraser(p)
    p.save("../models/phraser")

    return 0
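Note that this saves the lightweight Phraser, not the full Phrases model, so it is loaded with Phraser.load and cannot be trained further. A minimal sketch, reusing the same relative path:

from gensim.models.phrases import Phraser

p = Phraser.load("../models/phraser")  # read-only: supports p[tokens], not add_vocab()
print(p["new york city is large".split()])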
Code example #14
class GramFacade:
    def __init__(self, model_dir, min_count_bigrams=8, min_count_trigrams=7):
        self.model_dir = model_dir
        self.min_count_bigrams = min_count_bigrams
        self.min_count_trigrams = min_count_trigrams

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list,
                                       min_count=self.min_count_bigrams)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                        min_count=self.min_count_trigrams)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set([
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold
        ])
        return word_not_in_doc
Code example #15
File: test_phrases.py  Project: lopusz/gensim
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = []
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
Code example #16
File: data.py  Project: smottahedi/toxic_comment
def build_vocab():
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_comments.txt')

    if not os.path.isdir(config.PROCESSED_PATH):
        try:
            os.mkdir(config.PROCESSED_PATH)
        except OSError:
            pass

    vocab = {}

    train_df = read_file(train_path)
    test_df = read_file(test_path)
    print('tokenizing vocab file')
    texts = np.concatenate([train_df.comment_text.fillna('N/A').values,
                            test_df.comment_text.fillna('N/A').values])

    with open(normalized_text_path, 'w') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')
    gc.collect()
    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)

    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')

    comments = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(comments)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    bigram_dict.add_documents([['<pad>']])

    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        f.write('VOCAB_SIZE = {}\n'.format(len(bigram_dict)))

    print('time passed: {} minutes'.format((time.time() - start) / 60))
Code example #17
def trainPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    count = 0
    phrase = Phrases(None, min_count=15, threshold=10, max_vocab_size=40000000)
    if (priorPhrasePath is None):
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        words = g_DataQueue.get()
        if (priorPhraser is None):  # first training pass
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        gc.collect()
    phrase.save(savePath)
Code example #18
File: test_phrases.py  Project: lopusz/gensim
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])
Code example #19
def get_bigrams(df, bigram_model_filepath, TRAIN):
    if TRAIN:  # train phrase model
        # Train the phrase model using the processed sentences (a list of list of strings)
        sentences_unigrams = df["processed_text"].tolist()
        bigram_phrase_model = Phrases(sentences_unigrams)
        # Use the Phraser function to turn the phrase model into a "Phraser" object,
        # which is optimized for speed and memory use
        bigram_phrase_model = Phraser(bigram_phrase_model)
        # Save the model for future use
        bigram_phrase_model.save(bigram_model_filepath)
    else:
        # Load the trained model from disk
        bigram_phrase_model = Phraser.load(bigram_model_filepath)

    # Get the first-order transformed data
    df["bigrams"] = df["processed_text"].map(lambda x: bigram_phrase_model[x])
Code example #20
def get_trigrams(df, trigram_model_filepath, TRAIN):
    if TRAIN:  # train phrase model
        # Train the phrase model using the bigram sentences (a list of list of strings)
        sentences_bigrams = df["bigrams"].tolist()
        trigram_phrase_model = Phrases(sentences_bigrams)
        # Use the Phraser function to turn the phrase model into a "Phraser" object,
        # which is optimized for speed and memory use
        trigram_phrase_model = Phraser(trigram_phrase_model)
        # Save the model for future use
        trigram_phrase_model.save(trigram_model_filepath)
    else:
        # Load the trained model from disk
        trigram_phrase_model = Phraser.load(trigram_model_filepath)

    # Get the second-order transformed data
    df["trigrams"] = df["bigrams"].map(lambda x: trigram_phrase_model[x])
Code example #21
    def testSaveLoad(self):
        """Test saving and loading a Phrases object."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            seen_scores = set(
                round(score, 3) for score in bigram_loaded.find_phrases(
                    test_sentences).values())

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])
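The test variants in this collection span two gensim APIs: in gensim < 4.0, export_phrases(sentences) yields (bytes_phrase, score) pairs, while in gensim >= 4.0 the same query is find_phrases(sentences), which returns a {phrase: score} dict (and export_phrases() takes no arguments, returning every phrase the model has learned). A sketch of the 4.x calls, assuming `bigram` is a trained Phrases model as above:

scores = bigram.find_phrases(test_sentences)  # {'graph_minors': 5.167, ...}
all_known = bigram.export_phrases()           # all phrases the model knows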
Code example #22
File: test_phrases.py  Project: vishalbelsare/gensim
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            seen_scores = []
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'

        finally:
            if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
Code example #23
def create_dictionary(texts,
                      dest_file: str,
                      build_bigram,
                      working_directory=DIR):
    """
    Reads the file specified by source_file, creates a dictionary and saves it to the dest_file
    path.
    :param working_directory: The path to the directory where the bigram model files should be saved.
    :param build_bigram: 1 if building a new phrases object is needed else an already processed bigram model will
                         be loaded.
    :param source_file: path to source text file.
    :param dest_file: path to save dictionary to.
    :return:
    """
    # collect statistics about all tokens
    stoplist = stopwords.words('english')
    if build_bigram:
        bigram = Phrases([tweet.split() for tweet in texts])
        bigram.save(working_directory + '/bigram_model.phrase')
    else:
        bigram = Phrases.load(working_directory + '/bigram_model.phrase')
    phraser = Phraser(bigram)
    # Build dictionary
    dictionary = corpora.Dictionary(phraser[line.lower().split()]
                                    for line in texts)
    # remove stop words and words that appear only once
    stop_ids = [
        dictionary.token2id[stopword] for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
        if docfreq == 1
    ]
    dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words that appear only once
    # NOTE: no_below expects an absolute document count (an int), so 0.3 effectively
    # disables that filter; no_above is a fraction of the corpus
    dictionary.filter_extremes(no_below=0.3, no_above=0.85)
    dictionary.compactify()  # remove gaps in id sequence after words that were removed
    dictionary.save(dest_file)
    print(dictionary)
    print(dictionary.token2id)
    return dictionary
Code example #24
    def testSaveLoadCustomScorer(self):
        """Test saving and loading a Phrases object with a custom scorer."""
        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            seen_scores = list(
                bigram_loaded.find_phrases(test_sentences).values())

            assert all(score == 1 for score in seen_scores)
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'
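For reference, a custom scorer such as dumb_scorer must accept gensim's six scoring arguments and return a float; a plausible reconstruction (the helper is not shown in these snippets) is:

def dumb_scorer(worda_count, wordb_count, bigram_count,
                len_vocab, min_count, corpus_word_count):
    # scoring every candidate bigram as a constant 1 lets everything
    # pass the threshold=.001 used in these tests
    return 1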
Code example #25
File: test_phrases.py  Project: vishalbelsare/gensim
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoad_temp_save.pkl")
Code example #26
def extract_phrases(reviews_sents, reviews_docs, save=False):
    logging.info("Extracting phrases...")
    bigram = Phrases(reviews_sents, threshold=5, min_count=5)
    trigram = Phrases(bigram[reviews_sents], threshold=3, min_count=3)
    if save:
        with open('../data/phrase/phrases_%d_%s' % (3, 'app_review'), 'wb') as fout:
            ph_dic = {}
            for phrase, score in bigram.export_phrases(reviews_sents):
                ph_dic[phrase] = score
            for phrase, score in trigram.export_phrases(bigram[reviews_sents]):
                ph_dic[phrase] = score
            for phrase, score in ph_dic.items():
                if re.search(rb'\d+', phrase):  # remove digits (phrases are bytes here)
                    continue
                phrase = b"_".join(phrase.split(b' '))
                fout.write(phrase + b'\n')
        bigram.save("../model/bigram.model")
        trigram.save("../model/trigram.model")

    return trigram[bigram[reviews_docs]]
Code example #27
def build_phrase_models(content, base_path, settings):
    """ Build and save the phrase models
    """

    ngram_level = int(settings['level'])

    # According to tee() docs, this may be inefficient in terms of memory.
    # We need to do this because we need multiple passes through the
    # content stream.
    content = chain.from_iterable(doc.tokenized_text for doc in content)
    cs1, cs2 = tee(content, 2)

    for i in range(ngram_level - 1):
        phrases = Phrases(cs1)
        path = "%s.%s" % (base_path, i + 2)  # save path as n-gram level
        logger.info("Phrase processor: Saving %s", path)
        phrases.save(path)
        # TODO: gensim complains about not using Phraser(phrases)
        content = phrases[cs2]  # tokenize phrases in content stream
        cs1, cs2 = tee(content, 2)
Code example #28
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = []
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'
Code example #29
File: test_phrases.py  Project: vishalbelsare/gensim
    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phrases"""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            del(bigram.scoring)
            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
Code example #30
    def get_trigram_model(self, recalculate=False, from_scratch=True):

        if not os.path.isfile(
                self.paths.trigram_model_filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No trigram model file exists but from_scratch is False')

            print('Building tri-gram model...')
            bigram_sentences = LineSentence(
                self.paths.bigram_sentences_filepath)
            trigram_model = Phrases(bigram_sentences)
            trigram_model = Phraser(trigram_model)
            print('Writing model...')
            trigram_model.save(self.paths.trigram_model_filepath)
        else:
            print('Loading tri-gram model...')
            trigram_model = Phraser.load(self.paths.trigram_model_filepath)  # the saved object is a Phraser

        print('Done!')
        return trigram_model
Code example #31
    def train_ngrams_models(self, sent_tokens):
        """
        Train bigram and trigram models plus a dictionary, and save them as cached models.
        :param sent_tokens: tokenized sentences from the concatenated complete dataframe
        """
        bigrams = Phrases(sentences=sent_tokens, min_count=1, threshold=1)
        trigrams = Phrases(sentences=bigrams[sent_tokens],
                           min_count=1,
                           threshold=1)
        sent_tokens_transformed = trigrams[bigrams[sent_tokens]]
        d = corpora.Dictionary(sent_tokens_transformed)
        bow_corpus = [
            d.doc2bow(sent_tokens) for sent_tokens in sent_tokens_transformed
        ]
        tfidf = TfidfModel(corpus=bow_corpus, id2word=d)
        try:
            bigrams.save('slm/app/cached_models/bigrams.gensim')
            trigrams.save('slm/app/cached_models/trigrams.gensim')
            d.save('slm/app/cached_models/dictionary.dict')
            tfidf.save('slm/app/cached_models/tfidf.gensim')
        except Exception:
            pass  # save failures are silently ignored here
Code example #32
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
            bigram_loaded = Phrases.load(
                "test_phrases_testSaveLoad_temp_save.pkl")
            seen_scores = set()
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoad_temp_save.pkl")
Code example #33
    def get_bigram_model(self, recalculate=False, from_scratch=True):

        if not os.path.isfile(self.paths.bigram_model_filepath) or recalculate:

            if not from_scratch:
                raise ValueError(
                    'No bigram model file exists but from_scratch is False')

            print('Building bi-gram model...')
            unigram_sentences = LineSentence(
                self.paths.unigram_sentences_filepath)
            bigram_model = Phrases(
                unigram_sentences
            )  # TODO look into supplying stop words here for better phrases
            bigram_model = Phraser(bigram_model)
            print('Writing model...')
            bigram_model.save(self.paths.bigram_model_filepath)
        else:
            print('Loading bi-gram model...')
            bigram_model = Phraser.load(self.paths.bigram_model_filepath)  # the saved object is a Phraser

        print('Done!')
        return bigram_model
Code example #34
    def fit(self, sentencesPath):
        """
        Train the phrase models.
        :param sentencesPath: path to a text file with one sentence per line
        """
        self.phrasers = []
        # path checks
        for path in self.savePhraserPaths:
            if not os.path.exists(os.path.dirname(path)):
                raise FileNotFoundError(os.path.dirname(path) + " does not exist")
        for path in self.savePhraserPaths:
            if not os.path.exists(path):  # need to train
                self.phrasers = None
                break
        if self.phrasers is not None and not self.file_overwrite:
            logging.info("models already exist, will read them")
            for path in self.savePhraserPaths:
                self.phrasers.append(Phraser(Phrases.load(path)))
            return True
        self.phrasers = []
        c = 2
        for path in self.savePhraserPaths:
            logging.info("getting %d-gram phrase......" % c)
            c += 1
            phrase = Phrases(sentences=TxtIter(sentences=codecs.open(sentencesPath, mode="r", encoding="utf-8"),
                                               ngrams=self.phrasers),
                             min_count=self.min_count,
                             threshold=self.threshold,
                             max_vocab_size=self.max_vocab_size,
                             delimiter=self.delimiter,
                             scoring=self.scoring)
            phrase.save(path)
            phraser = Phraser(phrase)
            self.phrasers.append(phraser)
            del phrase
Code example #35
File: phrase.py  Project: hujiewang/research
from gensim.models.phrases import Phrases
from gensim.models.word2vec import LineSentence

sentence_stream=LineSentence('./data/text_cleaned.txt')
bigram = Phrases(sentence_stream,threshold=50.0)
bigram.save('./data/bigram.dat')
trigram = Phrases(bigram[sentence_stream],threshold=50.0)
trigram.save('./data/trigram.dat')
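Applying the stacked models later mirrors the training order; a brief sketch using the paths above (the sample sentence is illustrative):

from gensim.models.phrases import Phrases

bigram = Phrases.load('./data/bigram.dat')
trigram = Phrases.load('./data/trigram.dat')
tokens = 'new york stock exchange'.split()
print(trigram[bigram[tokens]])  # bigram pass first, then the trigram pass on its output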

Code example #36
File: gram_facade.py  Project: diegoami/bankdomain_PY
class GramFacade:
    def __init__(self,
                 model_dir,
                 bigrams_threshold=0.88,
                 trigrams_threshold=0.88):
        self.model_dir = model_dir
        self.bigrams_threshold = bigrams_threshold
        self.trigrams_threshold = trigrams_threshold

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list,
                                       scoring='npmi',
                                       threshold=self.bigrams_threshold)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                        scoring='npmi',
                                        threshold=self.trigrams_threshold)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set([
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold
        ])
        return word_not_in_doc

    def retrieve_grams(self):
        pgrams = self.trigrams_phraser.phrasegrams
        gram_list = []
        for word, values in pgrams.items():
            gram = b'_'.join(word)
            count, score = values[0], values[1]
            gram_list.append({
                "gram": gram.decode("utf-8"),
                "count": count,
                "score": score
            })
        gram_sorted = sorted(gram_list, key=lambda x: x["score"], reverse=True)
        return gram_sorted
Code example #37
seg = Segmenter()
# vocab = Dictionary()
phrases = Phrases()

text_path = sys.argv[1]

def get_data(text_path):

    for line in open(text_path, "r"):
        line = line.strip()

        if line:
            data = json.loads(line)

            yield data['abstract']

for ind, text in enumerate(get_data(text_path)):
    segments = seg(text, segment_len=1, segment_overlap=0)

    phrases.add_vocab(segments)
    # vocab.add_documents(segments, prune_at=2000000)

    # report progress every 10,000 abstracts
    if ind % 10000 == 0:
        print(f"\rProcessed: {ind}", end="")

# vocab.filter_extremes(no_below=5, no_above=0.5, keep_n=2000000)
# vocab.save("academic.dict")

phrases.save("academic.phrases")
Code example #38
    def build_bigram_model(self, sentences, count):
        print("In Bigram Model")
        bigram = Phrases(sentences, min_count=count)
        dest = self.models + 'bigram_model'
        bigram.save(dest)
        return bigram
Code example #39
class PmiPhraseDetector(object):
    """
    Detection using Pointwise Mutual Information (PMI)
    """
    def __init__(self, sentences, filename=None):

        # model parameters
        self.sentences = sentences
        self.dataset = "CASEREPORT"
        self.tokenizer = "RAW"
        self.prune_stopwords = stopwords("pubmed")
        self.phrases = None
        self.threshold = 250
        self.decay = 2
        self.bigram_iter = 3

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        if filename is None:
            filename = "PHRASE_%s_%s_%s_%s" % (self.threshold, self.decay, self.dataset, self.tokenizer, )
        self.filepath = os.path.join(models_folder, filename)

        # does identical model already exists?
        model_exists = os.path.isfile(self.filepath)
        if model_exists:
            logging.info("LOADING - loading phrase data..")
            self.phrases = Phrases.load(self.filepath)
        else:
            logging.info("CREATE - creating phrase data..")
            self.build()

    def build(self):
        self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold)
        # run additional merge rounds
        for i in range(2, self.bigram_iter + 1):
            self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold*(1.0/self.decay)**(i-1))
        # prune phrases
        self.prune()
        # save model to file
        self.save()

    def save(self):
        self.phrases.save(self.filepath)

    def prune(self, min_reduce=1):
        """
        Remove phrases beginning or ending with a stopword.
        Also removes phrases appearing less frequently than a threshold.
        :param min_reduce: frequency threshold
        """
        multiword_phrases = [phrase for phrase in self.phrases.vocab if "_" in phrase]
        for phrase in multiword_phrases:
            words = phrase.split("_")
            first_word, last_word = words[0], words[-1]
            if first_word in self.prune_stopwords or last_word in self.prune_stopwords:
                del self.phrases.vocab[phrase]

        prune_vocab(self.phrases.vocab, min_reduce)

    def detect(self, sentence):
        return self.phrases[sentence]

    def print_phrases(self, threshold=100):
        for word in self.phrases.vocab:
            if "_" in word and self.phrases.vocab[word] > threshold:
                print(word, self.phrases.vocab[word])