Code example #1
File: test_phrases.py Project: zwytop/gensim
    def setUp(self):
        """Set up Phraser models for the tests."""
        bigram_phrases = Phrases(sentences, min_count=1, threshold=1)
        self.bigram = Phraser(bigram_phrases)

        bigram_default_phrases = Phrases(sentences)
        self.bigram_default = Phraser(bigram_default_phrases)

        bigram_utf8_phrases = Phrases(sentences, min_count=1, threshold=1)
        self.bigram_utf8 = Phraser(bigram_utf8_phrases)

        bigram_unicode_phrases = Phrases(unicode_sentences,
                                         min_count=1,
                                         threshold=1)
        self.bigram_unicode = Phraser(bigram_unicode_phrases)
Code example #2
 def setUp(self):
     self.bigram = Phrases(self.sentences,
                           min_count=1,
                           threshold=1,
                           common_terms=self.common_terms)
     self.bigram_default = Phrases(self.sentences,
                                   common_terms=self.common_terms)
     self.bigram_utf8 = Phrases(self.sentences,
                                min_count=1,
                                threshold=1,
                                common_terms=self.common_terms)
     self.bigram_unicode = Phrases(self.unicode_sentences,
                                   min_count=1,
                                   threshold=1,
                                   common_terms=self.common_terms)
Code example #3
File: extract_phrase.py Project: armor-ai/MERIT
def extract_phrases(app_files, bigram_min, trigram_min):
    bigram_fp = os.path.join("..", "model", "bigram.model")
    trigram_fp = os.path.join("..", "model", "trigram.model")

    rst = build_input(app_files)
    gen = list(itertools.chain.from_iterable(rst))  # flatten

    bigram = Phrases(gen, threshold=5, min_count=bigram_min)
    trigram = Phrases(bigram[gen], threshold=3, min_count=trigram_min)

    w2v_model = Word2Vec(trigram[bigram[gen]], min_count=1, size=200)
    # write
    bigram.save(bigram_fp)
    trigram.save(trigram_fp)
    return w2v_model
Code example #4
def build_ngrams(df, min_count=5, threshold=2):
    """
    This function builds bigram and ngrams.
    Please don't modify, it may explode.
    """

    print("Building Bigrams")
    phrases = Phrases(tqdm(df.clean), min_count=min_count, threshold=threshold)
    bigrams = Phraser(phrases)  # Phrases -> Phraser: lighter/faster object, but can't be updated
    df['bigrams'] = df.clean.progress_apply(lambda r: bigrams[r])

    print("Building Ngrams")
    phrases_2 = Phrases(tqdm(df.bigrams), min_count=min_count, threshold=threshold)
    ngrams = Phraser(phrases_2)
    df['ngrams'] = df.bigrams.progress_apply(lambda r: ngrams[r])  # apply to the bigrammed tokens so merged bigrams can extend into longer n-grams
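
A minimal sketch (not taken from any of the projects above; the toy sentences are assumptions) illustrating the Phrases -> Phraser trade-off noted in the comment above: a Phrases model can still be updated with new sentences, while the exported Phraser is a frozen snapshot that is lighter and faster to apply.

from gensim.models.phrases import Phrases, Phraser

toy_sentences = [["machine", "learning", "is", "fun"],
                 ["machine", "learning", "rocks"]]
phrases = Phrases(toy_sentences, min_count=1, threshold=1)
phrases.add_vocab([["machine", "learning", "again"]])  # the Phrases model can keep learning
bigram = Phraser(phrases)                              # frozen: smaller and faster, but not updatable
print(bigram[["machine", "learning", "is", "fun"]])    # e.g. ['machine_learning', 'is', 'fun']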
Code example #5
 def create_model(self, doc_list):
     self.bigrams_phrases = Phrases(doc_list,
                                    min_count=self.min_count_bigrams)
     self.bigrams_phraser = Phraser(self.bigrams_phrases)
     self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                     min_count=self.min_count_trigrams)
     self.trigrams_phraser = Phraser(self.trigrams_phrases)
     self.bigrams_phraser.save(self.model_dir + '/' +
                               BIGRAMS_PHRASER_FILENAME)
     self.trigrams_phraser.save(self.model_dir + '/' +
                                TRIGRAMS_PHRASER_FILENAME)
     self.bigrams_phrases.save(self.model_dir + '/' +
                               BIGRAMS_PHRASES_FILENAME)
     self.trigrams_phrases.save(self.model_dir + '/' +
                                TRIGRAMS_PHRASES_FILENAME)
Code example #6
 def init_phraser(self, components=False, **kwargs):
     sentences = LineSentence(self.path + 'sentences.txt.gz')
     phrases = Phrases(sentences, **kwargs)
     self.phraser = GensimPhraser(phrases)
     self.phraser.components = components
     self.phraser.save(self.path + 'phraser.pkl')
     del phrases
Code example #7
def bigrammer(source_file,
              outfile,
              mincount=100,
              threshold=0.99,
              scoring='npmi',
              commonfile='common_tagged.txt'):
    """
    :param source_file:
    :param outfile:
    :param mincount:
    :param threshold:
    :param scoring:
    :param commonfile:
    :return:
    """
    common = set([word.strip() for word in open(commonfile, 'r').readlines()])
    data = LineSentence(source_file)
    bigram_transformer = Phrases(sentences=data,
                                 min_count=mincount,
                                 threshold=threshold,
                                 scoring=scoring,
                                 max_vocab_size=400000000,
                                 delimiter=b':::',
                                 progress_per=100000,
                                 common_terms=common)
    bigrams = Phraser(bigram_transformer)
    tempfile = smart_open(outfile, 'a')
    print('Writing bigrammed text to %s' % outfile, file=sys.stderr)
    for i in bigrams[data]:
        tempfile.write(' '.join(i) + '\n')
    tempfile.close()
    return len(bigrams.phrasegrams)
Code example #8
 def update_namespaces(self, project_id, log_words):
     all_words = self.minio_client.get_project_object(
         project_id, "project_log_unique_words")
     for word in log_words:
         all_words[word] = 1
     self.minio_client.put_project_object(all_words, project_id,
                                          "project_log_unique_words")
     phrases = Phrases([w.split(".") for w in all_words],
                       min_count=1,
                       threshold=1)
     potential_project_namespaces = {}
     for word in all_words:
         potential_namespace = phrases[word.split(".")][0]
         if "_" not in potential_namespace:
             continue
         if potential_namespace not in potential_project_namespaces:
             potential_project_namespaces[potential_namespace] = 0
         potential_project_namespaces[potential_namespace] += 1
     chosen_namespaces = {}
     for item, cnt in potential_project_namespaces.items():
         if cnt > 10:
             chosen_namespaces[item.replace("_", ".")] = cnt
     logger.debug("Chosen namespaces %s", chosen_namespaces)
     self.minio_client.put_project_object(chosen_namespaces, project_id,
                                          "chosen_namespaces")
Code example #9
 def __init__(self, df):
     self.sent = df.tolist()
     self.phrases = Phrases(self.sent, min_count=30, threshold=1)
     self.bigram = Phraser(self.phrases)
     self.sentences = self.bigram[self.sent]
     self.w2v_model = Word2Vec(min_count=30, window=3, size=252, sample=6e-5, alpha=0.01,   # sample=1e-5
                               min_alpha=0.0005, negative=5, workers=multiprocessing.cpu_count()-1)
Code example #10
File: mining.py Project: kemalcanbora/textometrics
def w2v():
    import multiprocessing
    cores = multiprocessing.cpu_count()

    txt_list = []
    df = pd.read_csv("bibs.csv")
    for doc in df["Abs"]:
        txt_list.append(cleaning(doc))

    df["clean"] = txt_list

    sent = [row.split() for row in df['clean']]
    phrases = Phrases(sent, min_count=10, progress_per=10)
    bigram = Phraser(phrases)
    sentences = bigram[sent]
    w2v_model = Word2Vec(min_count=4,
                         window=5,
                         size=10,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=4,
                         workers=cores - 1)
    w2v_model.build_vocab(sentences, progress_per=10)
    #aki = Acute Kidney Injury
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)
    w2v_model.save("word2vec.model")
    w2v_model.init_sims(replace=True)

    res = w2v_model.wv.most_similar(positive=["surgical"])
    pprint(res)
Code example #11
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        try:
            bigram = Phrases(self.sentences,
                             min_count=1,
                             threshold=.001,
                             scoring=dumb_scorer)
            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            bigram_loaded = Phrases.load(
                "test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            seen_scores = []
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(
                seen_scores
            ) == 3  # 'graph minors' and 'survey human' and 'interface system'

        finally:
            if os.path.exists(
                    "test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
                os.remove(
                    "test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
Code example #12
File: TCmodel.py Project: n1ck404/Topic-Cluster
 def _phrase(self, token):
     bigram = Phrases(token, min_count=5, threshold=100)
     bigram_mod = Phraser(bigram)
     # trigram = Phrases(bigram_mod[token],min_count=5,threshold=100)
     # trigram_mod = Phraser(trigram)
     # return [trigram_mod[bigram_mod[doc]] for doc in token]
     return [bigram_mod[doc] for doc in token]
Code example #13
    def deserialize(self, type, name, language='en'):
    
        serializer = self.serializers[type]
        
        if type != "lda_model":
            with codecs.open(name, "r", encoding = "utf-8") as f:
                data = json.load(f)
         
        elif type == "lda_model":
            with open(name, "rb") as f:
                data = pickle.load(f)
            
        deserialized = serializer(data).deserialize()
        
        if type == "phrases":
            if language == 'en':
                common_terms = self.function_words_single
            else:
                common_terms = safe_get_stop_words(language)

            phrases = Phrases(delimiter="_", connector_words=common_terms)
            phrases.phrasegrams = deserialized
            deserialized = phrases        
        
        return deserialized
Code example #14
 def prepareDocs(self, phrases=1):
     preppedDocs = []
     # Clean
     for i, doc in enumerate(self.uncleanDocList):
         cleanedDoc = ftfy.fix_text(doc, normalization='NFKC')
         cleanedDoc = cleanedDoc.replace('?', ' ')
         cleanedDoc = ' '.join(cleanedDoc.splitlines())
         cleanedDoc = re.sub(r'http\S+', '', cleanedDoc)
         cleanedDoc = re.sub(r'https\S+', '', cleanedDoc)
         translator = str.maketrans(punctuation, ' ' * len(punctuation))
         cleanedDoc = cleanedDoc.translate(translator)
         cleanedDoc = cleanedDoc.translate({ord(k): None for k in digits})
         cleanedDoc = cleanedDoc.lower()
         cleanedDoc = ' '.join(cleanedDoc.split())
         preppedDocs.append(cleanedDoc)
         print('%d of %d documents cleaned.' %
               (i + 1, len(self.uncleanDocList)))
     # Detect phrases (optional)
     if phrases is not None:
         print('Phrase detection requested. Running...')
         tokenizedDocs = []
         for doc in preppedDocs:
             tokenizedDocs.append(doc.split())
         bigrammer = Phrases(tokenizedDocs)
         preppedDocs = []
         for tokdoc in tokenizedDocs:
             preppedDocs.append(' '.join(bigrammer[tokdoc]))
         print('Documents are now phrase-collocated.')
     # Save prepared documents to class instance
     self.preppedDocList = preppedDocs
Code example #15
 def _train_phraser(self, min_count, phrase_threshold, delimiter):
     print("Training collocation detector...")
     return Phraser(
         Phrases(self.line_iterator,
                 min_count=min_count,
                 threshold=phrase_threshold,
                 delimiter=delimiter))
Code example #16
def make_trigrams(bigram_sentences: list):
    from gensim.models.phrases import Phrases
    trigram_model = Phrases(bigram_sentences, threshold=40)
    results = []
    for doc in bigram_sentences:
        results.append(trigram_model[doc])
    return results
Code example #17
File: preprocessing.py Project: dunovank/nlp-stuff
 def __call__(self, docs):
     phrases = Phrases(docs, min_count=10)
     bigram = self.phraser(phrases)
     p = Pool(cores)
     docs = p.starmap(self.append_bigram, zip(docs, [bigram] * len(docs)))
     p.close()
     return docs
Code example #18
def advb_bigram_detect(sentences):
    # first build the list of maintenance words_by_alphebat
    list_of_adverb = Utility.read_words_file_into_list(
        save_folder_name + "/List_of_advb.txt", 1)

    phrases = Phrases(
        sentences,
        max_vocab_size=max_vocab_size,
        min_count=bigram_minimum_count_threshold,
        threshold=threshold,
        delimiter=delimiter,
        progress_per=progress_per
    )  # use # as delimiter to distinguish from ~ used in previous stages

    with open(save_folder_name + '/' + 'advb_bigram.txt',
              "w") as bigram_2_file:
        c = 1
        for key in phrases.vocab.keys():
            a = key.decode()
            a = a.split("#")
            if len(a) > 1:
                flag = False
                flag2 = False
                for w in a:
                    if w in list_of_adverb:
                        flag = True
                    if len(w) > 4 and w[-3:] == 'ing':
                        flag2 = True

                if flag and flag2:
                    s = key.decode()
                    print('{0}\t\t{1:<10}'.format(c, s), file=bigram_2_file)
                    c += 1

    logger.info("PROGRESS: Finished advb_bigram_detect")
Code example #19
def createEmbeddingSpace(filename):
    # you need to remake key common phrases...
    # "new york" should really be "new_york" as a single token, since "new york" together
    # means something different from "new" and "york" used separately

    # https://stackoverflow.com/questions/35716121/how-to-extract-phrases-from-corpus-using-gensim

    #sentencesAll = []
    with open(filename, 'r') as f:

        sentencesAll = [line.split(" ") for line in f if line != None]

    #takes about ~10 min
    random.shuffle(sentencesAll)

    phrases = Phrases(sentencesAll,
                      min_count=1,
                      threshold=2,
                      progress_per=10000)
    bigram = Phraser(phrases)
    sentences = bigram[sentencesAll]

    print(len(sentences))  #15,786,808
    print(sentences[0])

    # Building and Training the Model
    cores = multiprocessing.cpu_count()

    # I removed min_count... not sure how to tell which words end up unused
    w2v_model = Word2Vec(window=6,
                         size=100,
                         sample=6e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=cores - 1)

    t = time()

    w2v_model.build_vocab(sentences, progress_per=10000)

    print('Time to build vocab: {} mins'.format(round((time() - t) / 60,
                                                      2)))  #6.71 mins

    t = time()

    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)

    print('Time to train the model: {} mins'.format(round((time() - t) / 60,
                                                          2)))

    print("Sentence[0]: in embedding Model {}".format(sentences[0]))
    print("Sentence[1]: in embedding Model {}".format(sentences[1]))
    print("Similarity is: {}".format(
        w2v_model.wv.wmdistance(sentences[0], sentences[1])))

    return w2v_model
Code example #20
    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phrases"""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            del (bigram.scoring)
            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            bigram_loaded = Phrases.load(
                "test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            seen_scores = set()
            test_sentences = [[
                'graph', 'minors', 'survey', 'human', 'interface', 'system'
            ]]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists(
                    "test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
Code example #21
def load_vector_data(dataset_name, bgr=False):
    sentences = pd.read_csv("../cleaned/" + dataset_name + "_stems.csv", delimiter=",").astype(str).fillna("").values.tolist()
    targets = pd.read_csv("../cleaned/" + dataset_name + "_clean.csv", delimiter=",", dtype=types).astype(str)["a"].tolist()
    vector_model = FastText.load("../models/word_embeddings/" + dataset_name + "_fasttext", binary=True)
    # replace placeholders (" "), make one-string-sentences
    for index, sample in enumerate(sentences):
        sentences[index] = list(filter((" ").__ne__, sample))
    inputs = [" ".join(sentence) for sentence in sentences]

    if bgr:
        tokenized = [t.split() for t in inputs]
        phrases = Phrases(tokenized)
        bigram = Phraser(phrases)
        bigrammed = []
        # make bigrams for inputs
        for sentence in inputs:
            sentence = [t.split() for t in [sentence]]
            bigrammed.append(bigram[sentence[0]])
        inputs = []
        for sent in bigrammed:
            inputs.append(np.sum(vector_model.wv[sent], 0).tolist()) if sent else inputs.append(np.zeros(32))
    else:
        inputs = [vector_model.wv[sample] for sample in inputs]

    inputs = np.array(inputs)
    train_x, test_x, train_y, test_y = train_test_split(inputs, targets, test_size=0.2)
    return train_x, test_x, train_y, test_y
Code example #22
def preprocess(segments, dct=None, bigram=None):
    processed_segments = []
    for seg in segments:
        processed_seg = []
        for word in seg:
            if True in [word.is_space, word.is_stop, word.is_punct]:
                continue
            word = word.lemma_
            word = word.lower()
            processed_seg.append(word)
        processed_segments.append(processed_seg)

    if bigram is None:
        phrases = Phrases(processed_segments, min_count=3, threshold=3)
        bigram = Phraser(phrases)

    processed_segments = bigram[processed_segments]

    if dct is None:
        dct = Dictionary(processed_segments)
    else:
        dct.add_documents(processed_segments)

    return [dct.doc2bow(line)
            for line in processed_segments], dct, processed_segments, bigram
Code example #23
def word2vec_sentence(data, save_path):
    phrases = Phrases(data, min_count=1, progress_per=50000)
    bigrame = Phraser(phrases)
    sentences = bigrame[data]
    print(sentences)
    w2v_model = Word2Vec(min_count=3,
                         window=4,
                         size=300,
                         sample=1e-5,
                         alpha=0.03,
                         min_alpha=0.0007,
                         negative=20,
                         workers=multiprocessing.cpu_count() - 1)
    # init
    start = time()
    w2v_model.build_vocab(sentences, progress_per=50000)
    print('Time to build vocab: {} mins'.format(round((time() - start) / 60,
                                                      2)))
    # train
    start = time()
    w2v_model.train(sentences,
                    total_examples=w2v_model.corpus_count,
                    epochs=30,
                    report_delay=1)
    print('Time to train the model: {} mins'.format(
        round((time() - start) / 60, 2)))
    w2v_model.init_sims(replace=True)
    w2v_model.save("word2vec.model")
    w2v_model.wv.save_word2vec_format(save_path, binary=False)
Code example #24
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         common_terms=self.common_terms)

        seen_scores = set()

        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        min_count = float(bigram.min_count)
        len_vocab = float(len(bigram.vocab))
        graph = float(bigram.vocab[b"graph"])
        data = float(bigram.vocab[b"data"])
        data_and_graph = float(bigram.vocab[b"data_and_graph"])
        human = float(bigram.vocab[b"human"])
        interface = float(bigram.vocab[b"interface"])
        human_interface = float(bigram.vocab[b"human_interface"])

        assert seen_scores == set([
            # score for data and graph
            round((data_and_graph - min_count) / data / graph * len_vocab, 3),
            # score for human interface
            round(
                (human_interface - min_count) / human / interface * len_vocab,
                3),
        ])
Code example #25
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         connector_words=self.connector_words)
        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        seen_scores = set(
            round(score, 3)
            for score in bigram.find_phrases(test_sentences).values())

        min_count = float(bigram.min_count)
        len_vocab = float(len(bigram.vocab))
        graph = float(bigram.vocab["graph"])
        data = float(bigram.vocab["data"])
        data_and_graph = float(bigram.vocab["data_and_graph"])
        human = float(bigram.vocab["human"])
        interface = float(bigram.vocab["interface"])
        human_interface = float(bigram.vocab["human_interface"])

        assert seen_scores == set([
            # score for data and graph
            round((data_and_graph - min_count) / data / graph * len_vocab, 3),
            # score for human interface
            round(
                (human_interface - min_count) / human / interface * len_vocab,
                3),
        ])
Code example #26
	def preprocess(self):
		from nltk import word_tokenize
		print("Starting to preprocess...")
		for split in ['train','test']:
			unigrams = [word_tokenize(sentence[0]) for sentence in self.data[split].values]
			ps = PorterStemmer()
			for idx,review in enumerate(unigrams):
				stemmedSentence=[]
				for word in review:
					#stemmedSentence.append(ps.stem(word)) # stemming takes too long ...
					stemmedSentence.append(word)
				self.data[split].iloc[idx,0]=" ".join(stemmedSentence)

		bigrams = Phrases(unigrams, min_count=2)
		bigram_phraser = Phraser(bigrams)
		if self.representation == 'GloVe':
			# let X be a list of tokenized texts (i.e. list of lists of tokens)
			self.word_model = gensim.models.Word2Vec(bigram_phraser[unigrams], min_count=1)
			self.w2v = dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))
		elif self.representation == 'fasttext':
			self.word_model = FastText(bigram_phraser[unigrams], min_count=1)
			self.w2v=dict(zip(self.word_model.wv.index2word, self.word_model.wv.syn0))


		print("Finished preprocessing.")
Code example #27
 def fit(self, sentencesPath):
     """
     train phrases
     :param sentencesPath:the path of text file, the text file should be the format: one line one sentence
     """
     self.phrasers = []
     # path detect
     for path in self.savePhraserPaths:
         if not os.path.exists(os.path.dirname(path)):
             raise FileNotFoundError(os.path.dirname(path) + " not exist")
     for path in self.savePhraserPaths:
         if not os.path.exists(path):  # need train
             self.phrasers = None
             break
     if self.phrasers is not None and self.file_overwrite == False:
         logging.info("models are already exist, will read it")
         for path in self.savePhraserPaths:
             self.phrasers.append(Phraser.load(path))
         return True
     self.phrasers = []
     c = 2
     for path in self.savePhraserPaths:
         logging.info("getting %d-gram phrase......" % c)
         c += 1
         phraser = Phraser(
             Phrases(sentences=TxtIter(sentences=codecs.open(
                 sentencesPath, mode="r", encoding="utf-8"),
                                       ngrams=self.phrasers),
                     min_count=self.min_count,
                     threshold=self.threshold,
                     max_vocab_size=self.max_vocab_size,
                     delimiter=self.delimiter,
                     scoring=self.scoring))
         phraser.save(path)
         self.phrasers.append(phraser)
Code example #28
File: train_word2vec.py Project: teeerrytan/EECS_510
def build_phrases(sentences):
    phrases = Phrases(
        sentences,
        min_count=2,
        threshold=10,
    )
    return Phraser(phrases)
Code example #29
def learn_word_embeddings(corpus_fpath,
                          vectors_fpath,
                          cbow,
                          window,
                          iter_num,
                          size,
                          threads,
                          min_count,
                          detect_phrases=True):

    tic = time()
    sentences = GzippedCorpusStreamer(corpus_fpath)

    if detect_phrases:
        print("Extracting phrases from the corpus:", corpus_fpath)
        phrases = Phrases(sentences)
        bigram = Phraser(phrases)
        input_sentences = list(bigram[sentences])
        print("Time, sec.:", time() - tic)
    else:
        input_sentences = sentences

    print("Training word vectors:", corpus_fpath)
    model = Word2Vec(input_sentences,
                     min_count=min_count,
                     size=size,
                     window=window,
                     max_vocab_size=None,
                     workers=threads,
                     sg=(1 if cbow == 0 else 0),
                     iter=iter_num)
    model.wv.save_word2vec_format(vectors_fpath, binary=False)
    print("Vectors:", vectors_fpath)
    print("Time, sec.:", time() - tic)
Code example #30
def collocation(in_path):
    """Creates corpus considering collocations, frequent co-occuring bigrams are merged (new york -> new_york)"""
    corpus = LineSentence(in_path)
    bigram = Phraser(Phrases(corpus))
    collocation_corpus = bigram[corpus]
    for sentence in collocation_corpus:
        print(' '.join(sentence))
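
A minimal usage sketch (the toy in-memory corpus is an assumption, not data from the project above) of the merge described in the docstring: once "new york" co-occurs often enough, the trained Phraser rewrites it as the single token "new_york".

from gensim.models.phrases import Phrases, Phraser

corpus = [["i", "love", "new", "york"],
          ["new", "york", "is", "big"],
          ["she", "moved", "to", "new", "york"]]
bigram = Phraser(Phrases(corpus, min_count=2, threshold=1))
for sentence in bigram[corpus]:
    print(' '.join(sentence))  # e.g. "i love new_york"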