Example #1
    def __init__(self):

        '''
        Training parameters:
        '''

        self.w2v_dim=100
        self.num_feature=400
        self.batch_size=16
        self.num_epoch=30

        # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')

        self.index2word_set = set(self.w2v_model.index2word)

        #self.bigram=None
        #self.trigram=None

        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        print('Build model...')

        self.model = Sequential()
        self.model.add(Dropout(0.2,input_shape=(self.num_feature,)))
        self.model.add(Dense(3, input_dim=self.num_feature, init='orthogonal'))
        self.model.add(Activation('softmax'))


        self.model.compile(loss='categorical_crossentropy', optimizer='adam', class_mode="categorical")

        print('Model has been built!')
Example #2
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        seen_scores = set()

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        min_count = float(bigram.min_count)
        len_vocab = float(len(bigram.vocab))
        graph = float(bigram.vocab[b"graph"])
        data = float(bigram.vocab[b"data"])
        data_and_graph = float(bigram.vocab[b"data_and_graph"])
        human = float(bigram.vocab[b"human"])
        interface = float(bigram.vocab[b"interface"])
        human_interface = float(bigram.vocab[b"human_interface"])

        assert seen_scores == set([
            # score for data and graph
            round((data_and_graph - min_count) / data / graph * len_vocab, 3),
            # score for human interface
            round((human_interface - min_count) / human / interface * len_vocab, 3),
        ])
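Note: the assertions above spell out the default scorer used by Phrases. As a standalone sketch (parameter names follow gensim's custom-scorer interface; the function name here is illustrative):

def default_score(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    # Mikolov et al. (2013): (count(a, b) - min_count) / (count(a) * count(b)) * vocabulary_size
    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab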
Example #3
 def build_trigram_model(self, sentences, bigram):
     print("In Trigram Model")
     trigram = Phrases(bigram[sentences])
     dest = self.models + 'trigram_model'
     trigram.save(dest)
     
     return trigram
Example #4
 def build(self):
     self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold)
     # run additional merge rounds
     for i in range(2, self.bigram_iter + 1):
         self.phrases = Phrases(self.sentences, min_count=1, threshold=self.threshold*(1.0/self.decay)**(i-1))
     # prune phrases
     self.prune()
     # save model to file
     self.save()
Example #5
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        seen_bigrams = set()

        for phrase, score in bigram.export_phrases(sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == {b'response time', b'graph minors', b'human interface'}
Example #6
    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)
        seen_bigrams = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == {b'graph minors', b'human interface'}
Example #7
    def setUp(self):
        """Set up FrozenPhrases models for the tests."""
        bigram_phrases = Phrases(self.sentences,
                                 min_count=1,
                                 threshold=1,
                                 connector_words=self.connector_words)
        self.bigram = FrozenPhrases(bigram_phrases)

        bigram_default_phrases = Phrases(self.sentences,
                                         connector_words=self.connector_words)
        self.bigram_default = FrozenPhrases(bigram_default_phrases)
Example #8
def generating_bigrams(final_df):
    eligibility_criteria = final_df['features']
    bigrams_input = [each_row.split() for each_row in eligibility_criteria]
    bigram_transformer = Phrases(bigrams_input, min_count=20, threshold=500)
    bigram_transformer.save("bigrams", pickle_protocol=4)

    fd = open("bigrams.txt", 'a')
    for phrase, score in bigram_transformer.export_phrases(bigrams_input):
        fd.write(u'{0}   {1}\n'.format(phrase, score))
    fd.close()

    return bigram_transformer
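The returned transformer can then be applied back to the tokenized criteria to merge the detected phrases; a short hedged sketch (final_df is assumed to have the same 'features' column used above):

bigram_transformer = generating_bigrams(final_df)
phrased_criteria = [bigram_transformer[row.split()] for row in final_df['features']]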
Example #9
    def testCustomScorer(self):
        """ test using a custom scoring function """

        bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)

        seen_scores = []
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
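The dumb_scorer referenced here is defined elsewhere in the test module; a plausible minimal version is a constant function (any custom scorer must accept these six keyword arguments), which is why every score above equals 1:

def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
    # constant score: with threshold=.001, every co-occurring word pair is promoted to a phrase
    return 1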
Example #10
def create_ngrams(category, lang):
    """Given a category, create n-grams for the text, clean the corpus and return the sentences cleaned

    Parameters
    ----------
    category : string 
        Name of the domain e.g :   "Santé","Business",etc.
        
    lang : string 
        default = "fr" 
    
    Returns
    -------
    sentences : list of string
            sentences[i] is a sentence in the corpus cleaned
    """
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang)  #to lemmatize words
    sentences = []
    bigrams_model = Phrases(min_count=100, threshold=10.0,
                            delimiter="-")  #to create bigrams
    filename = "../data/" + category + ".txt"

    with open(filename, "r", encoding="utf8") as ins:
        for line in ins:
            lines = line.split('.')
            for l in lines:
                sentence = nltk.word_tokenize(l)
                if sentence:
                    sentences.append(sentence)
                    bigrams_model.add_vocab([sentence])  # feed each sentence into the bigram model
    bigrams = list(bigrams_model[sentences])
    #to create trigrams
    trigrams_model = Phrases(bigrams,
                             min_count=50,
                             threshold=10.0,
                             delimiter="-")
    sentences = list(trigrams_model[bigrams])

    n = len(sentences)
    for i in range(n):
        tags = tagger.tag_text(sentences[i])
        text = [
            tag.split('\t')[2] for tag in tags
            if tag.split('\t')[1] != "NUM" and tag.split('\t')[1] != "PUN"
        ]
        text = " ".join(text)
        text = clean_text_simple(text)
        sentences[i] = text
        if i % 10000 == 0:
            print(i, "sentences processed")
    sentences = [sent for sent in sentences if len(sent) != 0]
    return sentences
Example #11
    def testMultipleBigramsSingleEntry(self):
        """ a single entry should produce multiple bigrams. """
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        seen_bigrams = set()
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_bigrams.add(phrase)
        assert seen_bigrams == set([
            b'data and graph',
            b'human interface',
        ])
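Applying the same model to a token list merges the detected phrases in place; a hedged sketch of the expected output (default '_' delimiter, connector words kept inside the phrase):

merged = bigram[['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
# expected to look like: ['data_and_graph', 'survey', 'for', 'human_interface']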
Example #12
    def test_save_load_with_connector_words(self):
        """Test saving and loading a Phrases object."""
        connector_words = frozenset({'of'})
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         connector_words=connector_words)
        with temporary_file("test.pkl") as fpath:
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)

        assert bigram_loaded.connector_words == connector_words
Example #13
def show_phrases(corpus, threshold=1000, shown=1000):
    # Training the multi-word expression detector
    tokenized_sentences = tokenize_sentences(corpus)
    phrases = Phrases(tokenized_sentences, threshold=threshold)
    i = 0
    for phrase, score in phrases.export_phrases(tokenized_sentences):
        if i > shown:
            break
        else:
            print("Expression : {0}, score = {1}".format(
                phrase.decode('utf-8'), score))
        i = i + 1
Example #14
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(sentences, min_count=1, threshold=1)

        seen_bigrams = set()

        for phrase, score in bigram.export_phrases(sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == {
            b'response time', b'graph minors', b'human interface'
        }
Example #15
 def test_create_and_decode_phrases(self):
     df = pd.read_csv('text_analytics/tests/NYT.Corruption')
     phrases = Phrases(
         sentences=read_clean(df),
         min_count=100,
         threshold=0.70,
         scoring="npmi",
         max_vocab_size=100000000,
         delimiter="_",
     )
     exported = phrases.export_phrases()
     return exported
Example #16
def make_phraser(infile):
    """
    Train the phraser object and save it.
    :param infile: path to xml file with the wikipedia dump
    :return:
    """
    p = Phrases(
        tqdm((i.split() for i in file_yielder(infile)), desc="Phrase-finding"))
    p = Phraser(p)
    p.save("../models/phraser")

    return 0
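Once saved, the phraser can be reloaded and applied to new token lists; a minimal usage sketch (the path mirrors the save call above, the sample sentence is illustrative):

phraser = Phraser.load("../models/phraser")
print(phraser["new york is a big city".split()])  # e.g. ['new_york', 'is', 'a', 'big_city'] if those phrases were learned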
Example #17
def trainSOPhrase(g_DataQueue, g_FinishRead, savePath, priorPhrasePath):
    """

    :param g_DataQueue: global queue holding the data read from the database
    :param g_FinishRead: flag indicating whether reading from the database has finished
    :param savePath: where to save the trained phrase detector
    :param priorPhrasePath: where the previous phrase detector was saved
    :return:
    """
    count = 0
    phrase = Phrases(None, min_count=10, threshold=15)
    if (priorPhrasePath is None):
        priorPhraser = None
    else:
        priorPhraser = Phraser(Phrases.load(priorPhrasePath))
    while (g_FinishRead.value == 0 or (not g_DataQueue.empty())):
        data = g_DataQueue.get()
        count += len(data)
        print("have processed:", count)
        words = []
        reSub0 = re.compile(
            "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]"
        )  # URL
        reSub1 = re.compile(
            "[()\"{},:/-]|[^a-z]'|'[^a-z;?.!]|'$")  # replace with " "
        reSub2 = re.compile(
            "'[.?;!]")  # replace with "."; mainly handles possessives and the various edge cases around single quotes
        reSplit1 = re.compile(r"\.[^a-z0-9]|[?!;]")
        # extract words
        for t in data:
            if (t[0] is not None):
                st = re.sub(reSub0, " ", t[0].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
            if (t[1] is not None):
                st = re.sub(reSub0, " ", t[1].lower())
                st = re.sub(reSub1, ".", st)
                st = re.sub(reSub2, ".", st)
                for sentence in re.split(reSplit1, st):
                    sen_word = sentence.split()
                    if (len(sen_word) > 6):
                        words.append(sen_word)
        del data
        gc.collect()
        # train the phrase model
        if (priorPhraser is None):  # first training pass
            phrase.add_vocab(words)
        else:  # already trained once; look for longer phrases
            phrase.add_vocab(priorPhraser[words])
        del words
        # print(len(phrase.vocab))
        gc.collect()
    phrase.save(savePath)
Example #18
    def setUp(self):
        """Set up Phraser models for the tests."""
        bigram_phrases = Phrases(sentences, min_count=1, threshold=1)
        self.bigram = Phraser(bigram_phrases)

        bigram_default_phrases = Phrases(sentences)
        self.bigram_default = Phraser(bigram_default_phrases)

        bigram_utf8_phrases = Phrases(sentences, min_count=1, threshold=1)
        self.bigram_utf8 = Phraser(bigram_utf8_phrases)

        bigram_unicode_phrases = Phrases(unicode_sentences, min_count=1, threshold=1)
        self.bigram_unicode = Phraser(bigram_unicode_phrases)
Example #19
    def testExportPhrases(self):
        """Test Phrases bigram export phrases."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         delimiter=' ')
        seen_bigrams = set(bigram.find_phrases(self.sentences).keys())

        assert seen_bigrams == {
            'response time',
            'graph minors',
            'human interface',
        }
Example #20
    def testCustomScorer(self):
        """ test using a custom scoring function """

        bigram = Phrases(self.sentences, min_count=1, threshold=.001,
                         scoring=dumb_scorer, common_terms=self.common_terms)

        seen_scores = []
        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.append(score)

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
Example #21
    def __init__(self, dataset=CLASSIC3_JSON_DS):
        # loading the corpus
        corpus = Sentences(dataset)
        # Using a phrase model to refine the corpus
        bigram = Phraser(Phrases(corpus))
        trigram = Phraser(Phrases(bigram[corpus]))
        trig_corpus = trigram[bigram[corpus]]
        self.vocab = list(set([term for doc in trig_corpus for term in doc]))

        # creating standard Dictionary representation of corpus and creating standard doc-term matrix
        dct = Dictionary(trig_corpus)
        bow_corpus = [dct.doc2bow(line) for line in trig_corpus]
        self.doc_term_mat = corpus2csc(bow_corpus).T
Example #22
 def __init__(self):
     reader = Reader()
     print('loading data')
     self.X_train = reader.getData(TRAIN)
     print('train data has been loaded!')
     self.X_valid = reader.getData(DEV)
     print('valid data has been loaded!')
     self.X_test = reader.getData(TEST)
     print('test data has been loaded!')
     self.c_title = []
     self.c_body = []
     self.bigram = Phrases.load('./data/bigram.dat')
     self.trigram = Phrases.load('./data/trigram.dat')
Example #23
    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(self.sentences, min_count=1, threshold=.5, scoring='npmi')

        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == {
            .882,  # score for graph minors
            .714  # score for human interface
        }
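For reference, 'npmi' rescales pointwise mutual information into [-1, 1]; a sketch of the standard definition the scorer follows (probabilities estimated from corpus counts):

from math import log

def npmi_score(worda_count, wordb_count, bigram_count, corpus_word_count):
    # NPMI = ln(p(a, b) / (p(a) * p(b))) / -ln(p(a, b))
    pa = worda_count / corpus_word_count
    pb = wordb_count / corpus_word_count
    pab = bigram_count / corpus_word_count
    return log(pab / (pa * pb)) / -log(pab)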
Example #24
 def __init__(self):
     reader = Reader()
     print('loading data')
     self.X_train=reader.getData(TRAIN)
     print('train data has been loaded!')
     self.X_valid=reader.getData(DEV)
     print('valid data has been loaded!')
     self.X_test=reader.getData(TEST)
     print('test data has been loaded!')
     self.c_title=[]
     self.c_body=[]
     self.bigram=Phrases.load('./data/bigram.dat')
     self.trigram=Phrases.load('./data/trigram.dat')
Example #25
class GramFacade:
    def __init__(self, model_dir, min_count_bigrams=8, min_count_trigrams=7):
        self.model_dir = model_dir
        self.min_count_bigrams = min_count_bigrams
        self.min_count_trigrams = min_count_trigrams

    def load_models(self):
        self.bigrams_phraser = Phraser.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser = Phraser.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASER_FILENAME)

    def load_phrases(self):
        self.bigrams_phrases = Phrases.load(self.model_dir + '/' +
                                            BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases = Phrases.load(self.model_dir + '/' +
                                             TRIGRAMS_PHRASES_FILENAME)

    def export_bigrams(self, docs):
        return [self.bigrams_phraser[doc] for doc in docs]

    def export_trigrams(self, bigrams):
        return [self.trigrams_phraser[bigram] for bigram in bigrams]

    def phrase(self, doc):
        bigrams = self.bigrams_phraser[doc]
        trigrams = self.trigrams_phraser[bigrams]
        return trigrams

    def create_model(self, doc_list):
        self.bigrams_phrases = Phrases(doc_list,
                                       min_count=self.min_count_bigrams)
        self.bigrams_phraser = Phraser(self.bigrams_phrases)
        self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                        min_count=self.min_count_trigrams)
        self.trigrams_phraser = Phraser(self.trigrams_phrases)
        self.bigrams_phraser.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASER_FILENAME)
        self.trigrams_phraser.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASER_FILENAME)
        self.bigrams_phrases.save(self.model_dir + '/' +
                                  BIGRAMS_PHRASES_FILENAME)
        self.trigrams_phrases.save(self.model_dir + '/' +
                                   TRIGRAMS_PHRASES_FILENAME)

    def words_not_in_vocab(self, tok_doc, threshold):
        word_not_in_doc = set([
            x for x in tok_doc
            if self.trigrams_phrases.vocab[str.encode(x)] < threshold
        ])
        return word_not_in_doc
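A minimal usage sketch for the class above (model_dir and the documents are illustrative; the directory must already exist because create_model saves into it):

facade = GramFacade(model_dir='models', min_count_bigrams=1, min_count_trigrams=1)
facade.create_model([['new', 'york', 'city'], ['new', 'york', 'times'], ['new', 'york', 'city', 'hall']])
print(facade.phrase(['new', 'york', 'city']))  # tokens with any learned bigrams/trigrams merged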
Example #26
    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')

        seen_scores = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            .882,  # score for graph minors
            .714  # score for human interface
        ])
Example #27
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        seen_scores = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == {
            5.167,  # score for graph minors
            3.444  # score for human interface
        }
Example #28
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = []
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #29
def get_ngram(n, sentence):
    """
    Function to get n grams to examine relationship between words in the news content
    """
    if n == 1:
        return sentence
    
    # create phrases model to find words and ngrams that occur at least once
    ngram = Phraser(Phrases(sentence, min_count=1, threshold=1))

    # for bigrams and higher grams
    for i in range(3,n):
        ngram = Phraser(Phrases(ngram[sentence], min_count=1, threshold=1))
    return ngram[sentence]
Example #30
    def testScoringDefault(self):
        """ test the default scoring, from the mikolov word2vec paper """
        bigram = Phrases(self.sentences, min_count=1, threshold=1)

        seen_scores = set()

        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == {
            5.167,  # score for graph minors
            3.444  # score for human interface
        }
Example #31
def build_model():
    """build doc2vec model from cases"""
    
    """get urls for cases"""
    urls = make_links()
    shuffle(urls)

    """async downloads"""
    loop = asyncio.get_event_loop()
    future = asyncio.ensure_future(coordinate_downloads(urls))
    cases = [c for c in loop.run_until_complete(future) if len(c[1]) > 25]
    print("retrieved {} usable cases".format(len(cases)))
    
    lls = []
    for label, case in cases:
        lls.append(LabeledSentence(words=case.split(), tags=[label]))  # tags must be a list
    
    model = Doc2Vec(size=300, window=10, min_count=5, workers=6, alpha=0.025, min_alpha=0.025)
    model.build_vocab(lls)
    
    for epoch in range(10):
        model.train(lls)

    print("trained")
    for dv in model.docvecs:
        print(dv)
    
    input()
    print(model.most_similar("court"))
    
    """make sentences"""
    print("preprocessing text...")
    sentences = []
    for c in cases:
        s = sentence_maker.split_into_sentences(c[1], lower=True)
        sentences.extend(s)
    
    print("found {} sentences".format(len(sentences)))
    
    """phrase pre-processing"""
    print("building phrases...")
    phrases = Phrases(sentences, min_count=5, threshold=100)
    bigramphraser = Phraser(phrases)
    """produce a representation of the text including 2 and 3 word phrases"""
    trg_phrases = Phrases(bigramphraser[sentences], min_count=5, threshold=100)
    trigram_phraser = Phraser(trg_phrases)
    phrased_sentences = list(trigram_phraser[list(bigramphraser[sentences])])
    print("building Word2Vec model...")
    return Word2Vec(phrased_sentences, min_count=10, workers=6)
Example #32
    def testExportPhrases(self):
        """Test Phrases bigram export phrases."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         connector_words=self.connector_words,
                         delimiter=' ')
        seen_bigrams = set(bigram.find_phrases(self.sentences).keys())

        assert seen_bigrams == set([
            'human interface',
            'graph of trees',
            'data and graph',
            'lack of interest',
        ])
Example #33
    def testCustomScorer(self):
        """Test using a custom scoring function."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=.001,
                         scoring=dumb_scorer)
        test_sentences = [[
            'graph', 'minors', 'survey', 'human', 'interface', 'system'
        ]]
        seen_scores = list(bigram.find_phrases(test_sentences).values())

        assert all(score == 1 for score in seen_scores)
        assert len(
            seen_scores
        ) == 3  # 'graph minors' and 'survey human' and 'interface system'
Example #34
    def testScoringDefault(self):
        """Test the default scoring, from the mikolov word2vec paper."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         delimiter=' ')
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_scores = set(
            round(score, 3)
            for score in bigram.find_phrases(test_sentences).values())

        assert seen_scores == {
            5.167,  # score for graph minors
            3.444  # score for human interface
        }
Example #35
    def testScoringNpmi(self):
        """Test normalized pointwise mutual information scoring."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=.5,
                         scoring='npmi')
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
        seen_scores = set(
            round(score, 3)
            for score in bigram.find_phrases(test_sentences).values())

        assert seen_scores == {
            .882,  # score for graph minors
            .714  # score for human interface
        }
Example #36
    def testScoringNpmi(self):
        """ test normalized pointwise mutual information scoring """
        bigram = Phrases(self.sentences, min_count=1, threshold=.5,
                         scoring='npmi', common_terms=self.common_terms)

        seen_scores = set()

        test_sentences = [['data', 'and', 'graph', 'survey', 'for', 'human', 'interface']]
        for phrase, score in bigram.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            .74,  # score for data and graph
            .894  # score for human interface
        ])
Example #37
    def testExportPhrases(self):
        """Test Phrases bigram export_phrases functionality."""
        bigram = Phrases(self.sentences, min_count=1, threshold=1, common_terms=self.common_terms)

        seen_bigrams = set()

        for phrase, score in bigram.export_phrases(self.sentences):
            seen_bigrams.add(phrase)

        assert seen_bigrams == set([
            b'human interface',
            b'graph of trees',
            b'data and graph',
            b'lack of interest',
        ])
Example #38
 def create_model(self, doc_list):
     self.bigrams_phrases = Phrases(doc_list,
                                    min_count=self.min_count_bigrams)
     self.bigrams_phraser = Phraser(self.bigrams_phrases)
     self.trigrams_phrases = Phrases(self.bigrams_phraser[doc_list],
                                     min_count=self.min_count_trigrams)
     self.trigrams_phraser = Phraser(self.trigrams_phrases)
     self.bigrams_phraser.save(self.model_dir + '/' +
                               BIGRAMS_PHRASER_FILENAME)
     self.trigrams_phraser.save(self.model_dir + '/' +
                                TRIGRAMS_PHRASER_FILENAME)
     self.bigrams_phrases.save(self.model_dir + '/' +
                               BIGRAMS_PHRASES_FILENAME)
     self.trigrams_phrases.save(self.model_dir + '/' +
                                TRIGRAMS_PHRASES_FILENAME)
Example #39
 def setUp(self):
     self.bigram = Phrases(self.sentences,
                           min_count=1,
                           threshold=1,
                           common_terms=self.common_terms)
     self.bigram_default = Phrases(self.sentences,
                                   common_terms=self.common_terms)
     self.bigram_utf8 = Phrases(self.sentences,
                                min_count=1,
                                threshold=1,
                                common_terms=self.common_terms)
     self.bigram_unicode = Phrases(self.unicode_sentences,
                                   min_count=1,
                                   threshold=1,
                                   common_terms=self.common_terms)
Example #40
def build_ngrams(df, min_count=5, threshold=2):
    """
    This function builds bigram and ngrams.
    Please don't modify, it may explode.
    """

    print("Building Bigrams")
    phrases = Phrases(tqdm(df.clean), min_count=min_count, threshold=threshold)
    bigrams = Phraser(phrases)  # Phrases -> Phraser: lighter/faster object, but can't be updated
    df['bigrams'] = df.clean.progress_apply(lambda r: bigrams[r])

    print("Building Ngrams")
    phrases_2 = Phrases(tqdm(df.bigrams), min_count=min_count, threshold=threshold)
    ngrams = Phraser(phrases_2)
    df['ngrams'] = df.bigrams.progress_apply(lambda r: ngrams[r])  # apply to the bigram-merged tokens the model was trained on
Example #41
def build_vocab():
    start = time.time()
    test_path = os.path.join(config.DATA_PATH, 'test.csv')
    train_path = os.path.join(config.DATA_PATH, 'train.csv')
    normalized_text_path = os.path.join(config.PROCESSED_PATH, 'normalized_comments.txt')
    bigram_path = os.path.join(config.PROCESSED_PATH, 'bigram')
    bigram_comments_path = os.path.join(config.PROCESSED_PATH, 'bigram_commnets.txt')

    if config.PROCESSED_PATH not in os.listdir(config.DATA_PATH):
        try:
            os.mkdir(config.PROCESSED_PATH)
        except OSError:
            pass

    vocab = {}

    train_df = read_file(train_path)
    test_df = read_file(test_path)
    print('tokenizing vocab file')
    texts =  np.concatenate([train_df.comment_text.fillna('N/A').values,
                             test_df.comment_text.fillna('N/A').values])


    with open(normalized_text_path, 'w') as f:
        processed_text = parallelize_dataframe(texts, tokenizer)
        for line in processed_text:
            f.write(line + '\n')
    gc.collect()
    lines = LineSentence(normalized_text_path)
    bigram = Phrases(lines)
    bigram.save(bigram_path)
    phraser = Phraser(bigram)

    with open(bigram_comments_path, 'w', encoding='utf_8') as f:
        for comment in lines:
            comm = u' '.join(phraser[comment])
            f.write(comm + '\n')

    comments = LineSentence(bigram_comments_path)
    bigram_dict = Dictionary(comments)
    bigram_dict.filter_extremes(no_below=config.THRESHOLD)
    bigram_dict.save_as_text(config.VOCAB_PATH)
    bigram_dict.add_documents([['<pad>']])

    with open(os.path.join(config.ROOT, 'src', 'config.py'), 'a') as f:
        f.write('VOCAB_SIZE = {}'.format(len(bigram_dict)))

    print('time passed: {} minutes'.format((time.time() - start) / 60))
Example #42
    def testCompatibilty(self):
        phrases = Phrases.load(datapath("phrases-3.6.0.model"))
        phraser = FrozenPhrases.load(datapath("phraser-3.6.0.model"))
        test_sentences = ['trees', 'graph', 'minors']

        self.assertEqual(phrases[test_sentences], ['trees', 'graph_minors'])
        self.assertEqual(phraser[test_sentences], ['trees', 'graph_minors'])
Example #43
    def testCustomScorer(self):
        """Test using a custom scoring function."""
        bigram = Phrases(
            self.sentences,
            min_count=1,
            threshold=.001,
            scoring=dumb_scorer,
            connector_words=self.connector_words,
        )
        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        seen_scores = list(bigram.find_phrases(test_sentences).values())

        assert all(seen_scores)  # all scores 1
        assert len(seen_scores) == 2  # 'data and graph' 'survey for human'
Example #44
    def testMultipleBigramsSingleEntry(self):
        """Test a single entry produces multiple bigrams."""
        bigram = Phrases(self.sentences,
                         min_count=1,
                         threshold=1,
                         connector_words=self.connector_words,
                         delimiter=' ')
        test_sentences = [[
            'data', 'and', 'graph', 'survey', 'for', 'human', 'interface'
        ]]
        seen_bigrams = set(bigram.find_phrases(test_sentences).keys())

        assert seen_bigrams == set([
            'data and graph',
            'human interface',
        ])
Example #45
def learn_word_embeddings(corpus_fpath,
                          vectors_fpath,
                          cbow,
                          window,
                          iter_num,
                          size,
                          threads,
                          min_count,
                          detect_phrases=True):

    tic = time()
    sentences = GzippedCorpusStreamer(corpus_fpath)

    if detect_phrases:
        print("Extracting phrases from the corpus:", corpus_fpath)
        phrases = Phrases(sentences)
        bigram = Phraser(phrases)
        input_sentences = list(bigram[sentences])
        print("Time, sec.:", time() - tic)
    else:
        input_sentences = sentences

    print("Training word vectors:", corpus_fpath)
    model = Word2Vec(input_sentences,
                     min_count=min_count,
                     size=size,
                     window=window,
                     max_vocab_size=None,
                     workers=threads,
                     sg=(1 if cbow == 0 else 0),
                     iter=iter_num)
    model.wv.save_word2vec_format(vectors_fpath, binary=False)
    print("Vectors:", vectors_fpath)
    print("Time, sec.:", time() - tic)
Example #46
    def __init__(self,train_data,dev_data,test_data):
        self.train_data=train_data
        self.dev_data=dev_data
        self.test_data=test_data

        # Hyper-parameters
        self.learningRate=0.01
        self.trainSize=2000
        self.testSize=1000
        self.totalSize = self.trainSize + self.testSize
        self.maxEpochs=10000
        self.num_processed=-1

        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')
Example #47
 def testSaveLoadNoCommonTerms(self):
     """ Ensure backwards compatibility with old versions of Phrases, before common_terms"""
     bigram_loaded = Phrases.load(datapath("phrases-no-common-terms.pkl"))
     self.assertEqual(bigram_loaded.common_terms, frozenset())
     # can make a phraser, cf #1751
     phraser = Phraser(bigram_loaded)  # does not raise
     phraser[["human", "interface", "survey"]]  # does not raise
Example #48
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        with temporary_file("test.pkl") as fpath:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save(fpath)
            bigram_loaded = Phrases.load(fpath)
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])
Example #49
    def __init__(self, sentences, filename=None):

        # model parameters
        self.sentences = sentences
        self.dataset = "CASEREPORT"
        self.tokenizer = "RAW"
        self.prune_stopwords = stopwords("pubmed")
        self.phrases = None
        self.threshold = 250
        self.decay = 2
        self.bigram_iter = 3

        # data file path
        models_folder = os.path.join(*[os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'data', 'models'])
        if filename is None:
            filename = "PHRASE_%s_%s_%s_%s" % (self.threshold, self.decay, self.dataset, self.tokenizer, )
        self.filepath = os.path.join(models_folder, filename)

        # does an identical model already exist?
        model_exists = os.path.isfile(self.filepath)
        if model_exists:
            logging.info("LOADING - loading phrase data..")
            self.phrases = Phrases.load(self.filepath)
        else:
            logging.info("CREATE - creating phrase data..")
            self.build()
Example #50
 def testExportPhrases(self):
     """Test Phrases bigram export_phrases functionality."""
     bigram = Phrases(sentences, min_count=1, threshold=1)
     
     # with this setting we should get response_time and graph_minors
     bigram1_seen = False
     bigram2_seen = False
     
     for phrase, score in bigram.export_phrases(sentences):
         if not bigram1_seen and b'response time' == phrase:
             bigram1_seen = True
         elif not bigram2_seen and b'graph minors' == phrase:
             bigram2_seen = True
         if bigram1_seen and bigram2_seen:
             break
     
     self.assertTrue(bigram1_seen)
     self.assertTrue(bigram2_seen)
Example #51
    def testSaveLoadCustomScorer(self):
        """ saving and loading a Phrases object with a custom scorer """

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
            seen_scores = []
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.append(score)

            assert all(seen_scores)  # all scores 1
            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'

        finally:
            if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
Example #52
    def testSaveLoad(self):
        """ Saving and loading a Phrases object."""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoad_temp_save.pkl")
Example #53
    def testCompatibilty(self):
        phr = Phraser.load(datapath("phraser-3.6.0.model"))
        model = Phrases.load(datapath("phrases-3.6.0.model"))

        test_sentences = ['trees', 'graph', 'minors']
        expected_res = ['trees', 'graph_minors']

        phr_out = phr[test_sentences]
        model_out = model[test_sentences]

        self.assertEqual(phr_out, expected_res)
        self.assertEqual(model_out, expected_res)
Example #54
    def testSaveLoadStringScoring(self):
        """ Saving and loading a Phrases object with a string scoring parameter.
        This should ensure backwards compatibility with the previous version of Phrases"""
        bigram_loaded = Phrases.load(datapath("phrases-scoring-str.pkl"))
        seen_scores = set()
        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
        for phrase, score in bigram_loaded.export_phrases(test_sentences):
            seen_scores.add(round(score, 3))

        assert seen_scores == set([
            5.167,  # score for graph minors
            3.444  # score for human interface
        ])
Example #55
    def testSaveLoadNoScoring(self):
        """ Saving and loading a Phrases object with no scoring parameter.
        This should ensure backwards compatibility with old versions of Phrases"""

        try:
            bigram = Phrases(self.sentences, min_count=1, threshold=1)
            del(bigram.scoring)
            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
            seen_scores = set()
            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
            for phrase, score in bigram_loaded.export_phrases(test_sentences):
                seen_scores.add(round(score, 3))

            assert seen_scores == set([
                5.167,  # score for graph minors
                3.444  # score for human interface
            ])

        finally:
            if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
Example #56
    def __init__(self):

        '''
        Training parameters:
        '''

        self.w2v_dim=100
        self.num_feature=400
        self.batch_size=16
        self.num_epoch=1

        #self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('./data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        print('Build model...')

        param_dist = {
            "n_estimators":sp_randint(20,250),
            "criterion": ["gini", "entropy"],
            "max_depth": sp_randint(10, 300),
            "min_samples_split": sp_randint(1, 30),
            "min_samples_leaf": sp_randint(1, 30),
            "max_features": sp_randint(1, 200),
            "bootstrap": [True, False],
            'random_state':sp_randint(1, 1000000),
        }
        # build a classifier
        clf = RandomForestClassifier(n_jobs=8)
        # run randomized search
        self.model=RandomizedSearchCV(clf, param_distributions=param_dist,
                                   n_iter=10,cv=9,n_jobs=8)

        print('Model has been built!')
Example #57
    def update(self,new_corpus,count,wrkers,sze,wndow):
        sentences = Corpus_Sentence_Extractor(new_corpus)

        bigram = Phrases.load(self.models + 'bigram_model')
        trigram = Phrases.load(self.models + 'trigram_model')

        bigram.add_vocab(sentences)
        trigram.add_vocab(bigram[sentences])

        self.train(sentences,trigram,self.word2vec,count,wrkers,sze,wndow)
Example #58
 def build_bigram_model(self, sentences, count):
     print("In Bigram Model")
     bigram = Phrases(sentences,min_count=count)
     dest = self.models + 'bigram_model'
     bigram.save(dest)
     return bigram
Example #59
    def __init__(self):
        self.session = tf.Session()
        '''
        Training parameters:
        '''

        self.w2v_dim=30
        self.num_feature=400
        self.batch_size=32
        self.num_epoch=10000
        self.num_hidden_1=50
        self.num_hidden_2=3

        self.number_of_layers=1

        #self.max_len = 50
        self.max_len_title=6
        self.max_len_body=38

        #self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        #self.bigram = None
        #self.trigram =None
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        # Model
        self.input_0=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_1=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_0_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])
        self.input_1_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])

        self.dropout_input = tf.placeholder(tf.float32)
        self.dropout_hidden = tf.placeholder(tf.float32)

        self.target = tf.placeholder(tf.float32, [self.batch_size, 3])

        input_0=array_ops.unpack(self.input_0)
        input_1=array_ops.unpack(self.input_1)
        input_0_=array_ops.unpack(self.input_0_)
        input_1_=array_ops.unpack(self.input_1_)


        def _rnn(inputs, reverse=False):
            with tf.variable_scope("GRU_RNN") as scope:
                cell=rnn_cell.GRUCell(self.w2v_dim)
                cell = rnn_cell.DropoutWrapper(cell, output_keep_prob=self.dropout_input)
                stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers)
                state = stacked_cell.zero_state(self.batch_size, tf.float32)
                if reverse:
                    inputs=reversed(inputs)
                for time, input_ in enumerate(inputs):
                    if time > 0: scope.reuse_variables()
                    output, state = stacked_cell(input_, state)
                return state

        with tf.variable_scope('Feature_Generator') as scope:
            state_0 = _rnn(input_0)
            scope.reuse_variables()
            state_1 = _rnn(input_1)
            state_0_ = _rnn(input_0_)
            state_1_ = _rnn(input_1_)
        '''
        with tf.variable_scope('Feature_Generator_body') as scope:
            state_0_ = _rnn(input_0_)
            scope.reuse_variables()
            state_1_ = _rnn(input_1_)
        '''
        '''
        with tf.variable_scope('Feature_Generator_body_reverse') as scope:
            state_0_reverse = _rnn(input_0_, reverse=True)
            scope.reuse_variables()
            state_1_reverse = _rnn(input_1_, reverse=True)
        '''

        '''
        with tf.variable_scope('Feature_Generator_title') as scope:
            state_0 = _rnn(input_0)
            scope.reuse_variables()
            state_1 = _rnn(input_1)

        with tf.variable_scope('Feature_Generator_body') as scope:
            state_0_ = _rnn(input_0_)
            scope.reuse_variables()
            state_1_ = _rnn(input_1_)


        # state=tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.mul(state_0,state_1),
        #                   tf.abs(tf.sub(state_0_,state_1_)),tf.mul(state_0_,state_1_)])


        # state=tf.concat(1,[state_0,state_1, state_0_, state_1_])
        # state = tf.ones([32,10])

        # state=tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.mul(state_0,state_1)])
        '''

         # 2-layer NN
        with tf.variable_scope("NN", initializer=tf.random_uniform_initializer(-1.0,1.0)):
            self.W_mul = tf.get_variable("W_mul", [state_0_.get_shape()[1]*2,self.num_hidden_1])
            self.W_sub = tf.get_variable("W_sub", [state_0_.get_shape()[1]*2,self.num_hidden_1])
            self.b = tf.get_variable("b", [self.num_hidden_1])

            self.W_softmax=tf.get_variable("W_softmax", [self.num_hidden_1,self.num_hidden_2])
            self.b_softmax = tf.get_variable("b_softmax", [self.num_hidden_2])

        # h_mul = tf.mul(state_0,state_1)
        # h_sub = tf.abs(tf.sub(state_0,state_1))
        h_mul = tf.concat(1,[tf.mul(state_0,state_1),tf.mul(state_0_,state_1_)])
        h_sub = tf.concat(1,[tf.abs(tf.sub(state_0,state_1)),tf.abs(tf.sub(state_0_,state_1_))])

        y_1 = tf.nn.sigmoid(tf.matmul(h_mul, self.W_mul)+tf.matmul(h_sub, self.W_sub)+self.b)
        y_2 = tf.matmul(y_1, self.W_softmax)+self.b_softmax

        # regularizers = (tf.nn.l2_loss(self.W_1) + tf.nn.l2_loss(self.b_1)+tf.nn.l2_loss(self.W_2) + tf.nn.l2_loss(self.b_2))

        '''
        state_0_title_normalized = tf.nn.l2_normalize(state_0, 1)
        state_1_title_normalized = tf.nn.l2_normalize(state_1, 1)
        state_0_body_normalized = tf.nn.l2_normalize(state_0_, 1)
        state_1_body_normalized = tf.nn.l2_normalize(state_1_, 1)

        dist_title_ = tf.mul(state_0_title_normalized, state_1_title_normalized)
        dist_body_ = tf.mul(state_0_body_normalized, state_1_body_normalized)s

        dist_title=tf.reduce_sum(dist_title_, 1, keep_dims=True)
        dist_body=tf.reduce_sum(dist_body_, 1, keep_dims=True)

        feature = tf.concat(1, [dist_title,dist_body])

        with tf.variable_scope("log_reg", initializer=tf.random_uniform_initializer()):
             self.W = tf.get_variable("W", [feature.get_shape()[1],3])
             self.b = tf.get_variable("b", [3])

        y_2 = tf.matmul(feature, self.W)+self.b
        '''
        '''
        with tf.variable_scope("log_reg", initializer=tf.random_uniform_initializer()):
            self.W_1 = tf.get_variable("W_1", [state.get_shape()[1],self.num_hidden_1])
            self.b_1 = tf.get_variable("b_1", [self.num_hidden_1])
            self.W_2 = tf.get_variable("W_2", [self.num_hidden_1,self.num_hidden_2])
            self.b_2 = tf.get_variable("b_2", [self.num_hidden_2])
        '''
        '''
        # Create model
        def multilayer_perceptron(_X, _weights, _biases):
            layer_1 = tf.nn.sigmoid(tf.add(tf.matmul(_X, _weights['h1']), _biases['b1'])) #Hidden layer with RELU activation
            layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, _weights['h2']), _biases['b2'])) #Hidden layer with RELU activation
            return tf.matmul(layer_2, _weights['out']) + _biases['out']

        # Store layers weight & bias
        weights = {
            'h1': tf.Variable(tf.random_normal([10, 10])),
            'h2': tf.Variable(tf.random_normal([10, 5])),
            'out': tf.Variable(tf.random_normal([5, 3]))
        }
        biases = {
            'b1': tf.Variable(tf.random_normal([10])),
            'b2': tf.Variable(tf.random_normal([5])),
            'out': tf.Variable(tf.random_normal([3]))
        }
        # Construct model
        self.y_pred = multilayer_perceptron(state, weights, biases)

        # Define loss and optimizer
        self.cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(self.y_pred, self.target)) # Softmax loss
        self.optimizer = tf.train.AdamOptimizer(learning_rate=0.1).minimize(self.cross_entropy) # Adam Optimizer
        '''

        # self.W = tf.Variable(tf.zeros([10, 3]))
        # self.b = tf.Variable(tf.zeros([3]))
        # y_1 = tf.sigmoid(tf.matmul(state, self.W_1)+self.b_1)
        # y_2 = tf.sigmoid(tf.matmul(y_1, self.W_2)+self.b_2)
        # self.y_pred = tf.nn.softmax(tf.nn.sigmoid(tf.add(tf.matmul(state, self.W),self.b)))
        self.y_pred=tf.nn.softmax(y_2)
        # self.y_pred = tf.nn.softmax(tf.nn.sigmoid(tf.matmul(state, self.W_1)+self.b_1))
        self.cross_entropy = -tf.reduce_mean(self.target*tf.log(self.y_pred))
        # self.optimizer = tf.train.AdamOptimizer().minimize(self.cross_entropy)
        # self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.cross_entropy)
        # self.optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(self.cross_entropy)
        # self.gradstep = self.optimizer.compute_gradients(self.cross_entropy)


        # Optimizer.

        global_step = tf.Variable(0)
        # optimizer = tf.train.GradientDescentOptimizer(0.1)
        optimizer = tf.train.AdagradOptimizer(0.1)
        gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
        gradients, _ = tf.clip_by_global_norm(gradients, 10)
        self.optimizer= optimizer.apply_gradients(zip(gradients, v), global_step=global_step)



        print('Model has been built!')
Example #60
    def __init__(self):
        self.session = tf.Session()
        '''
        Training parameters:
        '''

        self.w2v_dim=10
        self.num_feature=400
        self.batch_size=32
        self.num_epoch=10000
        self.num_hidden_1=100
        self.num_hidden_2=50
        self.num_hidden_3=3

        self.number_of_layers=1

        #self.max_len = 50
        self.max_len_title=13
        self.max_len_body=50

        # self.w2v_model=Word2Vec.load_word2vec_format('./data/word2vec/GoogleNews-vectors-negative300.bin', binary=True)
        self.w2v_model=Word2Vec.load('data/word2vec/w2v.model')
        self.index2word_set = set(self.w2v_model.index2word)
        #self.bigram = None
        #self.trigram =None
        self.bigram=Phrases.load('./data/bigram.dat')
        self.trigram=Phrases.load('./data/trigram.dat')

        # Model
        self.input_0=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_1=tf.placeholder(tf.float32,[self.max_len_title,self.batch_size,self.w2v_dim])
        self.input_0_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])
        self.input_1_=tf.placeholder(tf.float32,[self.max_len_body,self.batch_size,self.w2v_dim])

        self.dropout_input = tf.placeholder(tf.float32)
        self.dropout_hidden_1 = tf.placeholder(tf.float32)

        self.target = tf.placeholder(tf.float32, [self.batch_size, 3])

        input_0=array_ops.unpack(self.input_0)
        input_1=array_ops.unpack(self.input_1)
        input_0_=array_ops.unpack(self.input_0_)
        input_1_=array_ops.unpack(self.input_1_)


        def _encoder(inputs, reverse=False):
            with tf.variable_scope("GRU_RNN") as scope:
                cell=rnn_cell.BasicLSTMCell(self.w2v_dim)
                stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers)
                # state = tf.zeros([1, cell.state_size])
                state = stacked_cell.zero_state(self.batch_size, tf.float32)
                if reverse:
                    inputs=reversed(inputs)
                for time, input_ in enumerate(inputs):
                    if time > 0: scope.reuse_variables()
                    output, state = stacked_cell(input_, state)
                return state
        def _decoder(state, inputs):
            with tf.variable_scope("GRU_RNN") as scope:
                cell=rnn_cell.BasicLSTMCell(self.w2v_dim)
                stacked_cell = rnn_cell.MultiRNNCell([cell] * self.number_of_layers*2)

                for time, input_ in enumerate(inputs):
                    if time > 0: scope.reuse_variables()
                    output, state = stacked_cell(input_, state)
                return output

        with tf.variable_scope('Encoder') as scope:
            state = _encoder(input_0_)
            scope.reuse_variables()
            state_reversed = _encoder(input_0_, reverse=True)


        with tf.variable_scope('Decoder') as scope:
            state = _decoder(tf.concat(1,[state,state_reversed]), input_1_)

        with tf.variable_scope("to_score", initializer=tf.random_uniform_initializer()):
             self.W = tf.get_variable("W", [state.get_shape()[1],3])
             self.b = tf.get_variable("b", [3])

        score = tf.matmul(state, self.W)+self.b

        # score_1 = tf.sigmoid(tf.matmul(out_1, self.W)+self.b)
        # state=tf.concat(1,[score_0,score_1])
        '''
        with tf.variable_scope("to_final", initializer=tf.random_uniform_initializer()):
             self.W = tf.get_variable("W", [state.get_shape()[1],3])
             self.b = tf.get_variable("b", [3])

        final = tf.matmul(state, self.W)+self.b
        '''

        self.y_pred=tf.nn.softmax(score)
        self.cross_entropy = -tf.reduce_mean(self.target*tf.log(self.y_pred))


        # Optimizer.

        global_step = tf.Variable(0)
        optimizer = tf.train.GradientDescentOptimizer(0.1)
        # optimizer = tf.train.AdamOptimizer(0.1)
        gradients, v = zip(*optimizer.compute_gradients(self.cross_entropy))
        gradients, _ = tf.clip_by_global_norm(gradients, 20)
        self.optimizer= optimizer.apply_gradients(zip(gradients, v), global_step=global_step)



        print('Model has been built!')