def main():
    # -------------------------------------------------------------------------------
    # Parameters

    # the script should still work if we swap the TEXTS variable with any
    # other iterable of strings (where one element represents a document
    # and the whole iterable is the corpus)
    newsgroups_train = fetch_20newsgroups(subset='train')
    TEXTS = newsgroups_train.data

    # spacy's english model for text preprocessing
    NLP = spacy.load('en')

    # a set of stopwords built into spaCy; we can always expand this set
    # for the problem we are working on. Here we also add Python's built-in
    # string punctuation marks and the ENGLISH_STOP_WORDS set
    STOPWORDS = spacy.en.STOP_WORDS | set(punctuation) | set(
        ENGLISH_STOP_WORDS)

    # create a directory called 'model' to store all outputs from the later sections
    MODEL_DIR = 'model'
    UNIGRAM_PATH = os.path.join(MODEL_DIR, 'unigram.txt')
    PHRASE_MODEL_CHECKPOINT = os.path.join(MODEL_DIR, 'phrase_model')
    BIGRAM_PATH = os.path.join(MODEL_DIR, 'bigram.txt')
    WORD2VEC_CHECKPOINT = os.path.join(MODEL_DIR, 'word2vec')

    # -------------------------------------------------------------------------------
    logger.info('job started')
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    if not os.path.exists(UNIGRAM_PATH):
        logger.info('preprocessing text')
        export_unigrams(UNIGRAM_PATH,
                        texts=TEXTS,
                        parser=NLP,
                        stopwords=STOPWORDS)

    if os.path.exists(PHRASE_MODEL_CHECKPOINT):
        phrase_model = Phrases.load(PHRASE_MODEL_CHECKPOINT)
    else:
        logger.info('training phrase model')
        # use LineSentence to stream text instead of loading it all into memory
        unigram_sentences = LineSentence(UNIGRAM_PATH)
        phrase_model = Phrases(unigram_sentences)
        phrase_model.save(PHRASE_MODEL_CHECKPOINT)

    if not os.path.exists(BIGRAM_PATH):
        logger.info('converting words to phrases')
        export_bigrams(UNIGRAM_PATH, BIGRAM_PATH, phrase_model)

    if os.path.exists(WORD2VEC_CHECKPOINT):
        word2vec = Word2Vec.load(WORD2VEC_CHECKPOINT)
    else:
        logger.info('training word2vec')
        sentences = LineSentence(BIGRAM_PATH)
        word2vec = Word2Vec(sentences, workers=cpu_count())
        word2vec.save(WORD2VEC_CHECKPOINT)

    logger.info('job completed')
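# ------------------------------------------------------------------
# Sketch (not part of the original example): LineSentence expects one
# sentence per line with whitespace-separated tokens, which is why the
# export_unigrams / export_bigrams steps above write plain text files
# that can later be streamed without loading the corpus into memory.
from gensim.models.word2vec import LineSentence

with open('unigram_demo.txt', 'w') as f:
    f.write('human interface computer\n')
    f.write('graph of trees\n')

for sentence in LineSentence('unigram_demo.txt'):
    print(sentence)  # ['human', 'interface', 'computer'], then ['graph', 'of', 'trees']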
def _phrase_detection_(fpath=fpathroot + fpathappend,
                       passes=2,
                       returnmodels=True,
                       threshold=10.):
    """
    This function does pharse modeling. User specifies the number of passes.
    Each pass detects longer phrases. The maximum detectable phrase length for
    each pass, n, is 2*n.

    Returns the list of models by default. Also saves models and intermediary
    phrased sentences for each pass.
    """
    generpath = fpath + '_sent_gram_0.txt'
    ngram = list()
    for it in range(passes):
        gen = LineSentence(generpath)
        gram = Phrases(gen, threshold=threshold)
        ngram.append(gram)
        modelpath = fpath + 'phrase_model_gram_' + str(it + 1)
        generpath = fpath + 'sent_gram_' + str(it + 1) + '.txt'
        gram.save(modelpath)
        # Write sentence gram
        with codecs.open(generpath, 'w', encoding='utf_8') as f:
            for sent in gen:
                new_sent = u' '.join(gram[sent])
                f.write(new_sent + '\n')

    if returnmodels:
        return ngram
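# ------------------------------------------------------------------
# Minimal, self-contained sketch of the multi-pass idea documented above
# (illustrative sentences and thresholds): each pass re-runs Phrases on
# the output of the previous pass, so pass 2 can merge two bigrams into
# a phrase of up to four tokens.
from gensim.models.phrases import Phrases

sentences = [
    ['new', 'york', 'stock', 'exchange', 'opened', 'higher'],
    ['traders', 'at', 'the', 'new', 'york', 'stock', 'exchange'],
    ['the', 'new', 'york', 'stock', 'exchange', 'closed', 'early'],
]

pass1 = Phrases(sentences, min_count=1, threshold=1)      # unigrams -> bigrams
bigram_sents = [pass1[sent] for sent in sentences]
pass2 = Phrases(bigram_sents, min_count=1, threshold=1)   # bigrams -> up to 4-token phrases
print(pass2[bigram_sents[0]])  # e.g. ['new_york_stock_exchange', 'opened', 'higher'] if the scores pass the threshold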
    def build_trigrams(self):

        self.bigram_model = Phrases([
            doc.split(" ") for doc in self.lemmatized_sentence_corpus(
                self.df_prod['reviews'].values)
        ],
                                    min_count=2)
        bigram_sentences = []
        for unigram_sentence in self.lemmatized_sentence_corpus(
                self.df_prod['reviews'].values):
            bigram_sentences.append(' '.join(
                self.bigram_model[unigram_sentence.split(" ")]))
        self.trigram_model = Phrases(
            [doc.split(" ") for doc in bigram_sentences], min_count=2)

        self.trigrams_doc = []
        for doc in self.df_prod['reviews'].values:
            parsed_doc = self.wp.nlp(doc)
            bigram_doc = ' '.join(
                self.bigram_model[(token.lemma_ for token in parsed_doc
                                   if self.keep_token(token))])
            trigram_tokens = self.trigram_model[bigram_doc.split(" ")]
            trigram_doc = ' '.join(trigram_tokens)
            self.trigrams_doc.append(trigram_tokens)
Example n. 4
    def process_tokens(self, lemmatize=True, lower=True, phrases=True):

        tokens = [
            [
                token for token in raw_token
                # TODO: Add like_num option?
                if (token.pos_ in self.keep_pos) and (not token.is_stop) and (
                    token.is_alpha)
            ] for raw_token in tqdm(
                self.docs, total=self.n_docs, desc="Processing tokens")
        ]

        if lemmatize:
            tokens = [[token.lemma_ for token in doc] for doc in tokens]
        else:
            tokens = [[token.text for token in doc] for doc in tokens]

        if lower:
            tokens = [[token.lower() for token in doc] for doc in tokens]

        if phrases:
            # TODO: Add n-gram pattern matching with spacy
            bigrams = Phrases(tokens, delimiter=b"_", min_count=2)
            trigrams = Phrases(bigrams[tokens], delimiter=b"_", min_count=2)

            # extract bigrams and trigrams
            tokens = [bigrams[doc] for doc in tokens]
            tokens = [trigrams[doc] for doc in tokens]

        return tokens
Example n. 5
    def trigramGenerator(self):
        corpusStream = self.sentenceStream()
        biGramPhrases = Phrases(corpusStream,
                                min_count=self.bigramMinCount,
                                threshold=self.thresholdBigram)
        bigram = Phraser(biGramPhrases)

        inputStream = self.sentenceStream()
        bigramSentenceList = (bigram[sentence] for sentence in inputStream)

        triGramPhrases = Phrases(bigramSentenceList,
                                 min_count=self.trigramMinCount,
                                 threshold=self.thresholdTrigram)
        trigram = Phraser(triGramPhrases)

        inputStream = self.sentenceStream()
        bigramSentenceList = (bigram[sentence] for sentence in inputStream)
        trigramSentenceList = (trigram[sentence]
                               for sentence in bigramSentenceList)

        trigramList = set()
        for trigramSentence in trigramSentenceList:
            for item in trigramSentence:
                if "_" in item:
                    trigramList.add(item)

        print("Number of Unique Trigrams = ", len(trigramList))
        for item in sorted(trigramList):
            if not os.path.exists(self.trainingLocation):
                os.makedirs(self.trainingLocation)
            with open(
                    os.path.join(self.trainingLocation,
                                 "TC-phrases-bi-tri.txt"), "a") as outFile:
                outFile.write(item + "\n")
Example n. 6
def visualize_lda_model():
    data = preprocess_to_lemmatization()
    stopwords_verbs = [
        'say', 'get', 'go', 'know', 'may', 'need', 'like', 'make', 'see',
        'want', 'come', 'take', 'use', 'would', 'can'
    ]
    stopwords_other = [
        'one', 'mr', 'bbc', 'image', 'getty', 'de', 'en', 'caption', 'also',
        'copyright', 'something'
    ]
    my_stopwords = stopwords.words(
        'english') + stopwords_verbs + stopwords_other
    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens if token.isalpha() and token.lower()
        not in my_stopwords and len(token) > 1
    ])
    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    dictionary_LDA = corpora.Dictionary(tokens)
    dictionary_LDA.filter_extremes(no_below=3)
    corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                      id2word=dictionary_LDA, \
                                      passes=4, alpha=[0.01]*num_topics, \
                                      eta=[0.01]*len(dictionary_LDA.keys()))
    lda_viz = gensimvis.prepare(lda_model, corpus, dictionary_LDA)
    pyLDAvis.enable_notebook()
    return pyLDAvis.display(lda_viz)
Example n. 7
def getTopics(jobs_):
  
    bigram_model = Phrases.load('data/bigram_model_all')
    trigram_model = Phrases.load('data/trigram_model_all')
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    lda = LdaMulticore.load('data/lda_model_all')

    topic_names = {0:u'Risk Management Bank', 
                   1:u'Big Data Report', 
                   2:u'Automotive SAP', 
                   3:u'Microsoft Java Scrum', 
                   4:u'Medical Consultant', 
                   5:u'Java Engineer', 
                   6:u'Computer Vision Developer', 
                   7:u'Data Analyst', 
                   8:u'BI SAP BW', 
                   9:u'IOT Reporting R', 
                   10:u'Global Project Presentation',
                   11:u'Cloud Engineer IOT', 
                   12:u'Industry 4.0', 
                   13:u'Risk Consulting', 
                   14:u'Machine Learning Data Science'}
    
    topics_ = []
    
    for job_ in jobs_:
      if job_ is not None:
        #print(job_[0])
        topics_.append(lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, job_[1], job_[0]))

    return topics_
    def _preprocess(self, text, min_tok_len=1):
        stop_words = set(nltk.corpus.stopwords.words('english'))
        lemm_stemm = lambda tok: WordNetLemmatizer().lemmatize(tok, pos='v')

        result = []

        #remove proper nouns
        tagged_sent = pos_tag(text.split())
        noProper = [word for word, pos in tagged_sent if pos != 'NNP']
        noProper = ' '.join(noProper)

        for token in simple_preprocess(noProper):
            if len(token) > min_tok_len and token not in stop_words:
                result.append(lemm_stemm(token))

        # Build the bigram and trigram models. Phrases expects an iterable of
        # token lists, so wrap the single tokenized document in a list.
        bigram = Phrases([result], min_count=5,
                         threshold=10)  # higher threshold, fewer phrases
        trigram = Phrases(bigram[[result]], threshold=10)

        # Faster way to get a sentence clubbed as a trigram/bigram
        bigram_mod = Phraser(bigram)
        trigram_mod = Phraser(trigram)

        result = trigram_mod[bigram_mod[result]]

        return [result]
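# ------------------------------------------------------------------
# Sketch (illustrative data): the Phrases -> Phraser step used above.
# Phraser (FrozenPhrases in gensim >= 4.0) keeps only the detected
# phrases, so it is smaller and faster to apply, but can no longer be
# updated with new vocabulary.
from gensim.models.phrases import Phrases, Phraser

docs = [['machine', 'learning', 'is', 'fun'],
        ['machine', 'learning', 'needs', 'data'],
        ['deep', 'learning', 'is', 'machine', 'learning']]

phrases = Phrases(docs, min_count=1, threshold=1)  # trainable, keeps full counts
phraser = Phraser(phrases)                         # frozen, apply-only
print(phraser[['machine', 'learning', 'rocks']])   # e.g. ['machine_learning', 'rocks']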
def train_bigram(unigram_txt_filepath, bigram_model_filepath, savebigram,
                 bigram_txt_filepath):
    print('reading unigram text file.....')

    unigram_txt = LineSentence(unigram_txt_filepath)
    print('training bigram model.....')

    bigram_model = Phrases(unigram_txt)
    print('saving bigram model.....')

    bigram_model.save(bigram_model_filepath)

    # load the finished model from disk
    # bigram_model = Phrases.load(bigram_model_filepath)

    if savebigram:
        print('saving bigram processed text file....')

        with codecs.open(bigram_txt_filepath, 'w', encoding='utf_8') as f:
            i = 0
            for unigram_sentence in tqdm(unigram_txt):

                bigram_sentence = u' '.join(bigram_model[unigram_sentence])

                f.write(bigram_sentence + '\n')

                i = i + 1

                if (i % 10000 == 0):
                    print('Bigram Processed ' + str(i) + ' articles')
Example n. 10
def bigrams(corpus, output_prefix):
    print("----- Bigram -----")
    if os.path.exists(output_prefix + "_bigram_phrases"):
        bigram_phrases = Phrases.load(output_prefix + "_bigram_phrases")
        print("Loaded bigram phrases")
    else:
        bigram_phrases = Phrases(corpus, min_count=CONFIG["bigram_phrase_min_count"], threshold=CONFIG["bigram_phrase_threshold"], progress_per=CONFIG["bigram_phrase_progress_per"], delimiter=CONFIG["bigram_phrase_delimiter"])
        bigram_phrases.save(output_prefix + "_bigram_phrases")
    bigram_transformer = Phraser(bigram_phrases)

    dct = Dictionary(bigram_transformer[corpus])
    dct.save(output_prefix + "_dictionary_bigram")
    print("Training tf-idf from bigrams")
    bow_corpus = [dct.doc2bow(line) for line in bigram_transformer[corpus]]
    tfidf = gensim.models.TfidfModel(bow_corpus, smartirs='ntc')
    tfidf.save(output_prefix + "_tfidf_bigram")
    print("Training word2vec model with bigrams (may be unnecessary if trigrams work as expected)")
    start_time = time()
    bigram_model = gensim.models.Word2Vec(bigram_transformer[corpus], size=CONFIG['vector_size'], window=CONFIG['window_size'],
                                   min_count=CONFIG['min_count'], workers=CONFIG['worker_count'], sg=CONFIG['sg'],
                                   negative=CONFIG['negative_size'], alpha=CONFIG['alpha'], min_alpha = CONFIG['min_alpha'],
                                   iter=CONFIG['train_epoch'])
    bigram_model.save(output_prefix + "_bigram")
    print("Time :", format_time(time() - start_time))
    return bigram_model
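# ------------------------------------------------------------------
# Illustrative values for the CONFIG dictionary this example assumes;
# the keys are taken from the calls above, the values are only guesses.
CONFIG = {
    "bigram_phrase_min_count": 5,
    "bigram_phrase_threshold": 10.0,
    "bigram_phrase_progress_per": 10000,
    "bigram_phrase_delimiter": b"_",
    "vector_size": 200,
    "window_size": 5,
    "min_count": 5,
    "worker_count": 4,
    "sg": 1,
    "negative_size": 5,
    "alpha": 0.025,
    "min_alpha": 0.0001,
    "train_epoch": 5,
}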
Example n. 11
def main():

    input = LineSentence('cleaned_judgments2')
    bigram = Phrases(input)
    trigram = Phrases(bigram[input])
    model = Word2Vec(trigram[bigram[input]], sg=0, size=300, window=5, min_count=3, workers=8)
    model.save('model2')
Example n. 12
def train_phrases(paths, out='data/bigram_model.phrases', **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    max_vocab_size = 40000000

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.)

    print('Saving...')
    bigram.save(out)

    print('Some examples:')
    docs = [
        ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper'],
        ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'],
        ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'],
        ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'],
        ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington']
    ]
    for r in bigram[docs]:
        print(r)
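# ------------------------------------------------------------------
# _phrase_doc_stream is not shown in this example; a minimal sketch under
# the assumption of one document per line and whitespace tokenization
# (n is only used for progress reporting here).
def _phrase_doc_stream(paths, n, tokenizer=None):
    tokenize = tokenizer or (lambda line: line.split())
    seen = 0
    for path in paths:
        with open(path, 'r') as f:
            for line in f:
                seen += 1
                if seen % 100000 == 0:
                    print('{0}/{1} lines streamed...'.format(seen, n))
                yield tokenize(line.strip().lower())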
Example n. 13
def main():
    get_args()

    def sentences():
        return chain.from_iterable(
            (read_slice(data) for data in read_corpus()))

    bigram = Phrases(sentences(), min_count=1, threshold=1, delimiter=b' ')
    bigram_phraser = Phraser(bigram)

    bigrammed = map(lambda x: bigram_phraser[x], sentences())

    trigram = Phrases(bigrammed, min_count=1, threshold=1, delimiter=b' ')
    trigram_phraser = Phraser(trigram)

    only_trigrams = {b' '.join(trigram_tuple): score for (trigram_tuple, score) in \
        trigram_phraser.phrasegrams.items() if b' '.join(trigram_tuple).count(b' ') == 2}

    for key, value in sorted(only_trigrams.items(),
                             key=lambda item: item[1],
                             reverse=True)[:10]:
        print(key, value)

    scores = list(only_trigrams.values())
    print("""
    Unique trigrams: {unique}
    Mean score:{mean}
    Max score:{max}
    Min score:{min}
    """.format(unique=len(only_trigrams),
               mean=mean(scores) if len(scores) != 0 else 0,
               max=max(scores) if len(scores) != 0 else 0,
               min=min(scores) if len(scores) != 0 else 0))
Example n. 14
def get_top_phrases(documents):
    documents_split = [doc.split() for doc in documents]
    remove_from_stop_words = [
        "would", "what", "which", "who", "whom", "when", "where", "why", "how",
        "could"
    ]
    words_to_remove = [
        'yeah', 'okay', 'like', 'oh', 'also', 'and', 'so', 'hey', 'hello'
    ]
    custom_stopwords = [
        sw for sw in stopwords.words('english')
        if sw not in remove_from_stop_words and sw not in words_to_remove
    ]

    bigram = Phrases(documents_split,
                     min_count=1,
                     delimiter=b' ',
                     common_terms=custom_stopwords)
    trigram = Phrases(bigram[documents_split],
                      min_count=1,
                      delimiter=b' ',
                      common_terms=custom_stopwords)

    cnt = Counter([
        t for sent in documents_split for t in trigram[bigram[sent]]
        if t.count(' ') >= 1
    ])
    return cnt.most_common()
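# ------------------------------------------------------------------
# Sketch (illustrative data): common_terms, used above, lets stopwords
# sit inside a phrase without blocking it (the parameter was renamed
# connector_words in gensim 4.0).
from gensim.models.phrases import Phrases

sents = [['state', 'of', 'the', 'art', 'model'],
         ['a', 'state', 'of', 'the', 'art', 'result'],
         ['state', 'of', 'the', 'art', 'performance']]
phrases = Phrases(sents, min_count=1, threshold=1, common_terms=['of', 'the'])
print(phrases[['state', 'of', 'the', 'art', 'results']])
# e.g. ['state_of_the_art', 'results'] once the pair passes the threshold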
def get_test_reviews():
    doc_reviews = {}
    sent_reviews = {}
    num_docs = 0
    num_words = 0
    apk_path = os.path.join("..", "data", "raw")
    apk_lst_path = os.path.join(apk_path, "package_names.txt")
    # load phrases
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
    with open(apk_lst_path) as fin:
        apk_lst = [apk_name.strip() for apk_name in fin.readlines()]
    for apk_name in apk_lst:
        file = os.path.join(apk_path, "mongodb", apk_name, "review.txt")
        with open(file) as fin:
            reviews_sent = []
            reviews_doc = []
            for line in fin.readlines():
                words_sents, wc = extractSentenceWords(line)
                reviews_sent.append(words_sents)
                reviews_doc.append(list(itertools.chain.from_iterable(words_sents)))
                num_docs += 1
                num_words += wc
            sent_reviews[apk_name] = trigram[bigram[reviews_sent]]
            doc_reviews[apk_name] = trigram[bigram[reviews_doc]]

    logging.info("Read %d docs, %d words!" % (num_docs, num_words))
    return sent_reviews, doc_reviews
    def fetch_document_bigrams(self, document_lemmas, number_of_bigrams=100):
        """
        Given the list of lemmas for a document, it extracts up to N bigrams
        found in that document, where N=number_of_bigrams.
        """
        if not self.include_bigrams:
            return []

        bigram = Phrases()
        bigram.add_vocab([document_lemmas])
        bigram_counter = Counter()

        for key in bigram.vocab.keys():
            if key not in STOPWORDS_BYTES:
                if len(key.split("_")) > 1:
                    bigram_counter[key] += bigram.vocab[key]

        bigram_iterators = [
            repeat(bigram, bigram_count)
            for bigram, bigram_count
            in bigram_counter.most_common(number_of_bigrams)
        ]
        found_bigrams = list(chain(*bigram_iterators))
        known_bigrams = [bigram for bigram in found_bigrams if bigram in self.top_bigrams]

        return known_bigrams
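# ------------------------------------------------------------------
# Sketch (illustrative data): the method above relies on Phrases.add_vocab
# to grow the phrase statistics incrementally and then inspects the raw
# .vocab counts. Keys in .vocab are delimiter-joined n-grams (bytes in
# older gensim releases, str in gensim >= 4.0).
from gensim.models.phrases import Phrases

bigram = Phrases(min_count=1, threshold=1)
bigram.add_vocab([['new', 'york', 'city']])
bigram.add_vocab([['new', 'york', 'times']])
print(sorted(bigram.vocab)[:5])  # unigram and joined bigram counts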
Example n. 17
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    in_dir, model_out = sys.argv[1:]
    sentences = Corpus(in_dir)
    phrases = Phrases(sentences)
    phrases.save(model_out)
Example n. 18
def load_model():
    bigram = Phrases.load(os.path.join("..", "model", "bigram.model"))
    trigram = Phrases.load(os.path.join("..", "model", "trigram.model"))
    wv_model = Word2Vec.load(
        os.path.join("..", "model", "appreviews_word2vec.model"))
    logging.info("Load word2vec model finished")
    return bigram, trigram, wv_model
Example n. 19
def tokeniseAll(posts, stopWords, urduNames):
    '''Function to tokenise all posts in the file, including n-grams

    Parameters
    ---------------------------------------
    posts: a list of comments (e.g. a pandas DataFrame column converted with .tolist())

    stopWords: a list of stopwords

    urduNames: a list of common Urdu names'''

    #posts = comments.tolist()
    n_grams = 3
    tokenized_corp = []
    for doc in posts:
        tokenized_corp.append(createToken(doc, stopWords, urduNames))

    # Add n_grams
    bigram = Phrases(tokenized_corp, min_count=5, threshold=10)
    trigram = Phrases(bigram[tokenized_corp], threshold=10)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    if n_grams > 1:
        for i, doc in enumerate(tokenized_corp):
            tokenized_corp[i] = bigram_mod[doc]
            if n_grams > 2:
                tokenized_corp[i] = trigram_mod[bigram_mod[doc]]
    return tokenized_corp
Example n. 20
def add_lda_topics(data):
    # LDA Model
    lda = models.LdaModel.load('classifiers/lda_model/lda_model')

    # get tokens from text
    corpus = data['text'].to_list()
    data['sentences'] = data.text.map(sent_tokenize)
    data['tokens_sentences'] = data['sentences'].map(
        lambda sentences: [word_tokenize(sentence) for sentence in sentences])
    data['POS_tokens'] = data['tokens_sentences'].map(
        lambda tokens_sentences:
        [pos_tag(tokens) for tokens in tokens_sentences])
    data['tokens_sentences_lemmatized'] = data['POS_tokens'].map(
        lambda list_tokens_POS: [[
            lemmatizer.lemmatize(el[0], get_wordnet_pos(el[1]))
            if get_wordnet_pos(el[1]) != '' else el[0] for el in tokens_POS
        ] for tokens_POS in list_tokens_POS])
    stopwords_custom = [
        '[', ']', 'RT', '#', '@', ',', '.', '!', 'http', 'https'
    ]
    my_stopwords = list(spacy_stopwords) + stopwords_custom

    data['tokens'] = data['tokens_sentences_lemmatized'].map(
        lambda sentences: list(chain.from_iterable(sentences)))
    data['tokens'] = data['tokens'].map(lambda tokens: [
        token.lower() for token in tokens if token.isalpha() and token.lower()
        not in my_stopwords and len(token) > 1
    ])

    tokens = data['tokens'].tolist()
    bigram_model = Phrases(tokens)
    trigram_model = Phrases(bigram_model[tokens], min_count=1)
    tokens = list(trigram_model[bigram_model[tokens]])

    # create new_corpus
    dictionary_LDA = corpora.Dictionary(tokens)
    unseen_corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]

    # run model on new_corpus
    np.random.seed(123456)
    num_topics = 20
    lda_model = models.LdaModel(unseen_corpus, num_topics=num_topics, \
                                    id2word=dictionary_LDA, \
                                    passes=4, alpha=[0.01]*num_topics, \
                                    eta=[0.01]*len(dictionary_LDA.keys()))

    # get document topic and append to df
    topics = [lda_model[unseen_corpus[i]] for i in range(len(data))]

    # like TF-IDF, create a matrix of topic weighting, with documents as rows and topics as columns
    document_topic = \
    pd.concat([topics_document_to_dataframe(topics_document, num_topics=num_topics) for topics_document in topics]).reset_index(drop=True).fillna(0)

    data = pd.concat([data, document_topic], axis=1, sort=False)
    data = data.drop([
        'sentences', 'tokens_sentences', 'POS_tokens',
        'tokens_sentences_lemmatized', 'tokens'
    ], axis=1)

    return data
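# ------------------------------------------------------------------
# topics_document_to_dataframe is not shown above; a minimal sketch that
# matches how it is called: it turns one document's [(topic_id, weight), ...]
# list into a single-row DataFrame with one column per topic.
import pandas as pd

def topics_document_to_dataframe(topics_document, num_topics):
    res = pd.DataFrame(columns=range(num_topics))
    for topic_id, weight in topics_document:
        res.loc[0, topic_id] = weight
    return res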
Example n. 21
def visulaizer_of_gensim(content_list):
    stop_words = stopwords.words('english')

    data_words = list(sent_to_words(content_list))

    bigram = Phrases(data_words, min_count=5, threshold=100)
    trigram = Phrases(bigram[data_words], threshold=100)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    data_words_nostops = remove_stopwords(data_words, stop_words)
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)
    data_words_trigrams = make_trigrams(data_words_bigrams, bigram_mod,
                                        trigram_mod)
    data_lemmatized = lemmatization(
        data_words_trigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    id2word = corpora.Dictionary(data_lemmatized)
    texts = data_lemmatized
    corpus = [id2word.doc2bow(text) for text in texts]

    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=20,
                         random_state=100,
                         update_every=1,
                         chunksize=100,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True)

    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

    return vis
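# ------------------------------------------------------------------
# The helpers used above (sent_to_words, remove_stopwords, make_bigrams,
# make_trigrams, lemmatization) are not shown in this example; minimal
# sketches with signatures inferred from the calls above.
import gensim
import spacy

def sent_to_words(texts):
    for text in texts:
        yield gensim.utils.simple_preprocess(str(text), deacc=True)

def remove_stopwords(texts, stop_words):
    return [[word for word in doc if word not in stop_words] for doc in texts]

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    return [[token.lemma_ for token in nlp(' '.join(doc))
             if token.pos_ in allowed_postags] for doc in texts]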
Example n. 22
    def get_trigrams(self):
        """
        Builds unigram, bigram, and trigram models respectively.
        Writes the text of each model to a separate file.

        """
        unigram_sentences = LineSentence(self.unigram_sentences_filepath)
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(self.bigram_model_filepath)
        bigram_model = Phrases.load(self.bigram_model_filepath)
        with open(self.bigram_sentences_filepath, 'w', encoding="utf-8") as f:
            for unigram_sentence in unigram_sentences:
                bigram_sent = " ".join(bigram_model[unigram_sentence])  # a bit confused by this.
                f.write(bigram_sent)
        bigram_sentences = LineSentence(self.bigram_sentences_filepath)
        trigram_model = Phrases(bigram_sentences)
        trigram_model.save(self.trigram_model_filepath)
        trigram_model = Phrases.load(self.trigram_model_filepath)
        with open(self.trigram_sentences_filepath, 'w', encoding="utf-8") as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = " ".join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')
        trigram_sentences = LineSentence(self.trigram_sentences_filepath)
        with open(self.trigram_articles_filepath, 'w', encoding="utf-8") as f:
            for parsed_article in self.line_article("../data/article_texts"):
                unigram_article = [token.lemma_ for token in self.nlp(parsed_article)
                                   if not self.punct_space(token)]
                bigram_article = bigram_model[unigram_article]
                trigram_article = trigram_model[bigram_article]
                trigram_article = [term for term in trigram_article
                                    if term not in STOP_WORDS]
                trigram_article = " ".join(trigram_article)
                f.write(trigram_article + '\n')
Example n. 23
    def add_n_grams(self, n=2, min_count=1):
        logging.info('Performing normalization.')
        logging.debug('n=' + str(n))
        logging.debug('min_count=' + str(min_count))

        logging.info('Adding 2-grams')
        bigram = Phrases(self.corpus, min_count=min_count, delimiter=b' ')

        if n == 3:
            logging.info('Adding 3-grams')
            trigram = Phrases(bigram[self.corpus], min_count=1, delimiter=b' ')
            for document in range(self.N):
                self.corpus[document] = [
                    n_gram for n_gram in trigram[bigram[self.corpus[document]]]
                    if n_gram.count(' ') < n
                ]
        elif n == 2:
            for document in range(self.N):
                self.corpus[document] = [
                    n_gram for n_gram in bigram[self.corpus[document]]
                    if n_gram.count(' ') < n
                ]
        else:
            logging.warning('Invalid parameter! Skipping n-grams...')
        return
 def train_with_trigrams(self):
     trigram_model = Phrases.load(self.trigram_model_filepath)
     bigram_model = Phrases.load(self.bigram_model_filepath)
     for doc, id in self.es_docs():
         unigrams = text_cleaner.clean_tokens(doc)
         bigrams = bigram_model[unigrams]
         trigrams = trigram_model[bigrams]
         trigrams = text_cleaner.filter_terms(trigrams)
         td = TaggedDocument(trigrams, [id])
         self.taggeddoc.append(td)
     print('Data Loading finished')
     print(len(self.taggeddoc), type(self.taggeddoc))
     model = gensim.models.Doc2Vec(self.taggeddoc,
                                   dm=0,
                                   iter=1,
                                   window=15,
                                   seed=1337,
                                   min_count=5,
                                   workers=4,
                                   alpha=0.025,
                                   size=200,
                                   min_alpha=0.025)
     for epoch in range(200):
         if epoch % 20 == 0:
             print('Now training epoch %s' % epoch)
         model.train(self.taggeddoc,
                     total_examples=model.corpus_count,
                     epochs=model.iter)
         model.alpha -= 0.002  # decrease the learning rate
         model.min_alpha = model.alpha  # fix the learning rate, no decay
     model.save(self.model_file)
     model.save_word2vec_format(self.model_file + '.word2vec')
def generate_bow(corpus_filename, category, use_bigrams, no_above, no_below):
    if not os.path.exists('./data/%s' % category):
        os.makedirs('./data/%s' % category)

    tokens = [
        utils.tokenize(line)
        for line, label in zip(open('./data/%s.csv' % corpus_filename),
                               open('./data/corpus-labels.csv'))
        if category in label
    ]
    print('First token', tokens[1])

    category_filename = corpus_filename.replace('corpus', 'category')

    #Each category gets its own dictionary and its own corpus, but uses the same bigram model
    #that was computed on all the abstracts
    if use_bigrams:
        if not os.path.exists('./data/%s/bigram.bin' % category):
            bigram = Phrases(
                utils.tokenize(line)
                for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                       open('./data/corpus-labels.csv'))
                if category in label)
            bigram.save('./data/%s/bigram.bin' % category)
        else:
            bigram = Phrases.load('./data/%s/bigram.bin' % category)

        tokens = [bigram[token] for token in tokens]
        print('First bigram token', tokens[1])

    #Make the dictionary, a collection of statistics about all tokens in the corpus
    #This is the mapping from words to their id's. It's the lookup table for features.
    dictionary = corpora.Dictionary(tokens)

    # filter out words that appear in too few or too many documents
    dictionary.filter_extremes(
        no_below=no_below, no_above=no_above)  # no_above=0.05, no_below=10 yielded good results
    # remove gaps in id sequence after words that were removed
    dictionary.compactify()

    # store the dictionary, for future reference
    dictionary.save('./data/%s/%s.dict' % (category, category_filename))

    # memory-friendly bag-of-words class
    class BOW(object):
        def __iter__(self):
            for line, label in zip(open('./data/%s.csv' % corpus_filename),
                                   open('./data/corpus-labels.csv')):
                # assume there's one document per line, tokens separated by whitespace
                if category in label:
                    yield dictionary.doc2bow(utils.tokenize(line))
                else:
                    pass

    # Now we can make a bag of words and do something with it by iterating over it
    arxiv_bow = BOW()
    corpora.MmCorpus.serialize('./data/%s/%s.mm' %
                               (category, category_filename),
                               arxiv_bow)  # store to disk, for later use
Example n. 26
    def get_ngrams(cls,
                   tokens: List[List[str]],
                   n: int = 2,
                   min_count: int = 3,
                   delimiter: bytes = b' ',
                   stop: Optional[List[str]] = None) -> List[List[str]]:
        """Add up to tri-grams to a list of tokens.
        
        Args:
            tokens:
                The list of paragraph tokens from which to search for ngrams.
            n:
                Optional, either '2' or '3'; Up to bigrams or trigrams. The default is to
                add up to bigrams.
            min_count: 
                Optional; The minimum number of occurrences for an ngram to be
                added. The default is to add ngrams that occur at least 3 times.
            delimiter:
                Optional; The byte string to separate words in an n-gram. The
                default is to separate words in an n-gram with a space.
            stop:
                Optional; A list of stop words.
        
        Returns:
            A list of sentence tokens plus ngrams.
        """
        # Break down the list of paragraph tokens into a list of sentences tokens
        tokens = [token for paragraph in tokens for token in paragraph]
        sentences = [
            list(group) for is_delim, group in groupby(
                tokens,
                lambda token: re.match(cls.is_sentence_delimiter, token))
            if not is_delim
        ]
        amt_sentences = len(sentences)

        # Find the bigrams
        bigram = Phrases(sentences,
                         min_count=min_count,
                         delimiter=delimiter,
                         common_terms=stop)

        if n == 3:
            # Find the trigrams
            trigram = Phrases(bigram[sentences],
                              min_count=1,
                              delimiter=delimiter,
                              common_terms=stop)
            for sentence in range(amt_sentences):
                sentences[sentence] = [
                    n_gram for n_gram in trigram[bigram[sentences[sentence]]]
                ]
        else:
            for sentence in range(amt_sentences):
                sentences[sentence] = [
                    n_gram for n_gram in bigram[sentences[sentence]]
                ]

        return sentences
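# ------------------------------------------------------------------
# Sketch (hypothetical setup): is_sentence_delimiter is assumed to be a
# regex matching sentence-ending tokens, e.g. r'[.!?]'. groupby then
# splits the flat token stream into sentences before phrase detection,
# mirroring the comprehension used in get_ngrams above.
import re
from itertools import groupby

is_sentence_delimiter = r'[.!?]'
tokens = ['i', 'like', 'new', 'york', '.', 'new', 'york', 'is', 'big', '.']
sentences = [list(group) for is_delim, group in groupby(
    tokens, lambda tok: re.match(is_sentence_delimiter, tok)) if not is_delim]
print(sentences)  # [['i', 'like', 'new', 'york'], ['new', 'york', 'is', 'big']]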
def create_ngram_models(documents):
    bigram = Phrases(documents, min_count=5, threshold=100)
    trigram = Phrases(bigram[documents], threshold=100)

    bigram_model = Phraser(bigram)
    trigram_model = Phraser(trigram)

    return bigram_model, trigram_model
Example n. 28
def get_bigram_model():
    model_exists = os.path.exists(bigram_model_filepath)
    if model_exists:
        bigram_model = Phrases.load(bigram_model_filepath)
    else:
        unigram_sentences = get_unigram_sentences()
        bigram_model = Phrases(unigram_sentences)
        bigram_model.save(bigram_model_filepath)
    return bigram_model
def learnMultiword(ret):
    print("Learning multiword expressions")
    bigram = Phrases(ret)
    bigram.save("phrase_all.model")

    print("Sanity checking multiword expressions")
    test = "i like donald trump and hate muslims , go hillary , i like jesus , jesus , against , abortion "
    sent = test.split(" ")
    print(bigram[sent])
    return bigram[ret]
Example n. 32
def n_gram(df):
    """ 
    Trigram model of word probabilities (old-fashioned TM)
    @param df: DataFrame with a 'token' column
    """
    token = df.token.tolist()
    bigram_model = Phrases(token)
    trigram_model = Phrases(bigram_model[token], min_count=1)
    token_list = list(trigram_model[bigram_model[token]])
    return token_list
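# ------------------------------------------------------------------
# Usage sketch (illustrative data, not from the original example): n_gram()
# expects a DataFrame whose 'token' column holds one token list per row.
import pandas as pd

df = pd.DataFrame({'token': [['new', 'york', 'city', 'is', 'big'],
                             ['i', 'love', 'new', 'york', 'city']]})
print(n_gram(df))  # on a larger corpus, frequent collocations come back joined by '_'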
Example n. 33
class OverkillTokenizer(Tokenizer):
    def __init__(self, lemmatize=True, n_jobs=1, bigram=None, trigram=None, min_count=5, threshold=10.):
        self.lemmatize = lemmatize
        self.n_jobs = n_jobs
        self.bigram = bigram
        self.trigram = trigram
        self.min_count = min_count
        self.threshold = threshold

    def tokenize(self, docs):
        # avoid a NameError when lemmatize=False; pre_tokenize is assumed
        # to accept lem=None in that case
        lem = WordNetLemmatizer() if self.lemmatize else None

        #print('RAKE tokenizing...')
        pre_tdocs = RAKETokenizer(n_jobs=self.n_jobs).tokenize(docs)

        for i, tdoc in enumerate(pre_tdocs):
            for t in tdoc:
                if t.startswith('one'):
                    print(t)
                    print(i)

        #print('Additional Tokenizing docs...')
        if self.n_jobs == 1:
            tdocs = [pre_tokenize(doc, tdoc, lem=lem) for doc, tdoc in zip(docs, pre_tdocs)]
        else:
            tdocs = parallel(partial(pre_tokenize, lem=lem), zip(docs, pre_tdocs), self.n_jobs, expand_args=True)

        #print('Training bigram...')
        if self.bigram is None:
            self.bigram = Phrases(tdocs,
                                  min_count=self.min_count,
                                  threshold=self.threshold,
                                  delimiter=b' ')
        else:
            self.bigram.add_vocab(tdocs)

        #print('Training trigram...')
        if self.trigram is None:
            self.trigram = Phrases(self.bigram[tdocs],
                                   min_count=self.min_count,
                                   threshold=self.threshold,
                                   delimiter=b' ')
        else:
            self.trigram.add_vocab(self.bigram[tdocs])

        return [tdoc for tdoc in self.trigram[self.bigram[tdocs]]]
Example n. 34
def train_phrases(paths, out='data/bigram_model.phrases', tokenizer=word_tokenize, **kwargs):
    """
    Train a bigram phrase model on a list of files.
    """
    n = 0
    for path in paths:
        print('Counting lines for {0}...'.format(path))
        n += sum(1 for line in open(path, 'r'))
    print('Processing {0} lines...'.format(n))

    # Change to use less memory. Default is 40m.
    # dict.update() returns None, so build the defaults first and then merge
    # the caller's kwargs into them.
    defaults = {
        'max_vocab_size': 40000000,
        'threshold': 8.
    }
    defaults.update(kwargs)
    kwargs = defaults

    print('Training bigrams...')
    bigram = Phrases(_phrase_doc_stream(paths, n, tokenizer=tokenizer), **kwargs)

    print('Saving...')
    bigram.save(out)
Example n. 35
File: dev.py Project: frnsys/geiger
def phrases():
    print('Loading phrases model...')
    bigram = Phrases.load('data/nyt/bigram_model.phrases')

    print('Creating listener...')
    address = ('localhost', 6001)
    with Listener(address, authkey=b'password') as listener:
        while True:
            with listener.accept() as conn:
                print('connection accepted from {0}'.format(listener.last_accepted))
                while True:
                    try:
                        msg = conn.recv()
                        conn.send(bigram[msg])
                    except (EOFError, ConnectionResetError):
                        break
Example n. 36
 def __init__(self, num_topics=100, min_word_count=20, 
              top_most_common_words=10, min_doc_length=40, 
              max_doc_length=1000, random_state=None):
     self.num_topics = num_topics
     self.min_word_count = min_word_count
     self.top_most_common_words = top_most_common_words
     
     assert max_doc_length > min_doc_length, \
            "max_doc_length must be greater than min_doc_length"
     self.min_doc_length = min_doc_length
     self.max_doc_length = max_doc_length
     self.random_state = random_state
     
     # natural language processing
     self.stop_words = self.getEnglishStopWords()
     self.bigramizer = Phrases()
Example n. 37
    def __init__(self, remote):
        global _phrases
        global _phrases_conn

        self.remote = remote
        if not remote and _phrases is None:
            print('Loading phrases model...')

            # Trained on 100-200k NYT articles
            _phrases = Phrases.load('data/nyt/bigram_model.phrases')
            print('Done loading phrases')
        elif _phrases_conn is None:
            print('Connecting to phrases process...')
            address = ('localhost', 6001)
            _phrases_conn = Client(address, authkey=b'password')
            print('Done connecting to phrases')
        self.conn = _phrases_conn
def extractW2VFeaturesSim(w2vmodelfile, phrasemodel, tweets, targets, labels):
    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodelfile)

    inv_topics = {v: k for k, v in tokenize_tweets.TOPICS_LONG.items()}


    for i, tweet in enumerate(tweets):

        # get the neut/pos/neg hashtags
        neut = KEYWORDS_NEUT[inv_topics[targets[i]]]
        pos = KEYWORDS_POS[inv_topics[targets[i]]]
        neg = KEYWORDS_NEG[inv_topics[targets[i]]]

        tokenised_tweet = tokenize(tweet.lower())
        words = filterStopwords(tokenised_tweet)

        neutcnt, poscnt, negcnt = 0, 0, 0
        neutsc, possc, negsc = 0.0, 0.0, 0.0


        # transform, as earlier, with the phrase model
        for token in phmodel[words]:
            try:
                neutsim = w2vmodel.similarity(neut, token)
                neutcnt += 1
                neutsc += neutsim
            except KeyError:
                neutsim = 0
            try:
                possim = w2vmodel.similarity(pos, token)
                possc += possim
                poscnt += 1
            except KeyError:
                possim = 0
            try:
                negsim = w2vmodel.similarity(neg, token)
                negsc += negsim
                negcnt += 1
            except KeyError:
                negsim = 0
            #print targets[i], "\t", token, "\t", neutsim, "\t", possim, "\t", negsim
        # guard against division by zero when none of the tokens were in the w2v vocabulary
        neutsc_tweet = neutsc/neutcnt if neutcnt else 0.0
        possc_tweet = possc/poscnt if poscnt else 0.0
        negsc_tweet = negsc/negcnt if negcnt else 0.0
        print(targets[i], "\t", labels[i], "\t", neutsc_tweet, "\t", possc_tweet, "\t", negsc_tweet)
def extractFeaturesW2V(w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model", useDev = False):

    if useDev == False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)

    phmodel = Phrases.load(phrasemodel)
    w2vmodel = word2vec.Word2Vec.load(w2vmodel)
    features_train_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_train, targets_train, labels_train)
    features_dev_w2v = extractW2VAggrFeatures(w2vmodel, phmodel, tweets_dev, targets_dev, labels_dev)

    return features_train_w2v, labels_train, features_dev_w2v, labels_dev
def extractFeaturesMulti(features=["auto_false", "bow", "targetInTweet", "emoticons", "affect", "w2v", "bow_phrase"]
        , automodel="model.ckpt", w2vmodel="skip_nostop_multi_300features_10minwords_10context", phrasemodel="phrase.model",
        useDev=True):
    if useDev==False:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
    else:
        tweets_train, targets_train, labels_train = readTweetsOfficial(tokenize_tweets.FILETRAIN, 'windows-1252', 2)
        tweets_origdev, targets_origdev, labels_origdev = readTweetsOfficial(tokenize_tweets.FILEDEV, 'windows-1252', 2)
        tweets_train.extend(tweets_origdev)
        targets_train.extend(targets_origdev)
        labels_train.extend(labels_origdev)
        tweets_dev, targets_dev, labels_dev = readTweetsOfficial(tokenize_tweets.FILETEST, 'windows-1252', 2)

    features_final = []

    if features.__contains__("bow"):
        features_final = extractFeatureVocab(tweets_train)
        features_train = extractFeaturesBOW(tweets_train, targets_train, features_final)
        features_dev = extractFeaturesBOW(tweets_dev, targets_dev, features_final)
    elif features.__contains__("targetInTweet"):
        features_train = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        features_dev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")

    if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
        if features.__contains__("bow_phrase"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True)
        elif features.__contains__("bow_phrase_anon"):
            features_vocab = extractFeatureVocab(tweets_train, usephrasemodel=True, anon_targets=True)
            features_train_phrbow = extractFeaturesBOW(tweets_train, targets_train, features_vocab, usephrasemodel=True, anon_targets=True)
            features_dev_phrbow = extractFeaturesBOW(tweets_dev, targets_dev, features_vocab, usephrasemodel=True, anon_targets=True)
        features_final.extend(features_vocab)

    if features.__contains__("auto_added"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "added", usephrasemodel=useph)
    elif features.__contains__("auto_true"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "true", usephrasemodel=useph)
    elif features.__contains__("auto_false"):
        useph=False
        if "phrase" in automodel:
            useph=True
        features_train_auto, labels_train, features_dev_auto, labels_dev = extractFeaturesAutoencoder(automodel, tweets_train, targets_train, labels_train, tweets_dev, targets_dev, labels_dev, "false", usephrasemodel=useph)

    targetInTweetTrain = []
    targetInTweetDev = []
    if features.__contains__("targetInTweet") and features.__contains__("bow"):
        targetInTweetTrain = extractFeaturesCrossTweetTarget(tweets_train, targets_train)
        targetInTweetDev = extractFeaturesCrossTweetTarget(tweets_dev, targets_dev)
        features_final.append("targetInTweet")
    if features.__contains__("emoticons"):
        emoticons_train, emoticons_vocab = extractEmoticons(tweets_train)
        emoticons_dev, emoticons_vocab = extractEmoticons(tweets_dev)
        for emo in emoticons_vocab:
            features_final.append("Emoticon_" + emo)
    if features.__contains__("affect"):
        affect_train, affect_vocab = getAffect(tweets_train)
        affect_dev, affect_vocab = getAffect(tweets_dev)
        for aff in affect_vocab:
            features_final.append("WNaffect_" + aff)

    if features.__contains__("hash"):
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "hash", tweets_dev, targets_dev, labels_dev)
    elif features.__contains__("w2v_hash"): # this contains hash
        phmodel = Phrases.load(phrasemodel)
        w2vmodel = word2vec.Word2Vec.load(w2vmodel)
        features_train_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_train, targets_train, labels_train)
        features_dev_w2v, features_w2v_vocab = extractW2VHashFeatures(w2vmodel, phmodel, "w2v_hash", tweets_dev, targets_dev, labels_dev)

    # combine features
    for i, featvec in enumerate(features_train):#features_train_auto)
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            features_train[i] = np.append(features_train[i], features_train_auto[i])  # numpy append works as extend works for python lists
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_train[i] = np.append(features_train[i], targetInTweetTrain[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_train[i] = np.append(features_train[i], features_train_phrbow[i])
        if features.__contains__("emoticons"):
            features_train[i] = np.append(features_train[i], emoticons_train[i])
        if features.__contains__("affect"):
            features_train[i] = np.append(features_train[i], affect_train[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_train[i] = np.append(features_train[i], features_train_w2v[i])
    for i, featvec in enumerate(features_dev):#features_dev_auto):
        if features.__contains__("auto_added") or features.__contains__("auto_true") or features.__contains__("auto_false"):
            features_dev[i] = np.append(features_dev[i], features_dev_auto[i])
        if features.__contains__("targetInTweet") and features.__contains__("bow"):
            features_dev[i] = np.append(features_dev[i], targetInTweetDev[i])
        if features.__contains__("bow_phrase") or features.__contains__("bow_phrase_anon"):
            features_dev[i] = np.append(features_dev[i], features_dev_phrbow[i])
        if features.__contains__("emoticons"):
            features_dev[i] = np.append(features_dev[i], emoticons_dev[i])
        if features.__contains__("affect"):
            features_dev[i] = np.append(features_dev[i], affect_dev[i])
        if features.__contains__("w2v_hash") or features.__contains__("hash"):
            features_dev[i] = np.append(features_dev[i], features_dev_w2v[i])


    return features_train, labels_train, features_dev, labels_dev, features_final
  w2v_model = Word2Vec.load(model_filepath)  # C binary format
except IndexError:
  print("using default model")
  current_dir = os.path.dirname(__file__)
  model_filepath = os.path.join(current_dir, 'model_sentences_raw_words_trigrams_min_count_50_size_200_downsampling_0.001.bin')
  w2v_model = Word2Vec.load(model_filepath)  # C binary format
print("using model from " + model_filepath)

bigrams_model_name = 'bigrams_model_nyt_sentences_5.5M_5.bin'
trigrams_model_name = "trigrams_model_nyt_sentences_5.5M_5.bin"
ngrams_models = {
  "bigrams": bigrams_model_name,
  "trigrams": trigrams_model_name
}
which_ngrams_model = "trigrams"
ngrams_model = Phrases.load(ngrams_models[which_ngrams_model])


print("finish loading w2v" +  str(datetime.now()))
print("loading w2v took  " + str((datetime.now() - start).seconds) + " seconds")

@w2v_api.route("/")
def hello():
    return json.dumps({"loaded": True})

@w2v_api.route("/similarize/<word>")
def similarize(word):
  try:
    try: 
      similar_words = cached_synonyms[word]
    except KeyError:
                extra_testing_mat[row - N_TRAINING, zips2id[line[0]]] = 1
            for cuisine in line[3:]:
                if cuisine in cuisine2id:
                    extra_testing_mat[row - N_TRAINING, cuisine2id[cuisine]] = 1
with open("extra_testing_matrix.pyobject", "wb") as f:
    pickle.dump(extra_testing_mat, f)
finish = time()
print("Complete!")
print("Running time: %.2f seconds" % (finish - start,))
print()


# BIGRAMS & TRIGRAMS
print("Creating n-gram corpus from training corpus...")
start = time()
phrases = Phrases(min_count=3, threshold=10.0)
with open("training_corpus.txt", "rt") as f:
    for line in f:
        phrases.add_vocab([line.rstrip().split()])
    _ = f.seek(0)
    with open("bigram_training_corpus.txt", "wt") as g:
        for line in f:
            word_list = phrases[line.rstrip().split()]
            g.write(" ".join(word_list) + "\n")
phrases = Phrases(min_count=3, threshold=10.0)
with open("bigram_training_corpus.txt", "rt") as f:
    for line in f:
        phrases.add_vocab([line.rstrip().split()])
    _ = f.seek(0)
    with open("trigram_training_corpus.txt", "wt") as g:
        for line in f:
Example n. 43
class TopicModel(object):
    '''
    This module preprocesses a corpus of documents and runs
    Latent Dirichlet Allocation (LDA) on a corpus of documents.
    
    Parameters
    ----------
    num_topics: int, default: 100
        input parameter to LDA
    
    min_word_count: int, default: 20
        if a token has fewer than min_word_count occurrences
        in the entire corpus, then it will be pruned from the 
        processed corpus
    
    top_most_common_words: int, default: 10
        prune the top_most_common_words most frequent tokens
        in the entire corpus
    
    min_doc_length: int, default: 40
        if the number of tokens within a processed document 
        is less than min_doc_length, then the document is excluded
    
    max_doc_length: int, default: 1000
        if the number of tokens within a processed document 
        is greater than max_doc_length, then the document is excluded
    
    random_state: default: None
        the random seed for the Gensim LDA object
    
    Attributes
    ----------
    bigramizer: 
        the trained Gensim bigramizer
    
    tokens: 
        list of list of strings
    
    dictionary: 
        mapping from id to token
    
    corpus: 
        bag of words vectorization of the tokens
    
    lda: 
        the Gensim LDA object
      
    dominant_topic_ids: 
        list of dominant topic ids, in decreasing order of dominance
    '''

    def __init__(self, num_topics=100, min_word_count=20, 
                 top_most_common_words=10, min_doc_length=40, 
                 max_doc_length=1000, random_state=None):
        self.num_topics = num_topics
        self.min_word_count = min_word_count
        self.top_most_common_words = top_most_common_words
        
        assert max_doc_length > min_doc_length, \
               "max_doc_length must be greater than min_doc_length"
        self.min_doc_length = min_doc_length
        self.max_doc_length = max_doc_length
        self.random_state = random_state
        
        # natural language processing
        self.stop_words = self.getEnglishStopWords()
        self.bigramizer = Phrases()
        
    def fit(self, documents):
        '''
        parameters:
          documents: list of strings, each represents a document
        '''
        
        # tokens, dictionary, corpus for LDA
        self.tokens = self.preProcessCorpus(documents)
        self.dictionary = corpora.Dictionary(self.tokens)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokens]
        
        self.lda = self.getLDA(dictionary=self.dictionary, 
                               corpus=self.corpus, 
                               num_topics=self.num_topics, 
                               random_state=self.random_state)
        
        self.num_dominant_topics = min(10, self.num_topics)
        self.dominant_topic_ids = self.getDominantTopics(self.corpus, 
                                                         self.lda, 
                                                         self.num_dominant_topics)


    def __str__(self):
        description = ("topic model:\n\ttoken length = {0:,d}\n\tdictionary length = {1:,d}"
                       "\n\tnum_topics = {2:,d}\n\tmin_word_count = {3:,d}"
                       "\n\ttop_most_common_words = {4:,d}\n\tmin_doc_length = {5:,d}"
                       "\n\tmax_doc_length = {6:,d}")
        return description.format(len(self.tokens), 
                                  len(self.dictionary),
                                  self.num_topics, 
                                  self.min_word_count, 
                                  self.top_most_common_words, 
                                  self.min_doc_length, 
                                  self.max_doc_length)

    @staticmethod
    def getEnglishStopWords():
        '''
        returns a set of stop words for NLP pre-processing
        from nltk.corpus.stopwords()
        Also, some words and letters are added to the set,
        such as "please", "sincerely", "u", etc...
        '''
        stop_words = set(stopwords.words("english"))
        
        stop_words.update(['please', 'would', 'use', 'also', 'thank',
                           'sincerely', 'regards', 'hi', 'hello', 'greetings',
                           'hey', 'attachment', 'attached', 'attached_file',
                           'see', 'file', 'comment'])
        # also exclude single letters a-z
        stop_words.update('abcdefghijklmnopqrstuvwxyz')
        return stop_words
    
    
    @staticmethod
    def getFrequencies(tokens):
        """
        input: tokens, a list of list of tokens
        output: a collections.Counter() object that contains token counts
        """
        frequencies = Counter()
        for row in tokens:
            frequencies.update(row)
        return frequencies
    
    @staticmethod
    def getLowFreqWords(frequencies, countCutOff):
        """
        input: 
          frequencies: a collections.Counter() object
          countCutOff: the minimum frequency below which tokens are added to the set
                       of low frequency tokens
        """
        lowFreqTokens = set()
        for token, freq in frequencies.items():
            if freq <= countCutOff:
                lowFreqTokens.add(token)
        return lowFreqTokens


    def preProcessCorpus(self, documents, min_word_count=None, 
                         top_most_common_words=None, min_doc_length=None, 
                         max_doc_length=None):
        '''
        this function pre-processes the documents and converts them into a list of list of tokens
        
        input: 
          documents: a list of strings (each string represents a document)
          min_word_count: if the frequency count of a token in the corpus is less 
                          than min_word_count then it is pruned
          top_most_common_words: if the frequency count of a token in the corpus
                                 exceeds top_most_common_words then it is pruned 
          min_doc_length: if the number of tokens within a processed document 
                          is less than min_doc_length, then the document is excluded
          max_doc_length: if the number of tokens within a processed document 
                          is greater than max_doc_length, then the document is excluded
        output:
          a list of list of tokens
        '''
        if min_word_count is None:
            min_word_count = self.min_word_count
        if top_most_common_words is None:
            top_most_common_words = self.top_most_common_words
        if min_doc_length is None:
            min_doc_length = self.min_doc_length
        if max_doc_length is None:
            max_doc_length = self.max_doc_length
        
        tokens = [tokenizer(document) for document in documents]
        
        # exclude documents that are longer than max_doc_length
        tokens = [tkn for tkn in tokens if len(tkn) < max_doc_length]
        
        # train Gensim Phrases model for bigrams
        self.bigramizer.add_vocab(tokens)
        
        # apply Gensim Phrases model to generate bigrams
        tokens = [self.bigramizer[tkn] for tkn in tokens]
        
        # exclude stop words
        tokens = [[t for t in tkn if t not in self.stop_words] for tkn in tokens]
        
        # exclude documents with fewer than min_doc_length tokens
        tokens = [tkn for tkn in tokens if len(tkn) > min_doc_length]
        
        # calculate token frequencies to exclude low and high frequency tokens
        freqs = self.getFrequencies(tokens)
        low_freq_tokens = set(x[0] for x in freqs.items() if x[1] < min_word_count)
        high_freq_tokens = [word[0] for word in freqs.most_common(top_most_common_words)]
        
        tokens = [[t for t in tkn if t not in low_freq_tokens] for tkn in tokens]
        tokens = [[t for t in tkn if t not in high_freq_tokens] for tkn in tokens]
        
        print('\nnumber of low frequency tokens pruned = {:,d}'
              .format(len(low_freq_tokens)))
        print('min_word_count = {:d}, top_most_common_words = {:,d}'
              .format(min_word_count, top_most_common_words))
        print('number of high frequency tokens pruned = {:,d}'
              .format(len(high_freq_tokens)))
        print('tokens = {:,d} rows'.format(len(tokens)))
        print('text pre-processing is complete\n')
        return tokens


    def getLDA(self, dictionary=None, corpus=None, num_topics=None, 
               random_state=None):
        # train an LDA model on the given dictionary and corpus
        print('computing LDA...')
        
        if dictionary is None:
            dictionary = self.dictionary
        if corpus is None:
            corpus = self.corpus
        if num_topics is None:
            num_topics = self.num_topics
        
        lda = models.ldamodel.LdaModel(corpus=corpus, 
                                       alpha='auto', 
                                       id2word=dictionary, 
                                       num_topics=num_topics,
                                       random_state=random_state)
        return lda


    def getDominantTopics(self, corpus, lda, num_dominant_topics=None):
        
        print('computing dominant topics...')
        if corpus is None:
            corpus = self.corpus
        if lda is None:
            lda = self.lda
        if num_dominant_topics is None:
            num_dominant_topics = self.num_dominant_topics
        
        # get topic weight matrix using lda.inference
        # the matrix has dimensions (num documents) x (num topics)
        inference = lda.inference(corpus)
        inference = inference[0] # the inference is a tuple, need the first term
        num_topics = lda.num_topics
        
        # find dominant topics across documents (vertical sum)
        column_sum_of_weights = np.sum(inference, axis=0)
        sorted_weight_indices = np.argsort(column_sum_of_weights)
        idx = np.arange(num_topics - num_dominant_topics, num_topics)
        dominant_topic_ids = sorted_weight_indices[idx]
        
        # the dominant_topic_ids store the ids in descending order of dominance
        dominant_topic_ids = dominant_topic_ids[::-1]
        
        # convert from numpy array to list and return
        return dominant_topic_ids.tolist()
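
# Not part of the original module: a minimal usage sketch. preProcessCorpus
# calls a module-level `tokenizer` function that is not shown in this snippet,
# so a naive stand-in is defined here; all names and parameter values below
# are illustrative assumptions.
import re

def tokenizer(document):
    # toy stand-in: lowercase the text and keep alphabetic words only
    return re.findall(r'[a-z]+', document.lower())

# topic_model = TopicModel(num_topics=20, min_word_count=5,
#                          min_doc_length=10, max_doc_length=500)
# topic_model.fit(documents)   # documents: a list of raw text strings
# print(topic_model)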
Example 44
def phrases():
    p = Phrases(sentences=process_corpus('/Users/valeriyischenko/local/projects/lingua_hack/Text'))
    p.save('../wiki/text_phrase_model_p3')
Example 45
    if not 0 <= k <= len(seq):
        for e in seq:
            yield e
    else:
        numbersPicked = 0
        for i, number in enumerate(seq):
            prob = (k - numbersPicked) / (len(seq) - i)
            if random.random() < prob:
                yield number
                numbersPicked += 1

f = open("tokenizer.pk", "rb")
tokenizer = pickle.load(f)
f.close()

bigram = Phrases.load('bigrams.pk')
trigram = Phrases.load('trigrams.pk')
ngram = Phrases.load('ngrams.pk')

print('SemEval data')
for semeval_file in semeval_files:
    print('File', semeval_file)
    with open(semeval_file, 'r') as f:
        st = []
        for line in f:
            st += [line.strip()]
        text = read_visit_sem(st)
        text = [nltk.word_tokenize(s.lower()) for s in tokenizer.tokenize(text)]
        text = ngram[trigram[bigram[text]]]
        for sent in text:
            print('->', ' '.join(sent))
Example 46



from gensim.models import Phrases
from nytnlp.keywords import rake
from textblob import Blobber
from textblob_aptagger import PerceptronTagger
blob = Blobber(pos_tagger=PerceptronTagger())
stops = stopwords.words('english')
lem = WordNetLemmatizer()
dash_map = {ord(p): ' ' for p in '—-'}
punct_map = {ord(p): '' for p in string.punctuation + '“”—’‘'}

# Trained on 100-200k NYT articles
bigram = Phrases.load('data/bigram_model.phrases')

def clean_doc(doc):
    doc = doc.lower()
    doc = doc.replace('\'s ', ' ')
    doc = doc.translate(dash_map)
    doc = doc.translate(punct_map)
    return doc


def keyword_tokenize(doc):
    """
    Tokenizes a document so that only keywords and phrases
    are returned. Keywords are returned as lemmas.
    """
    doc = clean_doc(doc)
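
# The body of keyword_tokenize is truncated in this snippet. Below is a
# hypothetical sketch (not the original implementation) of how such a
# tokenizer could continue with the objects defined above: POS-filter with
# blob, join known phrases with bigram, drop stopwords, and return lemmas.
def keyword_tokenize_sketch(doc):
    doc = clean_doc(doc)
    tokens = [tok for tok, tag in blob(doc).tags
              if tag.startswith(('NN', 'JJ')) and tok not in stops]
    tokens = bigram[tokens]
    return [lem.lemmatize(tok) for tok in tokens]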
Example 47
                    ct += 1
                    if ct % 50000 == 0:
                        print(ct)
                    if line.strip() == '</VISIT>':
                        text = read_visit(st)
                        text = tokenizer.tokenize(text)
                        for sent in text:
                            yield nltk.word_tokenize(sent.lower())
                        st = []
                    elif line.strip() != '<VISIT>':
                        st += [line.strip()]
        except IOError:
            pass

f = open("tokenizer.pk", "rb")
tokenizer = pickle.load(f)
f.close()

print('BIGRAMS')
bigram = Phrases(next_note(tokenizer), delimiter='')
bigram.save('bigrams.pk')

print('TRIGRAMS')
trigram = Phrases(bigram[next_note(tokenizer)], delimiter='')
trigram.save('trigrams.pk')

print('4GRAMS')
ngram = Phrases(trigram[next_note(tokenizer)], delimiter='')
ngram.save('ngrams.pk')

Example 48
import string
import bz2
import nltk
from collections import Counter
from gensim.models import Phrases
from gensim.models import Word2Vec
from nltk.corpus import stopwords

sentences = []
bigram = Phrases()

with bz2.BZ2File('./2009.csv.bz2') as file_:
    for i, line in enumerate(file_):
        sentence = [word
                    for word in nltk.word_tokenize(line.decode("utf-8").lower())
                    if word not in string.punctuation]
        sentences.append(sentence)
        bigram.add_vocab([sentence])

bigram_model = Word2Vec(bigram[sentences])
bigram_model_counter = Counter()

bigram_model.save('ok.w2v')

for key in bigram_model.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_model_counter[key] += bigram_model.vocab[key].count

for key, counts in bigram_model_counter.most_common(50):
    print('{0: <20} {1}'.format(key, counts))
Example 49
docs = [
    ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper'],
    ['concern', 'is', 'rising', 'in', 'many', 'quarters', 'that', 'the', 'united', 'states', 'is', 'retreating', 'from', 'global', 'economic', 'leadership', 'just', 'when', 'it', 'is', 'needed', 'most'],
    ['the', 'afghan', 'president', 'ashraf', 'ghani', 'blamed', 'the', 'islamic', 'state', 'group'],
    ['building', 'maintenance', 'by', 'the', 'hrynenko', 'family', 'which', 'owns', 'properties', 'in', 'the', 'east', 'village'],
    ['a', 'telegram', 'from', 'the', 'american', 'embassy', 'in', 'constantinople', 'to', 'the', 'state', 'department', 'in', 'washington']
]
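
# doc_stream, paths and n are not defined in this fragment. A minimal sketch of
# what such a helper might look like (an assumption, not the original code):
# stream up to n whitespace-tokenized documents, one per line, from text files.
def doc_stream(paths, n):
    count = 0
    for path in paths:
        with open(path, encoding='utf-8') as f:
            for line in f:
                if count >= n:
                    return
                yield line.lower().split()
                count += 1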


# Lower max_vocab_size to use less memory. The gensim default is 40 million.
max_vocab_size = 40000000

# Train up to trigrams.
print('Training bigrams...')
bigram = Phrases(doc_stream(paths, n), max_vocab_size=max_vocab_size, threshold=8.)

print('Saving...')
bigram.save('bigram_model.phrases')

print('Training trigrams...')
trigram = Phrases(bigram[doc_stream(paths, n)], max_vocab_size=max_vocab_size, threshold=10.)

print('Saving...')
trigram.save('trigram_model.phrases')
print('Done.')


#print('Loading bigrams...')
#bigram = Phrases.load('bigram_model.phrases')
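
# Not in the original script: once trained (or re-loaded as above), the two
# models are applied by chaining the transforms; the sample tokens below are
# illustrative only.
sample = ['the', 'new', 'york', 'times', 'is', 'a', 'newspaper']
print(trigram[bigram[sample]])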
Example 50
import os
import sys
import pickle
import string
import re
import numpy as np
from numpy import prod, dot
from gensim.models import Doc2Vec, Phrases
root = settings.root_path
big_file_dir = os.path.expanduser('~')+'/model/corpra/'
if sys.platform=='darwin':
    root = root.replace(os.path.expanduser('~'),
                        os.path.expanduser('~')+'/Dropbox')

########################################################################
# Find nearest neighbors in product space
#######################################################################
model = Doc2Vec.load(root+"model/movie_space/idf_reddit")
bigram = Phrases.load(big_file_dir + 'movies_bigram_large.p')
book_data = pickle.load(open(root + "model/movie_space/book_meta_data.p", "rb"))
title2asin = pickle.load(open(root + "model/movie_space/title2asin.p", "rb"))

def get_similar(query_book, pos_words, neg_words, topn=100):
    try:
        pos_vecs = []
        all_query_words = []
        for book in query_book:
            if book in title2asin:
                print "\tFound book: ", title2asin[book]            
                all_query_words.append(title2asin[book])
                pos_vecs.append(model.docvecs[title2asin[book]])

        for word in bigram[pos_words.replace(',', ' ').lower().split()]:
            if word in model:
from gensim.models import Phrases
from gensim.models import Word2Vec
from gensim.utils import lemmatize
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from collections import Counter

print("Reading input file 'input/audits_with_content.csv'")
with open('input/audits_with_content.csv', 'r') as f:
    reader = csv.reader(f)
    raw_documents = list(reader)

print("Prepare documents")
documents = [doc[2] for doc in raw_documents if doc[2] != '']
sentences = []
bigram = Phrases()

for document in documents:
    raw_text = document.lower()
    tokens = lemmatize(raw_text, stopwords=STOPWORDS)
    sentences.append(tokens)
    bigram.add_vocab([tokens])

bigram_counter = Counter()
for key in bigram.vocab.keys():
    if key not in stopwords.words("english"):
        if len(key.split("_")) > 1:
            bigram_counter[key] += bigram.vocab[key]

for key, counts in bigram_counter.most_common(200):
    print('{0: <20} {1}'.format(key, counts))
Example 52
    def __init__(self):
        super().__init__(multithreaded=False)

        print('Loading phrases model...')
        self.bigram = Phrases.load('data/bigram_model.phrases')