Example 1
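Reads raw train and test CSVs (optionally oversampling the training set), builds a TF-IDF matrix over the description texts with textacy, and writes the features together with the labels back to CSV files.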
def create_tfidf(oversample=False, description=False):
    print("Reading the data...")

    if oversample:
        df_train = get_oversampled_train()
    else:
        df_train = pd.read_csv("data/train_raw.csv")

    df_test = pd.read_csv("data/test_raw.csv")

    print("Creating the corpus...")
    corpus_train = textacy.Corpus(lang='en',
                                  texts=df_train['description'].tolist())
    corpus_test = textacy.Corpus(lang='en',
                                 texts=df_test['description'].tolist())

    tokenized_docs_train = (doc.to_terms_list(ngrams=1,
                                              named_entities=True,
                                              as_strings=True)
                            for doc in corpus_train)
    tokenized_docs_test = (doc.to_terms_list(ngrams=1,
                                             named_entities=True,
                                             as_strings=True)
                           for doc in corpus_test)

    print("Generating TF-IDF...")
    vectorizer = textacy.Vectorizer(apply_idf=True,
                                    norm="l2",
                                    min_df=4,
                                    max_df=.95)
    tfidf_train = vectorizer.fit_transform(tokenized_docs_train)
    tfidf_test = vectorizer.transform(tokenized_docs_test)

    tfidf_train = pd.DataFrame(tfidf_train.toarray())
    tfidf_test = pd.DataFrame(tfidf_test.toarray())

    if description:
        pd.concat([tfidf_train, df_train['label']],
                  axis=1).to_csv("data/tfidf_train_description.csv",
                                 index=False)
        pd.concat([tfidf_test, df_test['label']],
                  axis=1).to_csv("data/tfidf_test_description.csv",
                                 index=False)
    else:
        pd.concat([tfidf_train, df_train['label']],
                  axis=1).to_csv("data/tfidf_train.csv", index=False)
        pd.concat([tfidf_test, df_test['label']],
                  axis=1).to_csv("data/tfidf_test.csv", index=False)
Example 2
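Fits a bag-of-words vectorizer the first time it is called and pickles it to disk; subsequent calls load the pickled vectorizer instead of retraining it.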
def load_vectorizer():
    if not os.path.isfile(BOW_VECTORIZER_PATH):
        vectorizer = textacy.Vectorizer(min_df=2, max_df=0.95, norm='l2')
        with open(BOW_VECTORIZER_PATH, 'wb') as file:
            log.info('Training vectorizer on data from %s...', DATA_PATH)
            docs = iter_text(DATA_PATH)
            vectorizer = vectorizer.fit(docs)
            log.info('Vectorizer was trained, writing it here %s...',
                     BOW_VECTORIZER_PATH)
            pickle.dump(vectorizer, file)
    else:
        with open(BOW_VECTORIZER_PATH, 'rb') as file:
            log.info('Loading vectorizer from %s', BOW_VECTORIZER_PATH)
            vectorizer = pickle.load(file)
            log.info('Vectorizer was loaded from %s', BOW_VECTORIZER_PATH)
    return vectorizer
Example 3
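A unit test for vectorization and topic modeling: it checks that the document-term matrix is a sparse CSR matrix, that the document-topic matrix has one column per topic, and that each topic yields top_n terms.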
    def test_vectorization_and_topic_modeling_functionality(self):
        n_topics = 10
        top_n = 10
        vectorizer = textacy.Vectorizer(
            weighting='tfidf', normalize=True, smooth_idf=True,
            min_df=2, max_df=0.95)
        doc_term_matrix = vectorizer.fit_transform(
            (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
             for doc in self.corpus))
        model = textacy.TopicModel('nmf', n_topics=n_topics)
        model.fit(doc_term_matrix)
        doc_topic_matrix = model.transform(doc_term_matrix)
        self.assertIsInstance(doc_term_matrix, sp.csr_matrix)
        self.assertIsInstance(doc_topic_matrix, np.ndarray)
        self.assertEqual(doc_topic_matrix.shape[1], n_topics)
        for topic_idx, top_terms in model.top_topic_terms(
                vectorizer.id_to_term, top_n=top_n):
            self.assertIsInstance(topic_idx, int)
            self.assertEqual(len(top_terms), top_n)
Example 4
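A class method that turns a textacy corpus into one terms list per document and fits a tf, tf-idf, or binary document-term matrix over it.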
    def vectorize(self,
                  weighting='tf',
                  min_df=0.1,
                  max_df=0.95,
                  max_n_terms=100000,
                  exclude_pos=['PUNCT', 'SPACE']):
        '''
        DESC: Creates tf/tfidf/binary matrix of textacy corpus.
            weighting = (str) tf, tfidf, binary
            min_df = (float/int) exclude terms that appear in less than percentage/number of documents
            max_df = (float/int) exclude terms that appear in more than percentage/number of documents
            max_n_terms = (int) max terms (features) to include in matrix
            exclude_pos = (list of strs) POS tags to remove from vocabulary when creating matrix
        --Output--
            Returns tf/tfidf/binary matrix of textacy corpus.
        '''
        for doc in self.corpus:
            self.terms_list.append(list(doc.to_terms_list(ngrams=1,
                                                          named_entities=True,
                                                          normalize='lemma',
                                                          as_strings=True,
                                                          filter_stops=True,
                                                          filter_punct=True,
                                                          exclude_pos=exclude_pos)))
        self.vectorizer = textacy.Vectorizer(weighting=weighting,
                                             normalize=True,
                                             smooth_idf=True,
                                             min_df=min_df,
                                             max_df=max_df,
                                             max_n_terms=max_n_terms)
        self.tfidf = self.vectorizer.fit_transform(self.terms_list)
        return self.tfidf
Example 5
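Loads a cached vectorizer with joblib if one exists (removing it first when a refit is requested); otherwise fits a TF-IDF vectorizer on the preprocessed texts, stores the document-term matrix, feature names and vectorizer, writes the feature names to a keywords file, and caches the result to disk.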
    def fit(self):

        if self.refit:
            try:
                os.remove('./vectorizer.pkl')
            except FileNotFoundError:
                pass
        try:
            self.vectorized = joblib.load('vectorizer.pkl')
        except Exception as e:

            vectorizer = textacy.Vectorizer(weighting='tfidf',
                                            normalize=True,
                                            smooth_idf=True,
                                            min_df=10,
                                            max_df=0.95,
                                            max_n_terms=10000)

            self.vectorized.doc_term_matrix = vectorizer.fit_transform(
                (self.preprocess(doc) for doc in self.text_))
            self.vectorized.feature_names = vectorizer.feature_names
            self.vectorized.vectorizer = vectorizer
            with open('keywords.txt', 'w') as kw_file:
                kw_file.write(str(vectorizer.feature_names))
            joblib.dump(self.vectorized, 'vectorizer.pkl')
Example 6
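A batch job that pulls recent tweets per topic from Cassandra, deduplicates them, builds TF-IDF features over unigrams to trigrams, fits an NMF topic model, attaches top tweets and candidate titles to each topic, and writes the resulting clusters back to Cassandra as JSON.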
def start_cluster_batch():
    topic_list_query = "SELECT * from sharelock.topic_list"
    topic_rows = session.execute(topic_list_query)
    topic_row_list = list(topic_rows)
    topic_frames = pd.DataFrame(topic_row_list)
    for idx, frame in topic_frames.iterrows():
        topic = frame['topic']
        category = frame['category']
        query = "SELECT * from sharelock.active_tweets where topic='" + topic + "'order by inserted_at desc limit 30"
        rows = session.execute(query)
        ent_dict = {}
        sorted_json = {}

        row_list = []
        for row in rows:
            xd = json.loads(row.tweet_batch)
            row_list = row_list + xd

        sorted_result = pd.DataFrame(data=row_list)
        # set_index returns a new frame; keep tweet_id as a column too, since
        # it is used again below for deduplication and lookups
        sorted_result = sorted_result.set_index('tweet_id', drop=False)
        sorted_result = sorted_result.drop_duplicates(subset='tweet_id',
                                                      keep='first')

        # Clean results by dropping items with similarity score 0.98 or higher

        sorted_result['tweet_tokens'] = sorted_result['tweet_text'].apply(nlp)
        sorted_result['tweet_clean_text'] = sorted_result['tweet_text'].apply(
            get_cleaned_text)
        sorted_result['tweet_clean_tokens'] = sorted_result[
            'tweet_clean_text'].apply(nlp)
        sorted_result = remove_duplicate_posts(sorted_result)

        corpus = textacy.Corpus(lang="en_core_web_lg",
                                texts=list(sorted_result['tweet_text']),
                                metadatas=list(sorted_result['tweet_id']))

        terms_list = (doc.to_terms_list(ngrams=(1, 2, 3),
                                        named_entities=True,
                                        normalize=u'lemma',
                                        lemmatize=True,
                                        lowercase=True,
                                        as_strings=True,
                                        filter_stops=True,
                                        filter_punct=True,
                                        min_freq=1,
                                        exclude_pos=("PRON", "X", "PUNCT",
                                                     "SYM")) for doc in corpus)

        vectorizer = textacy.Vectorizer(tf_type='linear',
                                        apply_idf=True,
                                        idf_type='smooth')

        # NOTE: clean_terms returns a generator of cleaned terms; its result is
        # not captured here, so this call does not modify terms_list
        textacy.text_utils.clean_terms(terms_list)

        doc_term_matrix = vectorizer.fit_transform(terms_list)

        # at least one topic, roughly one per ten tweets
        num_topics = max(1, int(len(sorted_result) / 10))

        model = textacy.tm.TopicModel('nmf', n_topics=num_topics)
        model.fit(doc_term_matrix)

        doc_topic_matrix = model.transform(doc_term_matrix)

        topic_cluster = {}
        for topic_idx, top_terms in model.top_topic_terms(
                vectorizer.id_to_term, topics=-1, top_n=8, weights=True):
            dct = dict(top_terms)
            tt_list = []
            for j in dct.keys():
                tt_list.append({"term": j, "weight": dct[j]})
            topic_cluster["topic-" + str(topic_idx)] = {"terms": tt_list}

        for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix,
                                                        topics=-1,
                                                        top_n=6,
                                                        weights=True):
            dct = dict(top_docs)
            tweet_in_topic_list = []
            for j in dct.keys():
                query_str = "tweet_id=" + corpus[j].metadata
                curr = sorted_result[sorted_result['tweet_id'] ==
                                     corpus[j].metadata]
                curr_frame_row = curr.iloc[0]
                is_attached_to_topic = False
                for prev_topic in topic_cluster:
                    if 'tweets' in topic_cluster[prev_topic]:
                        tweet_list = topic_cluster[prev_topic]['tweets']
                        for tweet in tweet_list:
                            if tweet['tweet_id'] == curr.iloc[0]['tweet_id']:
                                is_attached_to_topic = True
                                break

                if not is_attached_to_topic:
                    tweet_in_topic_list.append({
                        "tweet_id":
                        curr.iloc[0]['tweet_id'],
                        "tweet_text":
                        curr.iloc[0]['tweet_text'],
                        "user_score":
                        str(curr.iloc[0]['user_score']),
                        "raw_score":
                        str(curr.iloc[0]['raw_score'])
                    })
            if tweet_in_topic_list:
                topic_cluster["topic-" +
                              str(topic_idx)]['tweets'] = tweet_in_topic_list

        for curr_topic in topic_cluster:
            if 'tweets' in topic_cluster[curr_topic]:
                sent_weights = []
                for tweet in topic_cluster[curr_topic]['tweets']:
                    sent_weights = sent_weights + get_sent_weights(
                        tweet, topic_cluster[curr_topic]['terms'])
                sent_weights = sorted(sent_weights,
                                      key=lambda x: x['final_score'],
                                      reverse=True)
                top_sents = sent_weights[0:2]
                sorted_top_sents = sorted(sent_weights,
                                          key=lambda x: x['ent_score'],
                                          reverse=True)
                topic_title = ""
                topic_title_list = []
                for sent in sorted_top_sents:
                    if sent['structure_penalty'] < 50 and sent[
                            'word_score'] > 0:
                        topic_title_list.append(sent['text'].strip('\n'))
                topic_cluster[curr_topic]['title'] = topic_title_list

        result_dict = {}
        for k in topic_cluster.keys():
            if 'tweets' in topic_cluster[k]:
                result_dict[k] = topic_cluster[k]

        insert_at = datetime.datetime.now().timestamp()

        insert_values = [topic, category, insert_at, json.dumps(result_dict)]

        sql_query = "INSERT into sharelock.topic_clusters (topic, category, inserted_at, tweet_cluster) values (?, ?, ?, ?)"
        try:
            prepared = session.prepare(sql_query)
            session.execute(prepared, (insert_values))
        except Exception as e:
            print(e)
Example 7
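Dispatches topic modeling either to textacy/scikit-learn (Vectorizer plus TopicModel) or to gensim (Dictionary plus bag-of-words corpus, with optional TF-IDF weighting), computes perplexity and coherence scores where available, and returns the model and its metadata in a SimpleNamespace.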
def compute(corpus, tick=utility.noop, method='sklearn_lda', vec_args=None, term_args=None, tm_args=None, **args):
    
    tick()
    
    vec_args = utility.extend({}, DEFAULT_VECTORIZE_PARAMS, vec_args)
    
    terms = [ list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, term_args) ]
    fx_terms = lambda: terms # [ doc for doc in textacy_utility.extract_corpus_terms(corpus, term_args) ]
            
    perplexity_score = None
    coherence_score = None
    vectorizer = None
    doc_topic_matrix = None
    doc_term_matrix = None
    
    documents = textacy_utility.get_corpus_documents(corpus)

    if method.startswith('sklearn'):
        
        vectorizer = textacy.Vectorizer(**vec_args)
        doc_term_matrix = vectorizer.fit_transform(fx_terms())

        model = textacy.TopicModel(method.split('_')[1], **tm_args)
        model.fit(doc_term_matrix)
        
        tick()
        
        doc_topic_matrix = model.transform(doc_term_matrix)
        
        tick()
        
        id2word = vectorizer.id_to_term
        bow_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)
        
        # FIXME!!!
        perplexity_score = None
        coherence_score = None
        
    elif method.startswith('gensim_'):
        
        algorithm = method.split('_')[1].upper()
        
        id2word = gensim.corpora.Dictionary(fx_terms())
        bow_corpus = [ id2word.doc2bow(tokens) for tokens in fx_terms() ]
        
        if args.get('tfidf_weiging', False):
            # assert algorithm != 'MALLETLDA', 'MALLET training model cannot (currently) use TFIDF weighed corpus'
            tfidf_model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
            bow_corpus = [ tfidf_model[d] for d in bow_corpus ]
        
        algorithms = setup_gensim_algorithms(corpus, bow_corpus, id2word, tm_args)
        
        engine = algorithms[algorithm]['engine']
        engine_options = algorithms[algorithm]['options']
        
        model = engine(**engine_options)
        
        if hasattr(model, 'log_perplexity'):
            perplexity_score = model.log_perplexity(bow_corpus, len(bow_corpus))
        
        try:
            coherence_model_lda =  gensim.models.CoherenceModel(model=model, texts=fx_terms(), dictionary=id2word, coherence='c_v')
            coherence_score = coherence_model_lda.get_coherence()
        except Exception as ex:
            logger.error(ex)
            coherence_score = None
            
    processed = topic_model_utility.compile_metadata(
        model,
        bow_corpus,
        id2word,
        documents,
        vectorizer=vectorizer,
        doc_topic_matrix=doc_topic_matrix,
        n_tokens=200
    )
    
    model_data = types.SimpleNamespace(
        topic_model=model,
        id2term=id2word,
        bow_corpus=bow_corpus,
        doc_term_matrix=doc_term_matrix,
        #doc_topic_matrix=doc_topic_matrix,
        #vectorizer=vectorizer,
        processed=processed,
        perplexity_score=perplexity_score,
        coherence_score=coherence_score,
        options=dict(method=method, vec_args=vec_args, term_args=term_args, tm_args=tm_args, **args),
        coherence_scores=None
    )
    
    tick(0)
    
    return model_data
Example 8
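Two fragments: the first prints keyterm-extraction output and fits NMF, LDA and LSA topic models on a corpus; the second builds a corpus from the HanDeSeT debates CSV, fits an NMF model, and prints the top terms of each topic.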
        for sgr in singlerank:
            print(sgr)
        print("==textrank==")
        for sgr in textrank:
            print(sgr)
        print("==key_terms_from_semantic_network==")
        for trip in key_terms_from_semantic_network:
            print(trip)
        print("==matches==")
        for match in matches:
            print(match)
        print("\n")

    vectorizer = textacy.Vectorizer(weighting='tfidf',
                                    normalize=True,
                                    smooth_idf=True,
                                    min_df=3,
                                    max_df=0.95)
    doc_term_matrix = vectorizer.fit_transform(
        (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
         for doc in corpus))
    print(repr(doc_term_matrix))

    models = ['nmf', 'lda', 'lsa']
    for m in models:
        model = textacy.TopicModel(m, n_topics=10)
        model.fit(doc_term_matrix)
        doc_topic_matrix = model.transform(doc_term_matrix)
        print("==", m, "==")
        print(doc_topic_matrix.shape)
        for topic_idx, top_terms in model.top_topic_terms(
                vectorizer.id_to_term, top_n=10):
            print('topic', topic_idx, ':', '   '.join(top_terms))

nlp = spacy.load('en')

debates = []
data = csv.reader(open('../debate_csvs/HanDeSeT.csv', 'r'))
for row in data:
    # adapted for handeset which features 7 columns of text per document (row)
    debates.append(
        [row[1], row[6] + row[7] + row[8] + row[9] + row[10] + row[11]])

df = pd.DataFrame(debates, columns=['title', 'text'])
chat_concat = (df.sort_values('title').groupby('title')['text'].agg(
    lambda col: '\n'.join(col.astype(str))))
docs = list(chat_concat.apply(lambda x: nlp(x)))
corpus = textacy.corpus.Corpus(nlp, docs=docs)
vectorizer = textacy.Vectorizer(tf_type='linear',
                                apply_idf=True,
                                idf_type='smooth',
                                norm='l2',
                                min_df=2,
                                max_df=5)
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in corpus))
model = textacy.TopicModel('nmf', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)

for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term,
                                                  top_n=10):
    print('topic', topic_idx, ':', '   '.join(top_terms))
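All of the examples above share the same basic pipeline: build a textacy corpus, turn each document into a terms list, fit a Vectorizer to obtain a document-term matrix, and optionally fit a TopicModel on top of it. The following is a minimal end-to-end sketch of that pipeline, assuming the same textacy 0.6-era API used above (top-level Vectorizer and TopicModel, Doc.to_terms_list) and an installed spaCy English model; the texts list is a placeholder standing in for a real corpus with enough documents.

import textacy

# Placeholder input: any list of raw document strings.
texts = ["first document text ...", "second document text ..."]

# Build a corpus with the spaCy English pipeline, as in Example 1.
corpus = textacy.Corpus(lang='en', texts=texts)

# One terms list per document: unigrams plus named entities, as strings.
terms_lists = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
               for doc in corpus)

# TF-IDF document-term matrix, as in Examples 1 and 8.
vectorizer = textacy.Vectorizer(apply_idf=True, norm='l2', min_df=2, max_df=0.95)
doc_term_matrix = vectorizer.fit_transform(terms_lists)

# NMF topic model over the document-term matrix.
model = textacy.TopicModel('nmf', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)

# Top terms per topic, as in Example 8.
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
    print('topic', topic_idx, ':', '   '.join(top_terms))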