def create_tfidf(oversample=False, description=False):
    print("Reading the data...")
    if oversample:
        df_train = get_oversampled_train()
    else:
        df_train = pd.read_csv("data/train_raw.csv")
    df_test = pd.read_csv("data/test_raw.csv")

    print("Creating the corpus...")
    corpus_train = textacy.Corpus(lang='en', texts=df_train['description'].tolist())
    corpus_test = textacy.Corpus(lang='en', texts=df_test['description'].tolist())
    tokenized_docs_train = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
                            for doc in corpus_train)
    tokenized_docs_test = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
                           for doc in corpus_test)

    print("Generating TF-IDF...")
    vectorizer = textacy.Vectorizer(apply_idf=True, norm="l2", min_df=4, max_df=.95)
    tfidf_train = vectorizer.fit_transform(tokenized_docs_train)
    tfidf_test = vectorizer.transform(tokenized_docs_test)
    tfidf_train = pd.DataFrame(tfidf_train.toarray())
    tfidf_test = pd.DataFrame(tfidf_test.toarray())

    if description:
        pd.concat([tfidf_train, df_train['label']], axis=1).to_csv("data/tfidf_train_description.csv", index=False)
        pd.concat([tfidf_test, df_test['label']], axis=1).to_csv("data/tfidf_test_description.csv", index=False)
    else:
        pd.concat([tfidf_train, df_train['label']], axis=1).to_csv("data/tfidf_train.csv", index=False)
        pd.concat([tfidf_test, df_test['label']], axis=1).to_csv("data/tfidf_test.csv", index=False)
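A minimal driver for create_tfidf() above (a hedged sketch: it assumes the data/ CSVs and the get_oversampled_train() helper referenced in that function are available):

import pandas as pd

# Hypothetical usage of create_tfidf(); the file paths come from the function above.
create_tfidf(oversample=False, description=True)

# The function writes the features and the label column side by side, so the
# matrix can be reloaded for downstream training like this:
train = pd.read_csv("data/tfidf_train_description.csv")
X_train, y_train = train.drop(columns=["label"]), train["label"]
print(X_train.shape, y_train.shape)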
def load_vectorizer():
    # Train the bag-of-words vectorizer once and cache it on disk; afterwards, reuse the pickle.
    if not os.path.isfile(BOW_VECTORIZER_PATH):
        vectorizer = textacy.Vectorizer(min_df=2, max_df=0.95, norm='l2')
        with open(BOW_VECTORIZER_PATH, 'wb') as file:
            log.info('Training vectorizer on data from %s...', DATA_PATH)
            docs = iter_text(DATA_PATH)
            vectorizer = vectorizer.fit(docs)
            log.info('Vectorizer was trained, writing it to %s...', BOW_VECTORIZER_PATH)
            pickle.dump(vectorizer, file)
    else:
        with open(BOW_VECTORIZER_PATH, 'rb') as file:
            log.info('Loading vectorizer from %s', BOW_VECTORIZER_PATH)
            vectorizer = pickle.load(file)
            log.info('Vectorizer was loaded from %s', BOW_VECTORIZER_PATH)
    return vectorizer
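A hedged usage sketch for load_vectorizer(): BOW_VECTORIZER_PATH, DATA_PATH and iter_text() are assumed to be defined in the same module, and the tokenized documents below are illustrative only.

# Reuse the cached (or freshly trained) vectorizer on new, already-tokenized documents.
vectorizer = load_vectorizer()
new_docs = [["cached", "vectorizer", "reuse"],
            ["second", "tokenized", "document"]]
bow_matrix = vectorizer.transform(new_docs)  # sparse document-term matrix
print(bow_matrix.shape)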
def test_vectorization_and_topic_modeling_functionality(self):
    n_topics = 10
    top_n = 10
    vectorizer = textacy.Vectorizer(
        weighting='tfidf', normalize=True, smooth_idf=True,
        min_df=2, max_df=0.95)
    doc_term_matrix = vectorizer.fit_transform(
        (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
         for doc in self.corpus))
    model = textacy.TopicModel('nmf', n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    self.assertIsInstance(doc_term_matrix, sp.csr_matrix)
    self.assertIsInstance(doc_topic_matrix, np.ndarray)
    self.assertEqual(doc_topic_matrix.shape[1], n_topics)
    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=top_n):
        self.assertIsInstance(topic_idx, int)
        self.assertEqual(len(top_terms), top_n)
def vectorize(self, weighting='tf', min_df=0.1, max_df=0.95, max_n_terms=100000,
              exclude_pos=['PUNCT', 'SPACE']):
    '''
    DESC: Creates a tf/tfidf/binary matrix from a textacy corpus.
    weighting = (str) 'tf', 'tfidf', or 'binary'
    min_df = (float/int) exclude terms that appear in fewer than this percentage/number of documents
    max_df = (float/int) exclude terms that appear in more than this percentage/number of documents
    max_n_terms = (int) max terms (features) to include in the matrix
    exclude_pos = (list of strs) POS tags to remove from the vocabulary when creating the matrix
    --Output--
    Returns the tf/tfidf/binary matrix of the textacy corpus.
    '''
    for doc in self.corpus:
        self.terms_list.append(list(doc.to_terms_list(ngrams=1, named_entities=True,
                                                      normalize='lemma', as_strings=True,
                                                      filter_stops=True, filter_punct=True,
                                                      exclude_pos=exclude_pos)))
    self.vectorizer = textacy.Vectorizer(weighting=weighting, normalize=True,
                                         smooth_idf=True, min_df=min_df, max_df=max_df,
                                         max_n_terms=max_n_terms)
    self.tfidf = self.vectorizer.fit_transform(self.terms_list)
    return self.tfidf
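A hedged usage sketch: `pipeline` is a hypothetical wrapper object that exposes a textacy corpus, an empty terms_list, and the vectorize() method above.

# Build a tf-idf matrix from the wrapper's corpus and peek at the learned vocabulary.
tfidf_matrix = pipeline.vectorize(weighting='tfidf', min_df=2, max_df=0.9)
print(tfidf_matrix.shape)
for term_id, term in sorted(pipeline.vectorizer.id_to_term.items())[:10]:
    print(term_id, term)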
def fit(self):
    if self.refit:
        # Drop any cached vectorizer so it gets rebuilt from scratch.
        try:
            os.remove('./vectorizer.pkl')
        except FileNotFoundError:
            pass
    try:
        self.vectorized = joblib.load('vectorizer.pkl')
    except Exception as e:
        vectorizer = textacy.Vectorizer(weighting='tfidf', normalize=True, smooth_idf=True,
                                        min_df=10, max_df=0.95, max_n_terms=10000)
        self.vectorized.doc_term_matrix = vectorizer.fit_transform(
            (self.preprocess(doc) for doc in self.text_))
        self.vectorized.feature_names = vectorizer.feature_names
        self.vectorized.vectorizer = vectorizer
        with open('keywords.txt', 'w') as f:
            f.write(str(vectorizer.feature_names))
        joblib.dump(self.vectorized, 'vectorizer.pkl')
def start_cluster_batch():
    topic_list_query = "SELECT * from sharelock.topic_list"
    topic_rows = session.execute(topic_list_query)
    topic_row_list = list(topic_rows)
    topic_frames = pd.DataFrame(topic_row_list)
    for idx, frame in topic_frames.iterrows():
        topic = frame['topic']
        category = frame['category']
        query = ("SELECT * from sharelock.active_tweets where topic='" + topic +
                 "' order by inserted_at desc limit 30")
        rows = session.execute(query)
        ent_dict = {}
        sorted_json = {}
        row_list = []
        for row in rows:
            xd = json.loads(row.tweet_batch)
            row_list = row_list + xd
        sorted_result = pd.DataFrame(data=row_list)
        sorted_result.set_index('tweet_id')
        sorted_result = sorted_result.drop_duplicates(subset='tweet_id', keep='first')

        # Clean results by dropping items with similarity score 0.98 or higher
        sorted_result['tweet_tokens'] = sorted_result['tweet_text'].apply(nlp)
        sorted_result['tweet_clean_text'] = sorted_result['tweet_text'].apply(get_cleaned_text)
        sorted_result['tweet_clean_tokens'] = sorted_result['tweet_clean_text'].apply(nlp)
        sorted_result = remove_duplicate_posts(sorted_result)

        corpus = textacy.Corpus(lang="en_core_web_lg",
                                texts=list(sorted_result['tweet_text']),
                                metadatas=list(sorted_result['tweet_id']))
        terms_list = (doc.to_terms_list(ngrams=(1, 2, 3), named_entities=True,
                                        normalize=u'lemma', lemmatize=True, lowercase=True,
                                        as_strings=True, filter_stops=True, filter_punct=True,
                                        min_freq=1, exclude_pos=("PRON", "X", "PUNCT", "SYM"))
                      for doc in corpus)
        vectorizer = textacy.Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth')
        textacy.text_utils.clean_terms(terms_list)
        doc_term_matrix = vectorizer.fit_transform(terms_list)

        num_topics = int(len(sorted_result) / 10)
        model = textacy.tm.TopicModel('nmf', n_topics=num_topics)
        model.fit(doc_term_matrix)
        doc_topic_matrix = model.transform(doc_term_matrix)

        # Collect the top terms for each topic.
        topic_cluster = {}
        for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term,
                                                          topics=-1, top_n=8, weights=True):
            dct = dict(top_terms)
            tt_list = []
            for j in dct.keys():
                tt_list.append({"term": j, "weight": dct[j]})
            topic_cluster["topic-" + str(topic_idx)] = {"terms": tt_list}

        # Attach each tweet to at most one topic, picked from the top docs per topic.
        for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix,
                                                        topics=-1, top_n=6, weights=True):
            dct = dict(top_docs)
            tweet_in_topic_list = []
            for j in dct.keys():
                curr = sorted_result[sorted_result['tweet_id'] == corpus[j].metadata]
                curr_frame_row = curr.iloc[0]
                is_attached_to_topic = False
                for prev_topic in topic_cluster:
                    if 'tweets' in topic_cluster[prev_topic]:
                        tweet_list = topic_cluster[prev_topic]['tweets']
                        for tweet in tweet_list:
                            if tweet['tweet_id'] == curr.iloc[0]['tweet_id']:
                                is_attached_to_topic = True
                                break
                if not is_attached_to_topic:
                    tweet_in_topic_list.append({
                        "tweet_id": curr.iloc[0]['tweet_id'],
                        "tweet_text": curr.iloc[0]['tweet_text'],
                        "user_score": str(curr.iloc[0]['user_score']),
                        "raw_score": str(curr.iloc[0]['raw_score'])
                    })
            if tweet_in_topic_list:
                topic_cluster["topic-" + str(topic_idx)]['tweets'] = tweet_in_topic_list

        # Build candidate titles for each topic from its highest-scoring sentences.
        for curr_topic in topic_cluster:
            if 'tweets' in topic_cluster[curr_topic]:
                sent_weights = []
                for tweet in topic_cluster[curr_topic]['tweets']:
                    sent_weights = sent_weights + get_sent_weights(
                        tweet, topic_cluster[curr_topic]['terms'])
                sent_weights = sorted(sent_weights, key=lambda x: x['final_score'], reverse=True)
                top_sents = sent_weights[0:2]
                sorted_top_sents = sorted(sent_weights, key=lambda x: x['ent_score'], reverse=True)
                topic_title_list = []
                for sent in sorted_top_sents:
                    if sent['structure_penalty'] < 50 and sent['word_score'] > 0:
                        topic_title_list.append(sent['text'].strip('\n'))
                topic_cluster[curr_topic]['title'] = topic_title_list

        # Keep only topics that actually received tweets, then persist the cluster.
        result_dict = {}
        for k in topic_cluster.keys():
            if 'tweets' in topic_cluster[k]:
                result_dict[k] = topic_cluster[k]

        insert_at = datetime.datetime.now().timestamp()
        insert_values = [topic, category, insert_at, json.dumps(result_dict)]
        sql_query = ("INSERT into sharelock.topic_clusters "
                     "(topic, category, inserted_at, tweet_cluster) values (?, ?, ?, ?)")
        try:
            prepared = session.prepare(sql_query)
            session.execute(prepared, insert_values)
        except Exception as e:
            print(e)
def compute(corpus, tick=utility.noop, method='sklearn_lda', vec_args=None, term_args=None, tm_args=None, **args):
    tick()
    vec_args = utility.extend({}, DEFAULT_VECTORIZE_PARAMS, vec_args)

    terms = [list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, term_args)]
    fx_terms = lambda: terms

    perplexity_score = None
    coherence_score = None
    vectorizer = None
    doc_topic_matrix = None
    doc_term_matrix = None
    documents = textacy_utility.get_corpus_documents(corpus)

    if method.startswith('sklearn'):
        vectorizer = textacy.Vectorizer(**vec_args)
        doc_term_matrix = vectorizer.fit_transform(fx_terms())
        model = textacy.TopicModel(method.split('_')[1], **tm_args)
        model.fit(doc_term_matrix)
        tick()
        doc_topic_matrix = model.transform(doc_term_matrix)
        tick()
        id2word = vectorizer.id_to_term
        bow_corpus = gensim.matutils.Sparse2Corpus(doc_term_matrix, documents_columns=False)
        # FIXME!!!
        perplexity_score = None
        coherence_score = None
    elif method.startswith('gensim_'):
        algorithm = method.split('_')[1].upper()
        id2word = gensim.corpora.Dictionary(fx_terms())
        bow_corpus = [id2word.doc2bow(tokens) for tokens in fx_terms()]
        if args.get('tfidf_weiging', False):
            # assert algorithm != 'MALLETLDA', 'MALLET training model cannot (currently) use TFIDF weighed corpus'
            tfidf_model = gensim.models.tfidfmodel.TfidfModel(bow_corpus)
            bow_corpus = [tfidf_model[d] for d in bow_corpus]
        algorithms = setup_gensim_algorithms(corpus, bow_corpus, id2word, tm_args)
        engine = algorithms[algorithm]['engine']
        engine_options = algorithms[algorithm]['options']
        model = engine(**engine_options)
        if hasattr(model, 'log_perplexity'):
            perplexity_score = model.log_perplexity(bow_corpus, len(bow_corpus))
        try:
            coherence_model_lda = gensim.models.CoherenceModel(
                model=model, texts=fx_terms(), dictionary=id2word, coherence='c_v')
            coherence_score = coherence_model_lda.get_coherence()
        except Exception as ex:
            logger.error(ex)
            coherence_score = None

    processed = topic_model_utility.compile_metadata(
        model, bow_corpus, id2word, documents,
        vectorizer=vectorizer, doc_topic_matrix=doc_topic_matrix, n_tokens=200)

    model_data = types.SimpleNamespace(
        topic_model=model,
        id2term=id2word,
        bow_corpus=bow_corpus,
        doc_term_matrix=doc_term_matrix,
        # doc_topic_matrix=doc_topic_matrix,
        # vectorizer=vectorizer,
        processed=processed,
        perplexity_score=perplexity_score,
        coherence_score=coherence_score,
        options=dict(method=method, vec_args=vec_args, term_args=term_args, tm_args=tm_args, **args),
        coherence_scores=None)

    tick(0)
    return model_data
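A standalone sketch of the gensim coherence step that compute() relies on in its gensim branch; the tokenized documents and model settings here are illustrative only, and the scoring is wrapped in try/except just as in the function above.

import gensim

tokenized_docs = [["topic", "model", "coherence"],
                  ["gensim", "dictionary", "bow", "corpus"],
                  ["topic", "coherence", "score", "model"]]
id2word = gensim.corpora.Dictionary(tokenized_docs)
bow_corpus = [id2word.doc2bow(tokens) for tokens in tokenized_docs]
lda = gensim.models.LdaModel(corpus=bow_corpus, id2word=id2word,
                             num_topics=2, passes=2, random_state=1)
try:
    cm = gensim.models.CoherenceModel(model=lda, texts=tokenized_docs,
                                      dictionary=id2word, coherence='c_v')
    print(cm.get_coherence())
except Exception as ex:
    print(ex)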
for sgr in singlerank:
    print(sgr)

print("==textrank==")
for sgr in textrank:
    print(sgr)

print("==key_terms_from_semantic_network==")
for trip in key_terms_from_semantic_network:
    print(trip)

print("==matches==")
for match in matches:
    print(match)
print("\n")

vectorizer = textacy.Vectorizer(weighting='tfidf', normalize=True, smooth_idf=True,
                                min_df=3, max_df=0.95)
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in corpus))
print(repr(doc_term_matrix))

models = ['nmf', 'lda', 'lsa']
for m in models:
    model = textacy.TopicModel(m, n_topics=10)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    print("==", m, "==")
    print(doc_topic_matrix.shape)
    for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
        print('topic', topic_idx, ':', ' '.join(top_terms))
nlp = spacy.load('en')
debates = []
data = csv.reader(open('../debate_csvs/HanDeSeT.csv', 'r'))
for row in data:
    # adapted for HanDeSeT, which features 7 columns of text per document (row)
    debates.append([row[1], row[6] + row[7] + row[8] + row[9] + row[10] + row[11]])

df = pd.DataFrame(debates, columns=['title', 'text'])
chat_concat = (df.sort_values('title')
                 .groupby('title')['text']
                 .agg(lambda col: '\n'.join(col.astype(str))))
docs = list(chat_concat.apply(lambda x: nlp(x)))
corpus = textacy.corpus.Corpus(nlp, docs=docs)

vectorizer = textacy.Vectorizer(tf_type='linear', apply_idf=True, idf_type='smooth',
                                norm='l2', min_df=2, max_df=5)
doc_term_matrix = vectorizer.fit_transform(
    (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True)
     for doc in corpus))

model = textacy.TopicModel('nmf', n_topics=10)
model.fit(doc_term_matrix)
doc_topic_matrix = model.transform(doc_term_matrix)
for topic_idx, top_terms in model.top_topic_terms(vectorizer.id_to_term, top_n=10):
    print('topic', topic_idx, ':', ' '.join(top_terms))
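An optional follow-up sketch (not part of the original script): inspect which debates load most heavily on each topic, reusing doc_topic_matrix and the chat_concat index built above.

# top_topic_docs yields (topic index, [(doc index, weight), ...]) pairs when weights=True.
for topic_idx, top_docs in model.top_topic_docs(doc_topic_matrix, top_n=3, weights=True):
    titles = [chat_concat.index[doc_idx] for doc_idx, weight in top_docs]
    print('topic', topic_idx, ':', titles)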