# Imports assumed by this section (the original module may organize them
# differently). `word_ngrams` is a project-local helper module.
import os
import pickle
from os import path
from time import time

import joblib  # older scikit-learn versions bundled this as sklearn.externals.joblib
import numpy as np
from gensim import corpora, models
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances

import word_ngrams


def word_unigrams_5000_concat_tf_l2_holdout_unlbled_test(headlines, bodies):
    """
    Simple bag-of-words feature extraction with the term frequencies of words as
    feature vectors: 5000 features for the head + 5000 for the body, head and
    body concatenated, l2-normalized, with bleeding (the BoW vocabulary is built
    on train + test + holdout + unlabeled test set).
    """
    def combine_head_and_body(headlines, bodies):
        return [headline + " " + body for headline, body in zip(headlines, bodies)]

    def get_features(vocab):
        # NOTE: use_idf=True makes these tf-idf weights, despite the "tf" in the
        # function name.
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=True,
                                          norm="l2", stop_words='english')
        X_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=True,
                                          norm="l2", stop_words='english')
        X_body = vectorizer_body.fit_transform(bodies)

        X = np.concatenate([X_head.toarray(), X_body.toarray()], axis=1)
        return X

    # get the headlines and bodies of the train, test and holdout sets
    h, b = word_ngrams.get_head_body_tuples(include_holdout=True)

    # add the unlabeled test data words to the BoW of train+test+holdout data
    h_unlbled_test, b_unlbled_test = word_ngrams.get_head_body_tuples_unlbled_test()
    h.extend(h_unlbled_test)
    b.extend(b_unlbled_test)

    # create the vocabulary out of the BoW
    tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
                            max_features=5000, use_idf=True, norm='l2')
    tfidf.fit_transform(combine_head_and_body(h, b))
    vocab = tfidf.vocabulary_

    X = get_features(vocab)
    return X
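# Usage sketch (hypothetical data; assumes the FNC-1 corpora that `word_ngrams`
# loads internally are in place):
#
#   headlines = ["Police investigate report", "New study released"]
#   bodies = ["Officers said on Monday ...", "Researchers found that ..."]
#   X = word_unigrams_5000_concat_tf_l2_holdout_unlbled_test(headlines, bodies)
#   # X.shape == (2, 10000): 5000 head dimensions followed by 5000 body dimensions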
def NMF_topics(headlines, bodies, n_topics=300, include_holdout=False,
               include_unlbled_test=False, cosinus_dist=True):
    """
    Implements non-negative matrix factorization. Calculates the cosine distance
    between the resulting head and body vectors, or just concatenates them.

    Links:
    http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-topics-extraction-with-nmf-lda-py
    https://pypi.python.org/pypi/lda (at the bottom, see suggestions like MALLET, hca)
    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text
    """
    # TODO check https://people.cs.umass.edu/~wallach/posters/bbow.pdf
    # TODO use bigrams, too
    # TODO use topics extracted with glove / word2vec
    # TODO use wikipedia corpus?!

    features_dir = "%s/data/fnc-1/features" % (
        path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))

    def combine_head_and_body(headlines, bodies):
        return [headline + " " + body for headline, body in zip(headlines, bodies)]

    def get_all_data(head_and_body, filename):
        if not os.path.exists(features_dir + "/" + filename + ".vocab"):
            vectorizer_all = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
                                             use_idf=True, norm='l2')
            X_all = vectorizer_all.fit_transform(head_and_body)
            vocab = vectorizer_all.vocabulary_
            print("NMF_topics: complete vocabulary length=" + str(len(vocab)))
            with open(features_dir + "/" + filename + ".vocab", 'wb') as handle:
                pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
            return X_all, vocab
        else:
            with open(features_dir + "/" + filename + ".vocab", 'rb') as handle:
                vocab = pickle.load(handle)
            vectorizer_all = TfidfVectorizer(vocabulary=vocab, norm='l2')
            X_all = vectorizer_all.fit_transform(head_and_body)
            return X_all, vectorizer_all.vocabulary_

    def get_vocab(head_and_body, filename):
        if not os.path.exists(features_dir + "/" + filename + ".vocab"):
            vectorizer_all = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
                                             use_idf=True, norm='l2')
            vectorizer_all.fit_transform(head_and_body)
            vocab = vectorizer_all.vocabulary_
            print("NMF_topics: complete vocabulary length=" + str(len(vocab)))
            with open(features_dir + "/" + filename + ".vocab", 'wb') as handle:
                pickle.dump(vocab, handle, protocol=pickle.HIGHEST_PROTOCOL)
            return vocab
        else:
            with open(features_dir + "/" + filename + ".vocab", 'rb') as handle:
                return pickle.load(handle)

    def get_features(head_and_body):
        filename = "NMF_topics" + str(n_topics) + "topics"
        if include_holdout:
            filename += "_holdout"
        if include_unlbled_test:
            filename += "_unlbled_test"

        if not os.path.exists(features_dir + "/" + filename + ".pkl"):
            X_all, vocab = get_all_data(head_and_body, filename)

            # Calculates the n most important topics of the bodies. Each topic
            # contains all words, ordered by importance. The more important topic
            # words a body contains for a certain topic, the higher its value for
            # this topic.
            nfm = NMF(n_components=n_topics, random_state=1,
                      alpha=.1)  # newer scikit-learn splits `alpha` into `alpha_W`/`alpha_H`

            print("NMF_topics: fit and transform body")
            t0 = time()
            nfm.fit_transform(X_all)
            print("done in %0.3fs." % (time() - t0))

            with open(features_dir + "/" + filename + ".pkl", 'wb') as handle:
                joblib.dump(nfm, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            vocab = get_vocab(head_and_body, filename)
            with open(features_dir + "/" + filename + ".pkl", 'rb') as handle:
                nfm = joblib.load(handle)

        vectorizer_head = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        print("NMF_topics: transform head and body")
        # Apply the NMF model trained on the body topics to the headlines as well:
        # if a headline and a body share topics, their vectors should be similar.
        nfm_head_matrix = nfm.transform(X_train_head)
        nfm_body_matrix = nfm.transform(X_train_body)

        if not cosinus_dist:
            return np.concatenate([nfm_head_matrix, nfm_body_matrix], axis=1)
        else:
            # calculate the cosine distance between body and head
            X = []
            for i in range(len(nfm_head_matrix)):
                X_head_vector = np.array(nfm_head_matrix[i]).reshape((1, -1))  # 1d arrays are deprecated here
                X_body_vector = np.array(nfm_body_matrix[i]).reshape((1, -1))
                cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
                X.append(cos_dist.tolist())
            return X

    h, b = word_ngrams.get_head_body_tuples(include_holdout=include_holdout)
    if include_unlbled_test:
        h_unlbled_test, b_unlbled_test = word_ngrams.get_head_body_tuples_unlbled_test()
        h.extend(h_unlbled_test)
        b.extend(b_unlbled_test)

    head_and_body = combine_head_and_body(h, b)
    X = get_features(head_and_body)
    return X
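# A minimal, self-contained sketch of the head/body cosine-distance step used in
# NMF_topics above, on dummy topic vectors (values are illustrative only):
#
#   head_topics = np.array([[0.7, 0.2, 0.1], [0.1, 0.1, 0.8]])
#   body_topics = np.array([[0.6, 0.3, 0.1], [0.8, 0.1, 0.1]])
#   dists = [cosine_distances(h.reshape(1, -1), b.reshape(1, -1)).flatten().tolist()
#            for h, b in zip(head_topics, body_topics)]
#   # dists[0] is close to 0 (head and body share a topic mixture);
#   # dists[1] is much larger (they talk about different topics)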
def get_features(n_topics):
    # Inner helper of latent_semantic_indexing_gensim_concat (see the log messages
    # below); relies on `headlines`, `bodies`, `include_holdout`,
    # `include_unlbled_test` and `combine_and_tokenize_head_and_body` (defined
    # elsewhere in the module) from the enclosing scope.
    features_dir = "%s/data/fnc-1/features" % (
        path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))))

    filename = "lsi_gensim_concat_" + str(n_topics) + "topics"
    if include_holdout:
        filename += "_holdout"
    if include_unlbled_test:
        filename += "_unlbled_test"

    h, b = word_ngrams.get_head_body_tuples(include_holdout=include_holdout)
    if include_unlbled_test:
        h_unlbled_test, b_unlbled_test = word_ngrams.get_head_body_tuples_unlbled_test()
        h.extend(h_unlbled_test)
        b.extend(b_unlbled_test)

    head_and_body = combine_and_tokenize_head_and_body(
        h, b, file_path=features_dir + "/" + filename + ".tokens")

    if os.path.exists(features_dir + "/" + filename + ".dict"):
        print("latent_semantic_indexing_gensim_concat: dict found and loaded")
        dictionary = corpora.Dictionary.load(features_dir + "/" + filename + ".dict")
    else:
        print("latent_semantic_indexing_gensim_concat: create new dict")
        dictionary = corpora.Dictionary(head_and_body)
        dictionary.save(features_dir + "/" + filename + ".dict")

    if os.path.exists(features_dir + "/" + filename + ".lsi"):
        print("latent_semantic_indexing_gensim_concat: found lsi model")
        lsi = models.LsiModel.load(features_dir + "/" + filename + ".lsi")
    else:
        print("latent_semantic_indexing_gensim_concat: build corpus and tfidf corpus")
        corpus = [dictionary.doc2bow(text) for text in head_and_body]
        tfidf = models.TfidfModel(corpus)  # https://stackoverflow.com/questions/6287411/lsi-using-gensim-in-python
        corpus_tfidf = tfidf[corpus]
        print("latent_semantic_indexing_gensim_concat: create new lsi model")
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=n_topics)
        lsi.save(features_dir + "/" + filename + ".lsi")

    # get the tfidf corpus of head and body
    corpus_train = [dictionary.doc2bow(text)
                    for text in combine_and_tokenize_head_and_body(headlines, bodies)]
    tfidf_train = models.TfidfModel(corpus_train)
    corpus_train_tfidf = tfidf_train[corpus_train]
    corpus_lsi = lsi[corpus_train_tfidf]

    # The code assumes the tokenized corpus lists all headlines first and all
    # bodies second, so the first half of the LSI vectors belongs to the heads
    # and the second half to the bodies.
    X_head = []
    X_body = []
    i = 0
    for doc in corpus_lsi:
        if i < len(corpus_lsi) // 2:
            X_head_vector_filled = np.zeros(n_topics, dtype=np.float64)
            for topic_id, prob in doc:
                X_head_vector_filled[topic_id] = prob
            X_head.append(X_head_vector_filled)
        else:
            X_body_vector_filled = np.zeros(n_topics, dtype=np.float64)
            for topic_id, prob in doc:
                X_body_vector_filled[topic_id] = prob
            X_body.append(X_body_vector_filled)
        i += 1

    X = np.concatenate([X_head, X_body], axis=1)
    return X
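# Note: the manual zero-fill loop above could also be written with gensim's own
# helper, assuming the same heads-first/bodies-second corpus ordering (equivalent
# result up to dtype):
#
#   from gensim import matutils
#   dense = matutils.corpus2dense(corpus_lsi, num_terms=n_topics).T  # (n_docs, n_topics)
#   X_head, X_body = dense[:len(dense) // 2], dense[len(dense) // 2:]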
def latent_dirichlet_allocation_cos(headlines, bodies, n_topics=25, include_holdout=False,
                                    use_idf=False, term_freq=True, incl_unlbled_test=False):
    """
    Sklearn LDA implementation based on the 5000 most important words (ranked by
    the term frequencies of the train+test data => bleeding). Returns a feature
    vector of cosine distances between the topic models of headlines and bodies.

    Links:
    https://pypi.python.org/pypi/lda (at the bottom, see suggestions like MALLET, hca)
    https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text
    """
    # TODO check https://people.cs.umass.edu/~wallach/posters/bbow.pdf
    # TODO check with bigrams, too
    # TODO try to use embeddings glove / word2vec

    def print_top_words(model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    def combine_head_and_body(headlines, bodies):
        return [headline + " " + body for headline, body in zip(headlines, bodies)]

    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # Calculates the n most important topics of the bodies. Each topic contains
        # all words, ordered by importance. The more important topic words a body
        # contains for a certain topic, the higher its value for this topic.
        # `n_components` is the current scikit-learn name; old versions called this
        # parameter `n_topics`.
        lda_body = LatentDirichletAllocation(n_components=n_topics, learning_method='online',
                                             random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation_cos: transform head")
        # Apply the LDA trained on the body topics to the headlines as well: if a
        # headline and a body share topics, their vectors should be similar.
        lda_head_matrix = lda_body.transform(X_train_head)

        # print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        # calculate the cosine distance between body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d arrays are deprecated here
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X

    if incl_unlbled_test:
        h, b = word_ngrams.get_head_body_tuples(include_holdout=True)
        h_test, b_test = word_ngrams.get_head_body_tuples_unlbled_test()
        h.extend(h_test)
        b.extend(b_test)

        tfidf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english',
                                max_features=5000, use_idf=use_idf, norm='l2')
        tfidf.fit_transform(combine_head_and_body(h, b))
        vocab = tfidf.vocabulary_
    else:
        vocab = word_ngrams.create_word_ngram_vocabulary(ngram_range=(1, 1), max_features=5000,
                                                         lemmatize=False, term_freq=term_freq,
                                                         norm='l2', include_holdout=include_holdout,
                                                         use_idf=use_idf)

    X = get_features(vocab)
    return X
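# Usage sketch (hypothetical data): with cosine distances, each feature vector is
# a single distance per headline/body pair:
#
#   X = latent_dirichlet_allocation_cos(headlines, bodies, n_topics=25)
#   # X is a list of 1-element lists, e.g. [[0.23], [0.87], ...]; values near 0
#   # suggest the headline and body share a similar topic mixture.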