def updated_topic_extraction(corpus, tm_obj, cluster_num):
    '''
    Main function of topic modeling when a new document is assigned
    to the nearest cluster
    '''
    n_topics = int(os.getenv('TOPIC_NUMBER_PER_CLUSTER'))
    print("Cluster #{}:".format(cluster_num))
    norm_corpus = normalize_corpus(corpus)
    vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                    feature_type='tfidf')
    feature_names = vectorizer.get_feature_names()
    # Update the model object
    tm_obj.fit_transform(tfidf_matrix)
    weights = tm_obj.components_
    topics = get_topics_terms_weights(weights, feature_names)
    print_topics_udf(topics=topics,
                     total_topics=n_topics,
                     num_terms=10,
                     display_weights=True)
    # Return the updated model object
    return tm_obj
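# Illustrative usage (a sketch, not part of the original module): re-fit a
# topic model for the cluster a new document was assigned to. 'cluster_docs',
# 'new_doc', and the LDA settings below are assumptions for this example.
if __name__ == '__main__':
    os.environ.setdefault('TOPIC_NUMBER_PER_CLUSTER', '2')  # assumed value
    cluster_docs = ['first archived document text',
                    'second archived document text']
    new_doc = 'text of the newly ingested document'
    lda = LatentDirichletAllocation(n_components=2, random_state=42)
    lda = updated_topic_extraction(cluster_docs + [new_doc], lda,
                                   cluster_num=0)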
def topic_extraction(documents, labels):
    '''
    Main function of topic modeling
    '''
    num_clusters = len(set(labels))
    n_topics = int(os.getenv('TOPIC_NUMBER_PER_CLUSTER'))
    matched = False
    tm_obj = []
    for c in range(num_clusters):
        print("=" * 70)
        print("Cluster #{}:".format(c))
        corpus = [document for i, document in enumerate(documents)
                  if labels[i] == c]
        norm_corpus = normalize_corpus(corpus)
        vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf')
        feature_names = vectorizer.get_feature_names()
        if os.getenv('TOPIC_MODELING') == "lda":
            # Use Latent Dirichlet Allocation for topic modeling
            lda = LatentDirichletAllocation(n_components=n_topics,
                                            max_iter=1000,
                                            learning_method='online',
                                            learning_offset=10.,
                                            random_state=42)
            lda.fit(tfidf_matrix)
            weights = lda.components_
            matched = True
            tm_obj.append(lda)
        if os.getenv('TOPIC_MODELING') == "nmf":
            # Use Non-negative Matrix Factorization for topic modeling
            nmf = NMF(n_components=n_topics,
                      random_state=42,
                      alpha=.1,
                      l1_ratio=.5)
            nmf.fit(tfidf_matrix)
            weights = nmf.components_
            matched = True
            tm_obj.append(nmf)
        if not matched:
            raise ValueError("Unknown topic modeling algorithm!")
        topics = get_topics_terms_weights(weights, feature_names)
        print_topics_udf(topics=topics,
                         total_topics=n_topics,
                         num_terms=10,
                         display_weights=True)
    return tm_obj
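# Illustrative call (a sketch): 'documents' and the cluster labels would come
# from the categorization pipeline's main(); the env values are assumptions.
# os.environ['TOPIC_MODELING'] = 'lda'
# os.environ['TOPIC_NUMBER_PER_CLUSTER'] = '3'
# tm_objs = topic_extraction(documents, km_obj.labels_)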
def lsa_text_summarizer(documents, num_sentences=2,
                        num_topics=2, feature_type='frequency',
                        sv_threshold=0.5):
    vec, dt_matrix = build_feature_matrix(documents,
                                          feature_type=feature_type)
    td_matrix = dt_matrix.transpose()
    td_matrix = td_matrix.multiply(td_matrix > 0)
    u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
    # Zero out singular values below the threshold fraction of the largest one
    min_sigma_value = max(s) * sv_threshold
    s[s < min_sigma_value] = 0
    salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
    # Pick the top sentences, then restore document order
    top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
    top_sentence_indices.sort()
    for index in top_sentence_indices:
        print(documents[index])  # 'documents' holds the sentence list
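# Illustrative usage (a sketch): the sentence list would come from
# parse_document and normalize_corpus, as in the driver scripts further below.
# norm_sentences = normalize_corpus(parse_document(toy_text), lemmatize=False)
# lsa_text_summarizer(norm_sentences, num_sentences=3, num_topics=2)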
def textrank_text_summarizer(documents, num_sentences=2,
                             feature_type='frequency'):
    vec, dt_matrix = build_feature_matrix(documents,
                                          feature_type=feature_type)
    # Pairwise sentence similarities from the document-term matrix
    similarity_matrix = (dt_matrix * dt_matrix.T)
    # Run PageRank over the sentence-similarity graph
    similarity_graph = networkx.from_scipy_sparse_matrix(similarity_matrix)
    scores = networkx.pagerank(similarity_graph)
    ranked_sentences = sorted(((score, index)
                               for index, score in scores.items()),
                              reverse=True)
    top_sentence_indices = [ranked_sentences[index][1]
                            for index in range(num_sentences)]
    top_sentence_indices.sort()
    for index in top_sentence_indices:
        print(documents[index])  # 'documents' holds the sentence list
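# Illustrative usage (a sketch, mirroring the LSA call above):
# textrank_text_summarizer(norm_sentences, num_sentences=3,
#                          feature_type='tfidf')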
def run():
    answers = [
        'Functions are used as one-time processing snippet for inling and jumbling the code.',
        'Functions are used for reusing, inlining and jumbling the code.',
        'Functions are used as one-time processing snippet for inlining and organizing the code.',
        'Functions are used as one-time processing snippet for modularizing and jumbling the code.',
        'Functions are used for reusing, inling and organizing the code.',
        'Functions are used as one-time processing snippet for modularizing and organizing the code.',
        'Functions are used for reusing, modularizing and jumbling the code.',
        'Functions are used for reusing, modularizing and organizing the code.'
    ]
    model_answer = [
        "Functions are used for reusing, modularizing and organizing the code."
    ]
    # normalize answers
    norm_corpus = normalize_corpus(answers, lemmatize=True)
    # normalize model_answer
    norm_model_answer = normalize_corpus(model_answer, lemmatize=True)
    vectorizer, corpus_features = build_feature_matrix(norm_corpus,
                                                       feature_type='frequency')
    # extract features from model_answer
    model_answer_features = vectorizer.transform(norm_model_answer)
    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)
    for index, doc in enumerate(model_answer):
        doc_features = model_answer_features[index]
        bm25_scores = compute_bm25_similarity(doc_features,
                                              corpus_features,
                                              doc_lengths,
                                              avg_dl,
                                              corpus_term_idfs,
                                              k1=1.5, b=0.75)
        semantic_similarity_scores = []
        for sentence in answers:
            score = (sentence_similarity(sentence, model_answer[0]) +
                     sentence_similarity(model_answer[0], sentence)) / 2
            semantic_similarity_scores.append(score)
        print('Model Answer', ':', doc)
        print('-' * 40)
        doc_index = 0
        for score_tuple in zip(semantic_similarity_scores, bm25_scores):
            # Blend semantic similarity (scaled to 0-10) with the BM25 score,
            # then bucket the blend into a 0-5 grade
            sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
            if sim_score < 1:
                sim_score = 0
            elif 1 <= sim_score <= 2:
                sim_score = 1
            elif 2 < sim_score <= 4:
                sim_score = 2
            elif 4 < sim_score <= 6:
                sim_score = 3
            elif 6 < sim_score <= 8:
                sim_score = 4
            elif 8 < sim_score <= 10:
                sim_score = 5
            print('Ans num: {} Score: {}\nAnswer: {}'.format(
                doc_index + 1, sim_score, answers[doc_index]))
            print('-' * 40)
            doc_index = doc_index + 1
        print()
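# A minimal sketch (under assumed interfaces) of the Okapi BM25 scoring that
# compute_bm25_similarity performs; the real helper lives in the project's
# utility module. 'doc_tf' maps terms to counts, 'idf' maps terms to idf values.
def bm25_score_sketch(query_terms, doc_tf, idf, doc_len, avg_dl,
                      k1=1.5, b=0.75):
    score = 0.0
    for term in query_terms:
        tf = doc_tf.get(term, 0)
        # Saturating term frequency with document-length normalization
        norm_tf = (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / avg_dl))
        score += idf.get(term, 0.0) * norm_tf
    return score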
docs = parse_document(toy_text)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.4)

sentences = parse_document(toy_text)
norm_sentences = normalize_corpus(sentences, lemmatize=False)
total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)
num_sentences = 3
num_topics = 2
vec, dt_matrix = build_feature_matrix(sentences, feature_type='frequency')
td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)
u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
sv_threshold = 0.5
min_sigma_value = max(s) * sv_threshold
s[s < min_sigma_value] = 0
salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
print(np.round(salience_scores, 2))
top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
top_sentence_indices.sort()
# Print the selected sentences in document order, as in lsa_text_summarizer
for index in top_sentence_indices:
    print(sentences[index])
def main():
    no_list = 5000
    dataframe = pd.read_csv('./data/final_questions_data.csv',
                            names=['user', 'college', 'category',
                                   'problems', 'problem_link'])
    answers = list(dataframe['problems'][1:no_list])
    # Cache the normalized corpus so repeated runs skip normalization
    if os.path.isfile('norm_corpus.csv'):
        read_df = pd.read_csv('norm_corpus.csv', names=['norm'],
                              index_col=False)
        norm_corpus = read_df['norm'][1:].values.astype('U').tolist()
    else:
        norm_corpus = normalize_corpus(answers, lemmatize=True)
        write_df = pd.DataFrame(norm_corpus)
        write_df.to_csv('norm_corpus.csv', index=False, header=None)
    vectorizer, corpus_features = build_feature_matrix(norm_corpus,
                                                       feature_type='tfidf')
    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)
    for answer in answers:
        answers = list(dataframe['problems'][1:no_list])
        answers.remove(convert(answer))
        model_answer = [convert(answer)]  # one-element list, as the loops below expect
        print(model_answer)
        # normalize answers
        norm_corpus = normalize_corpus(answers, lemmatize=True)
        # normalize model_answer
        norm_model_answer = normalize_corpus(model_answer, lemmatize=True)
        # extract features from model_answer
        model_answer_features = vectorizer.transform(norm_model_answer)
        for index, doc in enumerate(model_answer):
            doc_features = model_answer_features[index]
            bm25_scores = compute_bm25_similarity(doc_features,
                                                  corpus_features,
                                                  doc_lengths,
                                                  avg_dl,
                                                  corpus_term_idfs,
                                                  k1=1.5, b=0.75)
            semantic_similarity_scores = []
            for sentence in answers:
                score = (sentence_similarity(sentence, model_answer[0]) +
                         sentence_similarity(model_answer[0], sentence)) / 2
                semantic_similarity_scores.append(score)
            print('Model Answer', ':', doc)
            print('-' * 40)
            doc_index = 0
            sim_scores = []
            for score_tuple in zip(semantic_similarity_scores, bm25_scores):
                sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
                sim_scores.append(sim_score)
            # print(sim_scores)
            # Indices of the five highest-scoring candidate answers
            print(sorted(range(len(sim_scores)),
                         key=lambda i: sim_scores[i])[-5:])
            break  # debug: handle only the first model answer
        print('Ans num: {} Score: {}\nAnswer: {}'.format(
            doc_index + 1, sim_score, answers[doc_index]))
        print('-' * 40)
        doc_index = doc_index + 1
        break  # debug: handle only the first question
toy_corpus = [
    'Python is a great Programming language',
    'Python and Java are popular Programming languages',
    'Among Programming languages, both Python and Java are the most used in Analytics',
    'The fox is quicker than the lazy dog',
    'The dog is smarter than the fox',
    'The dog, fox and cat are good friends'
]
query_docs = [
    'The fox is definitely smarter than the dog',
    'Java is a static typed programming language unlike Python',
    'I love to relax under the beautiful blue sky!'
]

# normalize and extract features from the toy corpus
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)
tfidf_vectorizer, tfidf_features = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1),
                                                        min_df=0.0,
                                                        max_df=1.0)
# normalize and extract features from the query corpus
norm_query_docs = normalize_corpus(query_docs, lemmatize=True)
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

def compute_cosine_similarity(doc_features, corpus_features, top_n=3):
    # get document vectors
    doc_features = doc_features[0]
    # compute similarities
    similarity = np.dot(doc_features, corpus_features.T)
    similarity = similarity.toarray()[0]
    # get docs with highest similarity scores
    # (assumed completion of the truncated function)
    top_docs = similarity.argsort()[::-1][:top_n]
    top_docs_with_score = [(index, round(similarity[index], 3))
                           for index in top_docs]
    return top_docs_with_score
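# Illustrative driver (a sketch based on the surrounding pipeline): rank the
# toy-corpus documents against each query document.
for index, doc in enumerate(query_docs):
    doc_tfidf = query_docs_tfidf[index]
    top_similar_docs = compute_cosine_similarity(doc_tfidf,
                                                 tfidf_features,
                                                 top_n=2)
    print('Query:', doc)
    for doc_index, sim_score in top_similar_docs:
        print('Similarity {}: {}'.format(sim_score, toy_corpus[doc_index]))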
for row in reader:  # each row is a list
    results.append(row[1])
# print(results)
sentences = results

# normalize corpus
norm_req_synopses = normalize_corpus(sentences,
                                     lemmatize=True,
                                     only_text_chars=False)
# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_req_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.1, max_df=0.9,
                                                  ngram_range=(1, 2))
# view number of features
# print(feature_matrix.shape)
# get feature names
feature_names = vectorizer.get_feature_names()
# print sample features
# print(feature_names[:20])
topn_features = 10
cluster_details = {}
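# A sketch (assumed continuation, not from the original file) of how
# cluster_details is commonly populated from a fitted KMeans model 'km':
def get_cluster_details_sketch(km, feature_names, topn_features):
    cluster_details = {}
    # Sort each centroid's feature weights in descending order
    ordered_centroids = km.cluster_centers_.argsort()[:, ::-1]
    for cluster_num in range(km.n_clusters):
        key_features = [feature_names[i] for i
                        in ordered_centroids[cluster_num, :topn_features]]
        cluster_details[cluster_num] = {'cluster_num': cluster_num,
                                        'key_features': key_features}
    return cluster_details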
def main():
    '''
    Main function of document categorization
    '''
    # Get a list of file names of all documents in a specified folder
    fnames = get_filenames()
    print("The total number of files: %g" % len(fnames))
    titles = []
    documents = []
    for i, fname in enumerate(fnames):
        print("*" * 70)
        print("File no.%d %s is being processed ..."
              % (i, os.path.basename(fname)))
        text, corrupted_files = tika_parser(fname)
        if text:  # ignore corrupted files
            all_tokens = preprocess_text(text)
            # Append this list as the new content describing the original document
            documents.append(all_tokens)
            # Keep only the file name without the extension
            title = os.path.splitext(os.path.basename(fname))[0]
            # Append the document title
            titles.append(title)
            # Write the document title and content to an SQLite database
            sqlite_entry(db, title, all_tokens)
    # Extract features from documents
    vectorizer, feature_matrix = build_feature_matrix(documents,
                                                      feature_type='tfidf',
                                                      min_df=0.0, max_df=1.0,
                                                      ngram_range=(1, 1))
    # Note that 'feature_matrix' is already normalized, so no extra
    # normalization is required
    # Save the vectorizer to a file in the current folder
    with open(os.getenv('VECTORIZER_PKL_FILENAME'), 'wb') as file:
        pickle.dump(vectorizer, file)
    print(feature_matrix.shape)
    # Get feature names
    feature_names = vectorizer.get_feature_names()
    # Get the number of top features describing each cluster centroid
    topn_features = int(os.getenv('FEATURE_NUMBER'))
    matched = False
    if os.getenv('CLUSTERING') == "affinity":
        from document_clustering import affinity_propagation, cluster_analysis
        from topic_modeling import topic_extraction
        # Get clusters using affinity propagation
        ap_obj, clusters = affinity_propagation(feature_matrix=feature_matrix)
        cl_obj = ap_obj
        cluster_analysis(ap_obj, feature_names, titles, clusters,
                         topn_features, feature_matrix)
        # Extract topics of each cluster
        tm_obj = topic_extraction(documents, ap_obj.labels_)
        matched = True
    if os.getenv('CLUSTERING') == "kmeans":
        from document_clustering import k_means, cluster_analysis
        from topic_modeling import topic_extraction
        # Get clusters using k-means
        num_clusters = int(os.getenv('CLUSTER_NUMBER'))
        km_obj, clusters = k_means(feature_matrix=feature_matrix,
                                   num_clusters=num_clusters)
        cl_obj = km_obj
        cluster_analysis(km_obj, feature_names, titles, clusters,
                         topn_features, feature_matrix)
        # Extract topics of each cluster
        tm_obj = topic_extraction(documents, km_obj.labels_)
        matched = True
    if os.getenv('CLUSTERING') == "hierarchical":
        from document_clustering import (ward_hierarchical_clustering,
                                         plot_hierarchical_clusters)
        data = pd.DataFrame({'Title': titles})
        # Build Ward's linkage matrix
        linkage_matrix = ward_hierarchical_clustering(feature_matrix)
        # Plot the dendrogram
        plot_hierarchical_clusters(linkage_matrix=linkage_matrix,
                                   data=data,
                                   figure_size=(8, 10))
        matched = True
    if not matched:
        raise ValueError("Unknown clustering algorithm!")
    # Save clustering and topic modeling objects in files in the current
    # folder (the hierarchical branch produces no model objects to save)
    if os.getenv('CLUSTERING') != "hierarchical":
        with open(os.getenv('CLUSTERING_PKL_FILENAME'), 'wb') as file:
            pickle.dump(cl_obj, file)
        with open(os.getenv('TOPIC_MODELING_PKL_FILENAME'), 'wb') as file:
            pickle.dump(tm_obj, file)
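# Illustrative environment configuration for the pipeline above. The variable
# names are the ones read via os.getenv(); the values are assumptions.
# CLUSTERING=kmeans
# CLUSTER_NUMBER=5
# FEATURE_NUMBER=10
# TOPIC_MODELING=lda
# TOPIC_NUMBER_PER_CLUSTER=3
# VECTORIZER_PKL_FILENAME=vectorizer.pkl
# CLUSTERING_PKL_FILENAME=clustering.pkl
# TOPIC_MODELING_PKL_FILENAME=topic_modeling.pkl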
print('Movie:', movie_titles[0])
print('Movie Synopsis:', movie_synopses[0][:1000])

from normalization import normalize_corpus
from utils import build_feature_matrix

# normalize corpus
norm_movie_synopses = normalize_corpus(movie_synopses,
                                       lemmatize=True,
                                       only_text_chars=True)
# extract tf-idf features
vectorizer, feature_matrix = build_feature_matrix(norm_movie_synopses,
                                                  feature_type='tfidf',
                                                  min_df=0.24, max_df=0.85,
                                                  ngram_range=(1, 2))
# view number of features
print(feature_matrix.shape)
# get feature names
feature_names = vectorizer.get_feature_names()
# print sample features
print(feature_names[:20])

from sklearn.cluster import KMeans

def k_means(feature_matrix, num_clusters=5):
    km = KMeans(n_clusters=num_clusters,
                max_iter=10000)  # assumed completion; the call was truncated here
    km.fit(feature_matrix)
    clusters = km.labels_
    return km, clusters
        tw = [term for term, wt in topic]
        print(tw[:num_terms] if num_terms else tw)
        print()

print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
                    num_terms=5,
                    display_weights=True)

# LSI custom built topic model
from utils import build_feature_matrix, low_rank_svd

norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                feature_type='tfidf')
td_matrix = tfidf_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)
total_topics = 2
feature_names = vectorizer.get_feature_names()
u, s, vt = low_rank_svd(td_matrix, singular_count=total_topics)
# Scale each topic's term loadings by its singular value
weights = u.transpose() * s[:, None]

def get_topics_terms_weights(weights, feature_names):
    feature_names = np.array(feature_names)
    sorted_indices = np.array([list(row[::-1])
                               for row in np.argsort(np.abs(weights))])
    # Assumed completion of this truncated helper: pair each topic's sorted
    # terms with their weights
    sorted_weights = np.array([list(wt[index])
                               for wt, index in zip(weights, sorted_indices)])
    sorted_terms = np.array([list(feature_names[row])
                             for row in sorted_indices])
    topics = [np.vstack((terms.T, term_weights.T)).T
              for terms, term_weights in zip(sorted_terms, sorted_weights)]
    return topics
# Using Gensim Summarization Method
docs = parse_document(document1)
text = ' '.join(docs)
text_summarization_gensim(text, summary_ratio=0.3)

sentences = parse_document(document1)
norm_sentences = normalize_corpus(sentences, lemmatize=False)
total_sentences = len(norm_sentences)
print('Total Sentences in Document:', total_sentences)
num_sentences = 3
num_topics = 1
vec, dt_matrix = build_feature_matrix(sentences, feature_type='frequency')
td_matrix = dt_matrix.transpose()
td_matrix = td_matrix.multiply(td_matrix > 0)
u, s, vt = low_rank_svd(td_matrix, singular_count=num_topics)
sv_threshold = 0.5
min_sigma_value = max(s) * sv_threshold
s[s < min_sigma_value] = 0
salience_scores = np.sqrt(np.dot(np.square(s), np.square(vt)))
print(np.round(salience_scores, 2))
top_sentence_indices = salience_scores.argsort()[-num_sentences:][::-1]
top_sentence_indices.sort()
# Print the selected sentences in document order, as in lsa_text_summarizer
for index in top_sentence_indices:
    print(sentences[index])
sample_docs = [100, 5817, 7626, 7356, 1008, 7155, 3533, 13010]
sample_data = [(test_reviews[index], test_sentiments[index])
               for index in sample_docs]
sample_data

# normalization
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=True,
                                      only_text_chars=True)
# feature extraction
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1),
                                                  min_df=0.0, max_df=1.0)

from sklearn.linear_model import SGDClassifier

# build the model
svm = SGDClassifier(loss='hinge', n_iter=500)
svm.fit(train_features, train_sentiments)

# normalize reviews
norm_test_reviews = normalize_corpus(test_reviews,
                                     lemmatize=True,
                                     only_text_chars=True)
# Divide the data into the data (review) and the label (sentiment) in both
# training and testing sets
train_reviews = np.array(train_data['review'])
train_sentiments = np.array(train_data['sentiment'])
test_reviews = np.array(test_data['review'])
test_sentiments = np.array(test_data['sentiment'])

# Normalize the training review data using the normalization.py module
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=False,
                                      only_text_chars=True)
# Extract features from these normalized training reviews
# - which features? Try other features using parameters provided in utils.py
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1),
                                                  min_df=0.0, max_df=1.0)

from sklearn.linear_model import SGDClassifier

# Build/train an SVM classifier model with the train features extracted from reviews
svm = SGDClassifier(loss='hinge', n_iter=500)
svm.fit(train_features, train_sentiments)  # We give the features and the correct labels

# Normalize the test reviews
norm_test_reviews = normalize_corpus(test_reviews,
                                     lemmatize=False,
                                     only_text_chars=True)
# Extract features from the normalized test reviews with the same vectorizer
test_features = vectorizer.transform(norm_test_reviews)
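# A sketch of the evaluation step that typically follows; the metric choice
# here (sklearn's accuracy_score) is an assumption.
from sklearn.metrics import accuracy_score

predicted_sentiments = svm.predict(test_features)
print('Accuracy:', accuracy_score(test_sentiments, predicted_sentiments))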
    'The dog, fox and cat are good friends'
]

# Documents that we will be measuring similarities for
query_docs = [
    'The fox is definitely smarter than the dog',
    'Java is a static typed programming language unlike Python',
    'I love to relax under the beautiful blue sky!'
]

# We normalize and extract features from the toy corpus
norm_corpus = normalize_corpus(toy_corpus, lemmatize=True)
# NB: As before, it returns the particular 'vectorizer' used as well as the
# extracted feature matrix
tfidf_vectorizer, tfidf_features = build_feature_matrix(norm_corpus,
                                                        feature_type='tfidf',
                                                        ngram_range=(1, 1),
                                                        min_df=0.0,
                                                        max_df=1.0)
# Similarly, we normalize and extract features from the query corpus
norm_query_docs = normalize_corpus(query_docs, lemmatize=True)
# We use the same vectorizer that we used to build the feature matrix for the
# corpus also for the query docs
query_docs_tfidf = tfidf_vectorizer.transform(norm_query_docs)

def compute_cosine_similarity(doc_features, corpus_features, top_n=3):
    # Get the document vector
    doc_features = doc_features[0]
    # Compute similarities as the dot product with the transposed corpus
    # feature matrix
    similarity = np.dot(doc_features, corpus_features.T)
    similarity = similarity.toarray()[0]
    # Get the documents with the highest similarity scores
    # (assumed completion of the truncated function, mirroring the variant above)
    top_docs = similarity.argsort()[::-1][:top_n]
    top_docs_with_score = [(index, round(similarity[index], 3))
                           for index in top_docs]
    return top_docs_with_score
# vectorizer, sample_features = build_feature_matrix(documents=norm_train_reviews,
#                                                    feature_type='tfidf',
#                                                    ngram_range=(1, 1),
#                                                    min_df=0.0, max_df=1.0)

# normalization
norm_train_reviews = normalize_corpus(train_reviews,
                                      lemmatize=True,
                                      only_text_chars=True)
# feature extraction using tf-idf with unigrams
vectorizer, train_features = build_feature_matrix(documents=norm_train_reviews,
                                                  feature_type='tfidf',
                                                  ngram_range=(1, 1),
                                                  min_df=0.0, max_df=1.0)
print(train_features)

# Import the SGDClassifier and LogisticRegression models for training and
# testing to compare the results
from sklearn.linear_model import SGDClassifier, LogisticRegression

# Build the model
svm = SGDClassifier(loss='hinge', n_iter=500)
# Train the model on the training set
svm.fit(train_features, train_sentiments)
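# Illustrative comparison run (a sketch): LogisticRegression is imported above
# but unused in this excerpt, so a minimal fit is shown for completeness.
lr = LogisticRegression()
lr.fit(train_features, train_sentiments)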
def run():
    """
    answers = ['Functions are used as one-time processing snippet for inling and jumbling the code.',
               'Functions are used for reusing, inlining and jumbling the code.',
               'Functions are used as one-time processing snippet for inlining and organizing the code.',
               'Functions are used as one-time processing snippet for modularizing and jumbling the code.',
               'Functions are used for reusing, inling and organizing the code.',
               'Functions are used as one-time processing snippet for modularizing and organizing the code.',
               'Functions are used for reusing, modularizing and jumbling the code.',
               'Functions are used for reusing, modularizing and organizing the code.']
    model_answer = ["Functions are used for reusing, modularizing and organizing the code."]
    """
    dev_questions = []
    dev_question_answers = []
    train_questions = []
    train_question_answers = []
    filep = os.path.dirname(os.path.abspath(__file__))
    # train_file = os.path.join(filep, "NQ-open.train.jsonl")
    # dev_file = os.path.join(filep, "NQ-open.efficientqa.dev.1.1.jsonl")
    train_file = os.path.join(filep, "test_train.jsonl")
    dev_file = os.path.join(filep, "test_dev.jsonl")
    with open(train_file, "r") as f:
        for line in f:
            d = json.loads(line)
            train_questions.append(d["question"])
            if "answer" not in d:
                d["answer"] = "random"
            train_question_answers.append(d["answer"])
    len_train = len(train_questions)
    with open(dev_file, "r") as f:
        for line in f:
            d = json.loads(line)
            dev_questions.append(d["question"])
            if "answer" not in d:
                d["answer"] = "random"
            dev_question_answers.append(d["answer"])
    len_dev = len(dev_questions)
    answers = train_questions
    model_answer = dev_questions
    # normalize answers
    norm_corpus = normalize_corpus(answers, lemmatize=True)
    print(sys.getsizeof(norm_corpus))
    print(len(norm_corpus))
    # normalize model_answer
    norm_model_answer = normalize_corpus(model_answer, lemmatize=True)
    vectorizer, corpus_features = build_feature_matrix(norm_corpus,
                                                       feature_type='frequency')
    # extract features from model_answer
    model_answer_features = vectorizer.transform(norm_model_answer)
    doc_lengths = [len(doc.split()) for doc in norm_corpus]
    avg_dl = np.average(doc_lengths)
    corpus_term_idfs = compute_corpus_term_idfs(corpus_features, norm_corpus)
    train_predict = [None] * len_dev
    dev_predict = [None] * len_dev
    for index, doc in enumerate(model_answer):
        print(index)
        doc_features = model_answer_features[index]
        # bm25_scores = compute_bm25_similarity(model_answer_features, corpus_features,
        #                                       doc_lengths, avg_dl, corpus_term_idfs,
        #                                       k1=0.82, b=0.68)
        bm25_scores = compute_bm25_similarity(doc_features,
                                              corpus_features,
                                              doc_lengths,
                                              avg_dl,
                                              corpus_term_idfs,
                                              k1=0.82, b=0.68)
        semantic_similarity_scores = []
        for sentence in answers:
            # Compare each training question against the current dev question
            score = (sentence_similarity(sentence, doc) +
                     sentence_similarity(doc, sentence)) / 2
            semantic_similarity_scores.append(score)
        doc_index = 0
        max_index = 0
        max_score = 0
        for score_tuple in zip(semantic_similarity_scores, bm25_scores):
            sim_score = ((score_tuple[0] * 10) + score_tuple[1]) / 2
            if sim_score > max_score:
                max_score = sim_score
                max_index = doc_index
            doc_index = doc_index + 1
        # Predict the answer of the best-matching training question
        dev_predict[index] = train_question_answers[max_index][0]
    predict_output = [None] * len_dev
    for i in range(len_dev):
        output_dict = {
            'question': dev_questions[i],
            'prediction': dev_predict[i]
        }
        predict_output[i] = output_dict
    pred_file = os.path.join(filep, 'ef_dev_predict.json')
    with open(pred_file, 'w') as output:
        output.write(json.dumps(predict_output, indent=4) + '\n')