def unigram_smoothing(data, model):
    td, idf, vocab = data
    # drop the held-out document (last column) before comparing
    td = td[:, :-1]
    lsa = LSA(model)
    res = lsa.unigram_smoothing()
    print(res.shape)
    # mean absolute reconstruction error per matrix cell
    print(np.abs(td - res).sum() / float(res.shape[0] * res.shape[1]))
def summarize(self, text, n_sents=3):
    """Summarize a given text and return its top sentences."""
    try:
        prediction = dict()
        if text:
            if self.lang_code in self.valid_langs:
                if Utility.get_doc_length(text) > self.n_words:
                    # generate sentences and normalized sentences from the text
                    sents, norm_sents = self.p.text_preprocessing(text)
                    # generate the document-term and term-document matrices
                    dt_matrix = self.generate_doc_term_matrix(norm_sents)
                    td_matrix = self.generate_term_doc_matrix(dt_matrix)
                    if self.method == "LSA":
                        lsa = LSA(self.k, td_matrix)
                        term_topic_matrix, singular_values, topic_doc_matrix = lsa.u, lsa.s, lsa.vt
                        # drop singular values below the given threshold
                        singular_values = lsa.filter_singular_values(
                            singular_values, self.sv_threshold)
                        # compute salience scores from the top singular values
                        # and the topic-document matrix
                        salience_scores = lsa.get_salience_scores(
                            singular_values, topic_doc_matrix)
                        # pick the top sentence indices for the summary
                        top_sentence_indices = lsa.get_top_sent_indices(
                            salience_scores, n_sents)
                        summary = self.generate_summary(
                            sents, top_sentence_indices)
                    elif self.method == "TEXT_RANK":
                        tr = TextRank(dt_matrix, td_matrix)
                        # build the sentence similarity graph
                        similarity_matrix = tr.similiarity_matrix
                        similarity_graph = tr.get_similarity_graph(
                            similarity_matrix)
                        # compute PageRank scores for all sentences
                        ranked_sents = tr.rank_sentences(similarity_graph)
                        # pick the top sentence indices for the summary
                        top_sentence_indices = tr.get_top_sentence_indices(
                            ranked_sents, n_sents)
                        summary = self.generate_summary(
                            sents, top_sentence_indices)
                    else:
                        return "no method found"
                    # clean up whitespace for readability
                    summary = Utility.remove_multiple_whitespaces(summary)
                    summary = Utility.remove_trailing_whitespaces(summary)
                    prediction["summary"] = summary
                    prediction["message"] = "successful"
                else:
                    return "requires at least {} words".format(self.n_words)
            else:
                return "language not supported"
        else:
            return "text content is required"
        return prediction
    except Exception:
        logging.error("exception occurred", exc_info=True)
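# A generic, self-contained sketch of the LSA sentence-selection idea used in
# the "LSA" branch above. This is NOT the project's LSA class: the library
# calls and the salience formula below are a common textbook variant, assumed
# here purely for illustration.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

sents = [
    "The cat sat on the mat.",
    "Dogs and cats are common pets.",
    "Latent semantic analysis factorizes the term-document matrix.",
    "The singular value decomposition reveals latent topics.",
]

# term-sentence matrix (terms as rows, sentences as columns)
td_matrix = TfidfVectorizer().fit_transform(sents).T.toarray()

# truncated SVD: keep k latent topics
k = 2
u, s, vt = np.linalg.svd(td_matrix, full_matrices=False)
s, vt = s[:k], vt[:k, :]

# salience of each sentence: length of its topic vector weighted by the
# singular values (one common scoring scheme for LSA summarization)
salience = np.sqrt((np.square(s)[:, None] * np.square(vt)).sum(axis=0))

# indices of the top sentences, restored to document order
n_sents = 2
top = np.sort(np.argsort(-salience)[:n_sents])
print(" ".join(sents[i] for i in top))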
def extract_file_features(filename, output_file):
    csvfile = open(output_file, 'w')
    wr = csv.writer(csvfile, quoting=csv.QUOTE_ALL, lineterminator='\n')
    header = ["id", "input_sim_article", "query_sim_article",
              "input_qcount", "input_qpercentage", "input_qonesen",
              "query_qcount", "query_qpercentage", "query_qonesen",
              "input_sim_word2vec", "query_sim_word2vec"]
    wr.writerow(header)

    data = pd.read_csv(filename, sep=';', encoding="utf-8", error_bad_lines=False)
    query_list = data[COL_QUERY_TEXT].unique()

    for index, row in data.iterrows():
        features = []
        try:
            query_text = row[COL_QUERY_TEXT]
            query_search = row[COL_QUERY_SEARCH]
            article = row[COL_ARTICLE_CONTENT]

            # collect every article retrieved for the same query and remember
            # the position of the current article among them
            col_article = data[data[COL_QUERY_TEXT] == query_text][COL_ARTICLE_CONTENT]
            documents = []
            for idx, art in enumerate(col_article):
                documents.append(art)
                if art == article:
                    cur_idx = idx

            # id
            # features.append(hashlib.sha1(article.rstrip().encode()).hexdigest())
            features.append(index)

            # LSA similarity (input text - article)
            similar = LSA(query_text, documents)
            features.append(similar.rank[cur_idx][1])

            # LSA similarity (search query - article)
            similar = LSA(query_search, documents)
            features.append(similar.rank[cur_idx][1])

            # word-count features (input text - article)
            query_count, query_percentage, query_onesen = word_count_features(query_text, article)
            features.append(query_count)
            features.append(query_percentage)
            features.append(query_onesen)

            # word-count features (search query - article)
            query_count, query_percentage, query_onesen = word_count_features(query_search, article)
            features.append(query_count)
            features.append(query_percentage)
            features.append(query_onesen)

            # # word2vec sim (input - article)
            # features.append(old_calculate_similarity(query_text, article))
            # # word2vec sim (query - article)
            # features.append(old_calculate_similarity(query_search, article))

            # label
            features.append(row[COL_LABEL])
        except Exception:
            # on failure, pad the row with -1 and keep the label in the last column
            while len(features) < len(header):
                features.append(-1)
            features[len(features) - 1] = row[COL_LABEL]
        print(features)
        wr.writerow(features)
        print("=============== \n")
    csvfile.close()
def word_topics(model):
    lsa = LSA(model)
    print(lsa.word_topics().shape)

def document_topics(model):
    lsa = LSA(model)
    print(lsa.document_topics().shape)
def folding_in(data, model):
    td, idf, vocab = data
    # the last column is the held-out document to fold into the latent space
    d = td[:, -1]
    lsa = LSA(model)
    print(lsa.folding_in(d).shape)
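# For context: folding-in projects a held-out document into an existing LSA
# space without recomputing the SVD. This is a generic NumPy illustration of
# that idea, not the LSA.folding_in implementation exercised above; the toy
# matrices below are made-up.
import numpy as np

td = np.random.rand(6, 4)                # toy term-document matrix: 6 terms, 4 docs
k = 2                                    # number of latent dimensions to keep

u, s, vt = np.linalg.svd(td, full_matrices=False)
u_k, s_k = u[:, :k], s[:k]               # rank-k truncated factors

d = np.random.rand(6)                    # term vector of a new (held-out) document
d_hat = np.diag(1.0 / s_k) @ u_k.T @ d   # fold-in: d_hat = Sigma_k^-1 U_k^T d
print(d_hat.shape)                       # (k,)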
def train(data):
    td, idf, vocab = data
    # hold out the last document (used elsewhere for folding-in)
    td = td[:, :-1]
    lsa = LSA()
    # train with Z latent topics
    return lsa.train(td, Z=10)
def initialize(self, article):
    similarity_array = []
    # similarity_array.append(article)
    # one-line summary of the input article
    test = self.articleSummerization(article, 1)
    print('-------Summarized Title-------')
    print(test)

    sitesContainingArticle, scrapId = self.googleSearch(article)
    print('sites_length_after_google search', len(sitesContainingArticle))

    for index, url in enumerate(sitesContainingArticle):
        print('URL ', url, scrapId[index], '\n')
        raw_html = self.simple_get(url)  # full page content
        try:
            soup = BeautifulSoup(raw_html, 'html.parser')  # parse raw_html
        except Exception as e:
            print(e)
            return 0, []
        # drop <script> tags before extracting text
        _ = [s.extract() for s in soup('script')]
        soup_article = soup.find_all('div', {"class": scrapId[index]})
        article_string = ''
        for data in soup_article:
            article_string += data.text
        if not article_string == '':
            similarity_array.append(
                self.articleSummerization(article_string, 5))
        else:
            print('no article text extracted\n')

    mylsa = LSA()
    wmdinit = WordMoverDistance()
    length = len(similarity_array)
    if length == 0:
        return 0, sitesContainingArticle

    count = 0
    score_array = []
    while count < length:
        print('\n\n', similarity_array[count])
        # LSA similarity between the input article and the scraped summary
        lsa_similarity = mylsa.start([article + ' ' + article] + similarity_array, count + 1)
        # Word Mover's Distance between the scraped summary and the input article
        wmdinit.data_accept(similarity_array[count], article)
        wmddistance = wmdinit.model()
        print('word mover distance is', wmddistance)
        # combine both measures with a fuzzy scorer
        fuzzy = Fuzzy(lsa_similarity, wmddistance)
        score = fuzzy.get_score_data()
        # score = score / 10
        print('final score ', score)
        score_array.append(score)
        count = count + 1

    score_array = sorted(score_array, reverse=True)
    # average the two best scores, scale to a percentage, cap at 100
    return min(100, np.around(sum(score_array[:2]), decimals=2) * 100), sitesContainingArticle
def topic_labels(data, model, N=15):
    td, idf, vocab = data
    lsa = LSA(model)
    # map term indices back to words and print the top N terms per topic
    inv_vocab = inverse_vocab(vocab)
    print(lsa.topic_labels(inv_vocab, N))
def fetch_regular_features(
        corpus_dir_path: str,
        reset=False) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Fetch regular features such as probabilities, distances and syntactic
    patterns for each candidate of the given corpus.

    Params
    ------
    corpus_dir_path : str
        The corpus path
    reset : bool
        #TODOC

    Returns
    -------
    features : DataFrame
        Table of the candidates and their features.
    lsa_noun : DataFrame
        Table of the candidates and their noun vector.
    lsa_verb : DataFrame
        Table of the candidates and their verb vector.
    exps_lsa : DataFrame
        #TODO
    """
    # cache key derived from the corpus path
    corpus_id = encode(str.encode(corpus_dir_path), 'hex').decode() + '.pkl'

    get_features = utilities.drive_cached_func(measurer.get_features,
                                               'features' + corpus_id, reset)
    features = get_features(corpus_dir_path)

    get_pattern_frequency = utilities.drive_cached_func(
        measurer.get_candidats_pattern_frequency, 'patterns' + corpus_id, reset)
    patterns = get_pattern_frequency(corpus_dir_path)
    features = pd.merge(features, patterns, how='left',
                        left_index=True, right_index=True).fillna(0)

    # word-level LSA vectors for the corpus
    tmp = LSA(corpus_dir_path)
    lsa = pd.DataFrame(tmp.lsa, index=tmp.word_id)
    lsa.columns.name = 'WORD'
    exps_lsa = utilities.drive_cached_func(tmp, 'exps' + corpus_id, reset)(features)

    # LSA vector of each candidate's noun and verb (the last 100 columns)
    lsa_noun = pd.merge(features, lsa, how='left', left_on='NOUN',
                        right_index=True).iloc[:, -100:].fillna(0)
    lsa_verb = pd.merge(features, lsa, how='left', left_on='VERB',
                        right_index=True).iloc[:, -100:].fillna(0)

    # L1 norms of the verb and noun vectors
    len_v = pd.DataFrame(lsa_verb.abs().sum(axis=1))
    len_v.columns = ['len_v']
    len_n = pd.DataFrame(lsa_noun.abs().sum(axis=1))
    len_n.columns = ['len_n']
    features = pd.merge(features, len_v, left_index=True, right_index=True)
    features = pd.merge(features, len_n, left_index=True, right_index=True)

    # cosine similarity between each expression vector and its noun vector
    dist_noun = pd.DataFrame(utilities.cos_similarities(
        exps_lsa.loc(axis=0)[lsa_noun.index].fillna(0).sort_index().values,
        lsa_noun.sort_index().values),
        index=lsa_noun.sort_index().index)
    dist_noun.columns = ['dist_noun']
    features = pd.merge(features, dist_noun, left_index=True, right_index=True)

    # TODO replace loc with reindex
    # cosine similarity between each expression vector and its verb vector
    dist_verb = pd.DataFrame(utilities.cos_similarities(
        exps_lsa.loc(axis=0)[lsa_verb.index].fillna(0).sort_index().values,
        lsa_verb.sort_index().values),
        index=lsa_verb.sort_index().index)
    dist_verb.columns = ['dist_verb']
    features = pd.merge(features, dist_verb, left_index=True, right_index=True)

    features = features.assign(
        dist_relative=features['dist_noun'] /
        (features['dist_noun'] + features['dist_verb'])).fillna(0.5)

    return features, lsa_noun, lsa_verb, exps_lsa
from lsa import LSA

if __name__ == '__main__':
    documents = [line.rstrip() for line in open('temp/all_book_titles.txt')]
    # generic stopword list extended with domain-specific terms from book titles
    stopwords = set(line.rstrip() for line in open('temp/stopwords.txt')).union({
        'introduction', 'edition', 'series', 'application', 'approach',
        'card', 'access', 'package', 'plus', 'etext', 'brief', 'vol',
        'fundamental', 'guide', 'essential', 'printed', 'third', 'second',
        'fourth'
    })
    model = LSA(stopwords)
    model.fit(documents)
    model.transform_plot()