import math

import nltk
import numpy as np
import pandas as pd
from numpy.linalg import svd as singular_value_decomposition  # assumed alias for the SVD call below

# textClean and DocVector are the project's own preprocessing / vectorization helpers,
# imported elsewhere in the notebook.


def lsa_text_extraction(textdoc, smooth=0.4, MIN_DIMENSIONS=3, REDUCTION_RATIO=1 / 1, topn=5):
    """
    Extract the top-n most representative sentences from a document using LSA.

    REDUCTION_RATIO: used to reduce computation cost by limiting the diagonal size;
        1 keeps the original diagonal size, 0.4 keeps only 0.4 * the original diagonal size.
    smooth: a factor applied during matrix normalization; a small value might cause
        overfitting and a large value might cause underfitting.
    """
    ''' document to sentences '''
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    document = tokenizer.tokenize(textdoc)

    ''' generate term freq matrix '''
    assert 0.0 <= smooth < 1.0
    preprocessed_text = textClean.pipeline(document, multi_gram=[1], lower_case=True, deacc=False,
                                           encoding='utf8', errors='strict', stem_lemma='lemma',
                                           tag_drop=[], nltk_stop=True, stop_word_list=[],
                                           check_numbers=False, word_length=2, remove_consecutives=True)
    dictionary = DocVector.generate_corpus_dict(preprocessed_text, no_below=2, no_above=0.5, keep_n=100000)
    doc_vec = DocVector.create_document_vector(preprocessed_text, dictionary)
    tfmatrix = DocVector.get_vocab_matrix(doc_vec, dictionary)
    matrix_copy = tfmatrix.values.T

    '''
    Compute TF metrics for each sentence (column) in the matrix and normalize the tf weights
    of all terms occurring in a document by the maximum tf in that document, according to
    ntf_{t,d} = a + (1 - a) \frac{tf_{t,d}}{tf_{max}(d)}.
    The smoothing term a damps the contribution of the second term, which may be viewed as
    a scaling down of tf by the largest tf value in d.
    '''
    max_word_frequencies = np.max(matrix_copy, axis=0)
    rows, cols = matrix_copy.shape
    for row in range(rows):
        for col in range(cols):
            max_word_frequency = max_word_frequencies[col]
            if max_word_frequency != 0:
                frequency = matrix_copy[row, col] / max_word_frequency
                matrix_copy[row, col] = smooth + (1.0 - smooth) * frequency

    ''' get ranks '''
    u, sigma, v_matrix = singular_value_decomposition(matrix_copy, full_matrices=False)
    assert len(sigma) == v_matrix.shape[0]
    dimensions = max(MIN_DIMENSIONS, int(len(sigma) * REDUCTION_RATIO))
    powered_sigma = tuple(s**2 if i < dimensions else 0.0 for i, s in enumerate(sigma))
    ranks = []
    for column_vector in v_matrix.T:
        rank = sum(s * v**2 for s, v in zip(powered_sigma, column_vector))
        ranks.append(math.sqrt(rank))

    ''' output result '''
    percentile_list = pd.DataFrame({
        'sentence': document,
        'rank': ranks,
    }).sort_values(by='rank', ascending=False)
    output_sentence = [i for i in percentile_list.head(topn)['sentence']]
    return output_sentence
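# As a quick illustration, the function can be called on any raw review text. The cell below is a
# minimal usage sketch, not part of the original notebook: `sample_review` is a placeholder string,
# and the NLTK punkt tokenizer must be available (download it once with `nltk.download('punkt')`).
# Longer texts with recurring terms work best, since the dictionary drops words that appear in
# fewer than two sentences.

# In[ ]:

nltk.download('punkt')

sample_review = (
    "The film opens with a strong performance. "
    "The performance carries the film through its slower scenes. "
    "Later scenes return to the opening location. "
    "Overall the film rewards patient viewers."
)

# Return the two most representative sentences according to the LSA ranks.
top_sentences = lsa_text_extraction(sample_review, smooth=0.4, topn=2)
print(top_sentences)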
# ## Model Development

# ### 1. Split Dataset

# In[3]:

preprocessed_tokens = textClean.pipeline(data['review'][0:100].to_list(), multi_gram=[1], lower_case=True,
                                         deacc=False, encoding='utf8', errors='strict', stem_lemma='lemma',
                                         tag_drop=[], nltk_stop=True, stop_word_list=['movie'],
                                         check_numbers=False, word_length=3, remove_consecutives=True)
dictionary = DocVector.generate_corpus_dict(preprocessed_tokens, no_below=1, no_above=0.5, keep_n=100000)
bow_corpus = DocVector.create_document_vector(preprocessed_tokens, dictionary)
my_df = DocVector.get_vocab_matrix(bow_corpus, dictionary)
my_df.head(3)

# In[4]:

X_train, X_test, y_train, y_test = train_test_split(my_df.loc[:99, :], data['label'][0:100],
                                                    test_size=0.33, random_state=11)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=11)

# ### 2. Feature Engineering

# * Apart from the vectorization process, additional features can be created from the dataset (see the sketch below), such as:
#     * Length of comments
#     * Number of entities (using Named-Entity Recognition (NER))
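# The cell below is one possible way to build those two features; it is not part of the original
# notebook. It assumes spaCy and its small English model are installed
# (`python -m spacy download en_core_web_sm`); any other NER tool could be substituted.

# In[ ]:

import spacy

nlp = spacy.load('en_core_web_sm')


def extra_features(reviews):
    """Return a DataFrame with comment length and named-entity counts (illustrative only)."""
    lengths = [len(text.split()) for text in reviews]           # length of comments (word count)
    entity_counts = [len(nlp(text).ents) for text in reviews]   # number of entities found by NER
    return pd.DataFrame({'comment_length': lengths, 'n_entities': entity_counts})


# Example: append the extra features to the bag-of-words matrix for the same 100 reviews.
extra_df = extra_features(data['review'][0:100].to_list())
my_df = pd.concat([my_df, extra_df], axis=1)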
    def transform(self, X, y=None):
        # Map new documents onto the vocabulary built from the training data;
        # `dictionary` is the previously generated vocabulary dictionary.
        bow_corpus = DocVector.create_document_vector(X, dictionary)
        my_df = DocVector.get_vocab_matrix(bow_corpus, dictionary)
        return my_df
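# For completeness, a `transform` method like the one above could live in a small
# scikit-learn-compatible wrapper such as the sketch below. The class name and the `fit` logic
# are assumptions for illustration, not the notebook's original definition; the wrapper simply
# builds the dictionary in `fit` and reuses it in `transform`.

# In[ ]:

from sklearn.base import BaseEstimator, TransformerMixin


class BowVectorizer(BaseEstimator, TransformerMixin):
    """Illustrative wrapper around DocVector for use inside a scikit-learn Pipeline."""

    def __init__(self, no_below=1, no_above=0.5, keep_n=100000):
        self.no_below = no_below
        self.no_above = no_above
        self.keep_n = keep_n

    def fit(self, X, y=None):
        # X is a list of token lists, as produced by textClean.pipeline
        self.dictionary_ = DocVector.generate_corpus_dict(X, no_below=self.no_below,
                                                          no_above=self.no_above, keep_n=self.keep_n)
        return self

    def transform(self, X, y=None):
        bow_corpus = DocVector.create_document_vector(X, self.dictionary_)
        return DocVector.get_vocab_matrix(bow_corpus, self.dictionary_)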
# In[29]:

no_topics = 10
lda_allbow, bow_corpus, dictionary = lda.fit_lda(processed_letter_df['tokens'].to_list(), num_topics=no_topics)
lda.lda_topics(lda_allbow)

# ### Word Frequency

# In[8]:

dictionary = DocVector.generate_corpus_dict(processed_letter_df['tokens'].to_list(), no_below=1,
                                            no_above=1, keep_n=100000)
bow_corpus = DocVector.create_document_vector(processed_letter_df['tokens'].to_list(), dictionary)
my_df = DocVector.get_vocab_matrix(bow_corpus, dictionary)

# In[31]:

test = my_df[top_10_freq_words]
test.index = yearid
test = test.T

# In[32]:

snsplt.plot_heatmap(test, x='year', y='count', title='Top 10 words heatmap')
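# `top_10_freq_words` and `yearid` are defined in cells not shown here. One plausible way to derive
# the top-10 list from the document-term matrix is sketched below; this is an assumption for
# illustration, not the notebook's original code.

# In[ ]:

# Sum each term's counts across all documents and keep the ten most frequent terms.
word_totals = my_df.sum(axis=0).sort_values(ascending=False)
top_10_freq_words = word_totals.head(10).index.to_list()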