def clean_tweet(tweet):
    # Keep only the fields we need from the raw Twitter API payload.
    tweet_clean = {key: tweet[key] for key in [
        'created_at', 'id', 'id_str', 'in_reply_to_user_id',
        'in_reply_to_user_id_str', 'in_reply_to_screen_name',
        'retweet_count', 'favorite_count', 'lang']}

    # Prefer the untruncated text when the API provides it.
    if 'full_text' in tweet:
        tweet_clean['text'] = tweet['full_text']
    elif 'extended_tweet' in tweet:
        tweet_clean['text'] = tweet['extended_tweet']['full_text']
    else:
        tweet_clean['text'] = tweet['text']

    if 'quote_count' in tweet:
        tweet_clean['quote_count'] = tweet['quote_count']
    if 'reply_count' in tweet:
        tweet_clean['reply_count'] = tweet['reply_count']

    tweet_clean['datetime'] = datetime.fromtimestamp(parser.parse(tweet['created_at']).timestamp())

    if 'type' not in tweet:
        tweet_clean['type'] = tweet_type(tweet)
    if 'tweet_user_id' not in tweet:
        tweet_clean['tweet_user_id'] = tweet_creator(tweet)['id']
    if 'tweet_user_id_str' not in tweet:
        tweet_clean['tweet_user_id_str'] = tweet_creator(tweet)['id_str']
    if 'tweet_user_screen_name' not in tweet:
        tweet_clean['tweet_user_screen_name'] = tweet_creator(tweet)['screen_name']

    tweet_clean['timestamp'] = parser.parse(tweet['created_at']).timestamp()
    tweet_clean['text_processed'] = preprocess_text(tweet_clean['text'])

    # Entity counts are capped at 127 (small enough for a signed one-byte column);
    # parse() may return None for an absent entity, hence the `or []` guard.
    text = tweetp.parse(tweet_clean['text'])
    tweet_clean['emojis'] = min(len(text.emojis or []), 127)
    tweet_clean['hashtags'] = min(len(text.hashtags or []), 127)
    tweet_clean['urls'] = min(len(text.urls or []), 127)
    tweet_clean['mentions'] = min(len(text.mentions or []), 127)
    return tweet_clean
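# Usage sketch (hypothetical payload): the keys mirror the Twitter API v1.1 Tweet
# object that clean_tweet() above expects; `tweet_type` and `tweet_creator` are
# assumed to be the surrounding project's own helpers.
sample_tweet = {
    'created_at': 'Wed Oct 10 20:19:24 +0000 2018',
    'id': 1050118621198921728,
    'id_str': '1050118621198921728',
    'in_reply_to_user_id': None,
    'in_reply_to_user_id_str': None,
    'in_reply_to_screen_name': None,
    'retweet_count': 12,
    'favorite_count': 30,
    'lang': 'en',
    'text': 'Counting #emojis and mentions with @TwitterAPI https://t.co/example',
    'user': {'id': 6253282, 'id_str': '6253282', 'screen_name': 'TwitterAPI'},
}
cleaned = clean_tweet(sample_tweet)
print(cleaned['text_processed'], cleaned['hashtags'], cleaned['mentions'])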
def test_preprocess_text(
        self, mock_lemmatize_word: MagicMock, mock_substitute_token: MagicMock,
        mock_remove_name: MagicMock, mock_remove_stopword: MagicMock,
        mock_normalize_unicode: MagicMock, mock_remove_whitespace: MagicMock,
        mock_remove_punctuation: MagicMock, mock_remove_special_character: MagicMock,
        mock_check_spelling: MagicMock, mock_expand_contraction: MagicMock,
        mock_remove_itemized_bullet_and_numbering: MagicMock,
        mock_remove_phone_number: MagicMock, mock_remove_email: MagicMock,
        mock_remove_url: MagicMock, mock_to_lower: MagicMock):
    # Setup
    input_text = 'a test'
    # Actual call
    _ = preprocess_text(input_text)
    # Asserts: every step of the default pipeline should run exactly once
    mock_to_lower.assert_called_once()
    mock_remove_url.assert_called_once()
    mock_remove_email.assert_called_once()
    mock_remove_phone_number.assert_called_once()
    mock_remove_itemized_bullet_and_numbering.assert_called_once()
    mock_expand_contraction.assert_called_once()
    mock_check_spelling.assert_called_once()
    mock_remove_special_character.assert_called_once()
    mock_remove_punctuation.assert_called_once()
    mock_remove_whitespace.assert_called_once()
    mock_normalize_unicode.assert_called_once()
    mock_remove_stopword.assert_called_once()
    mock_remove_name.assert_called_once()
    mock_substitute_token.assert_called_once()
    mock_lemmatize_word.assert_called_once()
def clean_text(text):
    # Cleaning the text: strip language tags, then run the preprocessing pipeline
    text = re.sub('@en', '', text)
    text = re.sub('@es', '', text)
    text = re.sub('@fr', '', text)
    text = preprocess_text(text, preprocess_functions)
    return text
def test_preprocess_text_integration_a(self):
    # Setup
    input_text = 'Helllo, I am John Doe!!! My email is [email protected]. Please visit my website ' \
                 'www.johndoe.com '
    expected_output = 'hello email please visit website'
    # Actual call
    output_text = preprocess_text(input_text)
    # Asserts
    self.assertEqual(output_text, expected_output)
def test_preprocess_text_integration_custom(self):
    # Setup
    input_text = 'Helllo, I am John Doe!!! My email is [email protected]. Visit my website www.johndoe.com '
    expected_output = 'helllo i am john doe my email is visit my website '
    # Actual call
    pipeline_functions = [to_lower, remove_url, remove_email, remove_punctuation]
    output_text = preprocess_text(input_text, pipeline_functions)
    # Asserts
    self.assertEqual(output_text, expected_output)
def test_preprocess_text_custom(self, mock_remove_phone_number: MagicMock,
                                mock_remove_email: MagicMock, mock_remove_url: MagicMock,
                                mock_to_lower: MagicMock):
    # Setup
    input_text = 'a test'
    # Actual call
    pipeline_functions = [mock_to_lower, mock_remove_url, mock_remove_email, mock_remove_phone_number]
    _ = preprocess_text(input_text, pipeline_functions)
    # Asserts
    mock_to_lower.assert_called_once()
    mock_remove_url.assert_called_once()
    mock_remove_email.assert_called_once()
    mock_remove_phone_number.assert_called_once()
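# Usage sketch of the custom-pipeline form exercised by the tests above, assuming
# the individual steps are importable from the same text_preprocessing package
# used throughout these snippets.
from text_preprocessing import preprocess_text, to_lower, remove_url, remove_email, remove_punctuation

custom_pipeline = [to_lower, remove_url, remove_email, remove_punctuation]
print(preprocess_text('Visit www.example.com NOW!!!', custom_pipeline))
# expected to come out roughly as: 'visit now'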
def cleaning(segment):
    print("entered cleaning function")
    preprocess_functions = [
        remove_whitespace,
        remove_special_character,
        normalize_unicode,
        expand_contraction,
        remove_name,
    ]
    segment = preprocess_text(segment, preprocess_functions)
    return segment
def clean_list(list_):
    list_clean = list_.copy()
    user = None
    # Flatten the embedded user object into plain fields and return it separately.
    if 'user' in list_clean:
        user = list_clean['user'].copy()
        list_clean['user_id_str'] = list_clean['user']['id_str']
        list_clean['user_screen_name'] = list_clean['user']['screen_name']
        del list_clean['user']
    list_clean['created_at'] = parser.parse(list_clean['created_at']).replace(tzinfo=None)
    list_clean['text_processed'] = preprocess_text(' '.join([list_clean['name'], list_clean['description']]))
    return list_clean, user
def preprocess_and_save_data(data_file_name, preprocessed_data_file_name):
    print("Loading data")
    sentences = load_data(MOTHER_INPUT_FILE)[:8000]
    print("Data loaded")
    with open(data_file_name, 'wb') as f:
        cPickle.dump(sentences, f)
    print("Data saved")
    sentences = preprocess_text(sentences)
    with open(preprocessed_data_file_name, 'wb') as f:
        cPickle.dump(sentences, f)
    print("Preprocessed data saved")
def test_preprocess_text():
    text = 'The Quick Brown Fox Jumped Over the Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, lowercase=True) == 'the quick brown fox jumped over the lazy dog.'

    text = 'The Quick Brown Fox Jumped Over. The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, stopwords=True) == 'Quick Brown Fox Jumped . Lazy dog .'

    text = 'The Quick Brown Fox Jumped Over. The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, stopwords=['dog', 'fox']) == 'The Quick Brown Jumped Over . The Lazy .'

    text = 'The Quick Brown Fox, Jumped Over! - The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, replace_punctuation=True
    ) == 'The Quick Brown Fox Jumped Over The Lazy dog'

    text = 'The Quick Brown Fox, Jumped Over! - The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, remove_punctuation=True
    ) == 'The Quick Brown Fox Jumped Over The Lazy dog'
# Authenticate with the Twitter API via tweepy
auth = tw.OAuthHandler(apikey, s_apikey)
auth.set_access_token(access, s_access)
api = tw.API(auth, wait_on_rate_limit=True)

search_term = input("What would you like to search on Twitter? ")
search_results = tw.Cursor(api.search, q=search_term, lang="en").items(1000)
raw_tweets = [tweet.text for tweet in search_results]

pfuncts = [
    to_lower,
    remove_number,
    remove_punctuation,
    remove_stopword,
    lemmatize_word
]

# Clean, preprocess, tokenize, then score sentiment with TextBlob
clean1 = p.clean(str(raw_tweets))
clean2 = preprocess_text(clean1, pfuncts)
tokens = word_tokenize(clean2)
tokenblob = tb(str(tokens))
tokensent = tokenblob.sentiment.polarity

print("Results")
print("Corpus Size: " + str(len(tokens)))
if tokensent > 0:
    print("Sentiment Score: " + str(tokensent) + " (Positive)\n")
elif tokensent == 0:
    print("Sentiment Score: " + str(tokensent) + " (Neutral)\n")
else:
    print("Sentiment Score: " + str(tokensent) + " (Negative)\n")
def search_engine_3(encoded_query, inverted_idx2, squared_tfidf_per_document, uncoded_query):
    """Uses search engine 2 to get the top 10 documents with the highest similarity to the
    query, then prompts the user to specify new info related to the other book fields
    (e.g. bookTitle, setting, etc.), adjusts the score based on the new info and returns
    the top 3 books according to the new score.

    Args:
        encoded_query (list): a textual query, encoded in integers
        inverted_idx2 (dict): the inverted index with tf-idf scores
        squared_tfidf_per_document (dict): |d| of the cosine similarity formula (before sqrt)
        uncoded_query (list): the same textual query, not encoded in integers

    Returns:
        [dict]: the top k documents ranked by the new adjusted score
    """
    # apply the second search engine (plot only)
    plot_result = search_engine_2(encoded_query, inverted_idx2, squared_tfidf_per_document, 10)
    additional_info = []

    # maps each additional field to its position in the .tsv files
    field_to_idx = {
        'booktitle': 0,
        'bookseries': 1,
        'bookauthors': 2,
        'publishingdate': 8,
        'characters': 9,
        'setting': 10
    }

    # prompts the user to insert additional information
    while True:
        try:
            info = input(
                'please insert additional_info:\n Insert field name followed by ":" and the value\n'
                ' Type "end" when you are done\n'
            ).lower()
            if info == 'end':
                break
            info = info.split(':')
            if info[0] in field_to_idx:
                additional_info.append(info)
            else:
                print('field not found, please try again\n')
        except Exception:
            print('field not found, please try again\n')

    final_score = {}
    # iterates over each book from the second search engine output
    for doc, score in plot_result:
        total_score = score
        with open('.\\tsvs\\article_' + str(doc) + '.tsv', 'r', encoding='utf-8') as f:
            all_fields = f.readlines()[2].split('\t')
            all_fields = [preprocess_text(field) for field in all_fields]
        # iterates over each additional info and, if it matches, adjusts the score
        for item in additional_info:
            if item[1] in all_fields[field_to_idx[item[0]]]:
                total_score += total_score * 1 / 2
        # final score for each document
        final_score[doc] = total_score

    # return the top 3 documents based on the new scoring
    return get_top_k(final_score, 3)
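# Worked sketch of the adjustment above (hypothetical helper, not part of the source):
# every additional field that matches adds half of the current score, so each match
# multiplies the running score by 1.5; for example a cosine score of 0.8 with two
# matching fields becomes 0.8 * 1.5 * 1.5 = 1.8.
def adjusted_score(plot_score, n_matching_fields):
    total = plot_score
    for _ in range(n_matching_fields):
        total += total * 1 / 2
    return total

print(adjusted_score(0.8, 2))  # ~1.8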
encoded_files_folder = "\\encoded_files\\"
# create_inverted_idx(cwd, encoded_files_folder)
# create_inverted_idx_2(cwd, encoded_files_folder)

# Load the prebuilt indexes and vocabulary from disk
with open('inverted_idx.pickle', 'rb') as h:
    inverted_idx = pickle.load(h)
with open('inverted_idx2.pickle', 'rb') as h:
    inverted_idx2 = pickle.load(h)
with open('vocabulary.pickle', 'rb') as q:
    vocabulary = pickle.load(q)

# store_squared_tfidf_per_document(inverted_idx2)
with open('squared_tfidf_per_document.pickle', "rb") as q:
    squared_tfidf_per_document = pickle.load(q)

query = input('enter your query:\n')
preprocessed_query = preprocess_text(query)
encoded_query = encode_query(preprocessed_query, vocabulary)

print_search_engine_result(search_engine(encoded_query, inverted_idx))
print_search_engine_2_result(
    search_engine_2(encoded_query, inverted_idx2, squared_tfidf_per_document, 5))
print_search_engine_2_result(
    search_engine_3(encoded_query, inverted_idx2, squared_tfidf_per_document, preprocessed_query))
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

from poems import poems
from text_preprocessing import preprocess_text

processed_poems = [preprocess_text(poem) for poem in poems]

vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_poems)

corpus_index = [f"Poem {i+1}" for i in range(len(poems))]
# Note: get_feature_names() was renamed to get_feature_names_out() in scikit-learn 1.0
# and removed in 1.2; the old name only works on older scikit-learn versions.
feature_names = vectorizer.get_feature_names()

try:
    df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(),
                             index=feature_names, columns=corpus_index)
    print(df_tf_idf)
except Exception:
    pass
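# Follow-up sketch (not in the original script): assuming the DataFrame above was
# built successfully, idxmax() returns the highest-scoring tf-idf term for each poem,
# since terms are on the index and poems are the columns.
print(df_tf_idf.idxmax())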
        titles.append(article.title)        # the title of the article
        texts.append(article.text)          # the entire text of the article
        summaries.append(article.summary)   # the summary of the article
        # print(article.keywords)           # the keywords of the article
        counter += 1
        if counter >= articles_examined:
            break
    except newspaper.article.ArticleException:
        continue

######################### PREPARE REUTERS CORPUS AND TRAIN CLASSIFIER #########################
preprocessed_corpus = []
for fid in reuters.fileids():
    preprocessed_corpus.append(preprocess_text(reuters.words(fid)))

# creating the bag of words model from the preprocessed corpus
bag_of_words_creator = CountVectorizer()
bag_of_words = bag_of_words_creator.fit_transform(preprocessed_corpus)

# creating the tf-idf model
tfidf_creator = TfidfVectorizer(min_df=0.2)
tfidf = tfidf_creator.fit_transform(preprocessed_corpus)

documents = [(list(reuters.words(fileid)), category)
             for category in reuters.categories()
             for fileid in reuters.fileids(category)]
random.shuffle(documents)
def pretrain_actorCritic():
    if USE_SAVED_PREPROCESSED_INPUT:
        # sentences = pickle.load(open(PRETRAINING_PREPROCESSED_INPUT_FILE, 'r'))[:5000]
        sentences = pickle.load(open(PRETRAINING_PREPROCESSED_INPUT_FILE, 'rb'))[:5000]
    else:
        print("Loading data")
        sentences = load_data(PRETRAINING_DATA_FILE)
        print("Data loaded")
        sentences = preprocess_text(sentences)[:5000]

    print("shape of sentences", sentences.shape)

    print("Training w2v model")
    w2v_model = train_w2v_model(sentences)
    print("w2v model trained")

    token_sequences, output_sequences, token_to_index_dic = tokenize_and_pad_sentences(sentences)
    index_to_word_dic = get_index_to_word_dic(token_to_index_dic)
    token_sequences = np.asarray(token_sequences)
    output_sequences = np.asarray(output_sequences)
    print("input shape", token_sequences.shape)
    # token_sequences = token_sequences[:1000, :]
    # output_sequences[:1000, :]
    output_sequences = [one_hot(seq, len(token_to_index_dic)) for seq in output_sequences]
    print("Tokenization done. %d sequences" % len(token_sequences), "shape ", token_sequences.shape)
    # token_to_index_dic = get_word_to_index_dic(w2v_model, token_sequences)
    print("preprocessing done")

    train_x, train_y, val_x, val_y, test_x, test_y = get_train_val_test_data(token_sequences, output_sequences)

    autoencoder = Autoencoder(w2v_model, token_to_index_dic)
    print("Creating NN model")
    autoencoder.create_nn_model()
    print("NN model created")

    if LOAD_WEIGHTS:
        print("Loading saved weights from %s" % PRETRAINING_ACTOR_WEIGHTS_FILE)
        autoencoder.load_weights(PRETRAINING_ACTOR_WEIGHTS_FILE)
    if TRAIN_ACTOR:
        print("Training actor")
        autoencoder.train(train_x, train_y, val_x, val_y)
    if SAVE_WEIGHTS:
        print("Saving actor weights")
        autoencoder.save(PRETRAINING_ACTOR_WEIGHTS_FILE)

    print("Predicting using actor")
    output = autoencoder.predict(train_x)
    for seq in output:
        print(index_to_sentence(index_to_word_dic, [np.argmax(ele) for ele in seq]))

    print("Initializing actorCritic")
    actor_critic = ActorCriticAutoEncoder(w2v_model=w2v_model,
                                          token_to_index_dic=token_to_index_dic,
                                          actor=autoencoder.autoencoder)
    print("Creating critic model")
    actor_critic.create_critic_model()
    print("Critic model created")

    # The critic learns to map the actor's outputs back to the original sequences
    critic_train_x = output
    critic_train_y = [one_hot(seq, len(token_to_index_dic)) for seq in train_x]
    if TRAIN_CRITIC:
        print("Training critic")
        actor_critic.train_critic(critic_train_x, critic_train_y)
        print("Critic trained")
    if SAVE_WEIGHTS:
        print("Saving critic")
        actor_critic.save_critic(PRETRAINING_CRITIC_MODEL_FILE)
        print("Critic saved")