Example #1
def clean_tweet(tweet):
    tweet_clean = {key: tweet[key] for key in
                   ['created_at', 'id', 'id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str',
                    'in_reply_to_screen_name',
                    'retweet_count', 'favorite_count', 'lang']}
    if 'full_text' in tweet.keys():
        tweet_clean['text'] = tweet['full_text']
    elif 'extended_tweet' in tweet.keys():
        tweet_clean['text'] = tweet['extended_tweet']['full_text']
    else:
        tweet_clean['text'] = tweet['text']
    if 'quote_count' in tweet.keys(): tweet_clean['quote_count'] = tweet['quote_count']
    if 'reply_count' in tweet.keys(): tweet_clean['reply_count'] = tweet['reply_count']
    tweet_clean['datetime'] = datetime.fromtimestamp(parser.parse(tweet['created_at']).timestamp())
    if 'type' not in tweet.keys(): tweet_clean['type'] = tweet_type(tweet)
    if 'tweet_user_id' not in tweet.keys(): tweet_clean['tweet_user_id'] = tweet_creator(tweet)['id']
    if 'tweet_user_id_str' not in tweet.keys(): tweet_clean['tweet_user_id_str'] = tweet_creator(tweet)['id_str']
    if 'tweet_user_screen_name' not in tweet.keys(): tweet_clean['tweet_user_screen_name'] = tweet_creator(tweet)['screen_name']

    tweet_clean['timestamp'] = parser.parse(tweet['created_at']).timestamp()

    tweet_clean['text_processed'] = preprocess_text(tweet_clean['text'])
    text = tweetp.parse(tweet_clean['text'])
    tweet_clean['emojis'] = min(len(text.emojis), 127)
    tweet_clean['hashtags'] = min(len(text.hashtags), 127)
    tweet_clean['urls'] = min(len(text.urls), 127)
    tweet_clean['mentions'] = min(len(text.mentions), 127)
    return tweet_clean
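The created_at handling above can be exercised on its own. A minimal sketch, assuming only python-dateutil and Twitter's classic created_at string format (the sample value is made up):

from datetime import datetime
from dateutil import parser

created_at = "Wed Oct 10 20:19:24 +0000 2018"  # made-up sample in Twitter's format
ts = parser.parse(created_at).timestamp()      # what clean_tweet stores as 'timestamp'
print(ts, datetime.fromtimestamp(ts))          # 'datetime' is the same instant as a naive local datetime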
 def test_preprocess_text(
         self, mock_lemmatize_word: MagicMock,
         mock_substitute_token: MagicMock, mock_remove_name: MagicMock,
         mock_remove_stopword: MagicMock, mock_normalize_unicode: MagicMock,
         mock_remove_whitespace: MagicMock,
         mock_remove_punctuation: MagicMock,
         mock_remove_special_character: MagicMock,
         mock_check_spelling: MagicMock, mock_expand_contraction: MagicMock,
         mock_remove_itemized_bullet_and_numbering: MagicMock,
         mock_remove_phone_number: MagicMock, mock_remove_email: MagicMock,
         mock_remove_url: MagicMock, mock_to_lower: MagicMock):
     # Setup
     input_text = 'a test'
     # Actual call
     _ = preprocess_text(input_text)
     # Asserts
     mock_to_lower.assert_called_once()
     mock_remove_url.assert_called_once()
     mock_remove_email.assert_called_once()
     mock_remove_phone_number.assert_called_once()
     mock_remove_itemized_bullet_and_numbering.assert_called_once()
     mock_expand_contraction.assert_called_once()
     mock_check_spelling.assert_called_once()
     mock_remove_special_character.assert_called_once()
     mock_remove_punctuation.assert_called_once()
     mock_remove_whitespace.assert_called_once()
     mock_normalize_unicode.assert_called_once()
     mock_remove_stopword.assert_called_once()
     mock_remove_name.assert_called_once()
     mock_substitute_token.assert_called_once()
     mock_lemmatize_word.assert_called_once()
Example #3
def clean_text(text):
    # Cleaning the text
    text = re.sub('@en', '', text)
    text = re.sub('@es', '', text)
    text = re.sub('@fr', '', text)
    text = preprocess_text(text, preprocess_functions)

    return text
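A small sketch of the tag stripping alone, treating '@en', '@es' and '@fr' as language markers (an assumption about the data) and using a made-up sample string:

import re

sample = 'the quick brown fox@en'
for tag in ('@en', '@es', '@fr'):
    sample = re.sub(tag, '', sample)
print(sample)  # -> 'the quick brown fox'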
 def test_preprocess_text_integration_a(self):
     # Setup
     input_text = 'Helllo, I am John Doe!!!   My email is [email protected]. Please visit my website ' \
                  'www.johndoe.com '
     expected_output = 'hello email please visit website'
     # Actual call
     output_text = preprocess_text(input_text)
     # Asserts
     self.assertEqual(output_text, expected_output)
 def test_preprocess_text_integration_custom(self):
     # Setup
     input_text = 'Helllo, I am John Doe!!! My email is [email protected]. Visit my website www.johndoe.com '
     expected_output = 'helllo i am john doe my email is  visit my website  '
     # Actual call
     pipeline_functions = [to_lower, remove_url, remove_email, remove_punctuation]
     output_text = preprocess_text(input_text, pipeline_functions)
     # Asserts
     self.assertEqual(output_text, expected_output)
 def test_preprocess_text_custom(self,
                                 mock_remove_phone_number: MagicMock, mock_remove_email: MagicMock,
                                 mock_remove_url: MagicMock, mock_to_lower: MagicMock):
     # Setup
     input_text = 'a test'
     # Actual call
     pipeline_functions = [mock_to_lower, mock_remove_url, mock_remove_email, mock_remove_phone_number]
     _ = preprocess_text(input_text, pipeline_functions)
     # Asserts
     mock_to_lower.assert_called_once()
     mock_remove_url.assert_called_once()
     mock_remove_email.assert_called_once()
     mock_remove_phone_number.assert_called_once()
def cleaning(segment):
    print("entered cleaning function")
    preprocess_functions = [
        remove_whitespace,
        remove_special_character,
        normalize_unicode,
        expand_contraction,
        remove_name,
    ]

    segment = preprocess_text(segment, preprocess_functions)

    return segment
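A minimal usage sketch for cleaning(), assuming the pipeline functions are importable from the text_preprocessing package used in the other examples; the sample segment is made up and the exact output depends on how each step is implemented:

from text_preprocessing import (preprocess_text, remove_whitespace,
                                remove_special_character, normalize_unicode,
                                expand_contraction, remove_name)

segment = "I'm   reading   the café menu"  # made-up input
print(cleaning(segment))                   # output depends on the package's implementations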
Example #8
def clean_list(list_):
    list_clean = list_.copy()
    user = None
    if 'user' in list_clean:
        user = list_clean['user'].copy()
        list_clean['user_id_str'] = list_clean['user']['id_str']
        list_clean['user_screen_name'] = list_clean['user']['screen_name']
        del list_clean['user']

    list_clean['created_at'] = parser.parse(list_clean['created_at']).replace(tzinfo=None)
    list_clean['text_processed'] = preprocess_text(' '.join([list_clean['name'], list_clean['description']]))

    return list_clean, user
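A made-up usage sketch for clean_list(), assuming dateutil's parser and preprocess_text are in scope as they are in the function body:

sample_list = {
    'name': 'Data Science',
    'description': 'People tweeting about data science',
    'created_at': 'Mon Nov 29 21:18:15 +0000 2010',
    'user': {'id_str': '12345', 'screen_name': 'example_user'},
}
list_clean, user = clean_list(sample_list)
print(list_clean['user_screen_name'], list_clean['created_at'])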
Example #9
def preprocess_and_save_data(data_file_name, preprocessed_data_file_name):
    print("Loading data")
    sentences = load_data(MOTHER_INPUT_FILE)[:8000]
    print("Data loaded")

    with open(data_file_name, 'wb') as f:
        cPickle.dump(sentences, f)
    print("Data saved")

    sentences = preprocess_text(sentences)

    with open(preprocessed_data_file_name, 'wb') as f:
        cPickle.dump(sentences, f)
    print("Preprocessed data saved")
Example #10
def test_preprocess_text():
    text = 'The Quick Brown Fox Jumped Over the Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text,
        lowercase=True) == 'the quick brown fox jumped over the lazy dog.'

    text = 'The Quick Brown Fox Jumped Over. The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, stopwords=True) == 'Quick Brown Fox Jumped . Lazy dog .'

    text = 'The Quick Brown Fox Jumped Over. The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, stopwords=['dog',
                         'fox']) == 'The Quick Brown Jumped Over . The Lazy .'

    text = 'The Quick Brown Fox, Jumped Over! - The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, replace_punctuation=True
    ) == 'The Quick Brown Fox  Jumped Over    The Lazy dog'

    text = 'The Quick Brown Fox, Jumped Over! - The Lazy dog.'
    assert text_preprocessing.preprocess_text(
        text, remove_punctuation=True
    ) == 'The Quick Brown Fox Jumped Over  The Lazy dog'
auth = tw.OAuthHandler(apikey, s_apikey)
auth.set_access_token(access, s_access)
api = tw.API(auth, wait_on_rate_limit=True)

search_term = input("What would you like to search on Twitter? ")
search_results = tw.Cursor(api.search, q=search_term, lang="en").items(1000)

raw_tweets = [tweet.text for tweet in search_results]

pfuncts = [
    to_lower, remove_number, remove_punctuation, remove_stopword,
    lemmatize_word
]
clean1 = p.clean(str(raw_tweets))
clean2 = preprocess_text(clean1, pfuncts)
tokens = word_tokenize(clean2)

tokenblob = tb(str(tokens))
tokensent = tokenblob.sentiment.polarity

print("Results")

print("Corpus Size: " + str(len(tokens)))
if tokensent > 0:
    print("Sentiment Score: " + str(tokensent) + " (Positive)\n")
elif tokensent == 0:
    print("Sentiment Score: " + str(tokensent) + " (Neutral)\n")
else:
    print("Sentiment Score: " + str(tokensent) + " (Negative)\n")
Example #12
def search_engine_3(encoded_query, inverted_idx2, squared_tfidf_per_document,
                    uncoded_query):
    """Uses search engine 2 to get the top 10 documents with with highest similarity to the query,
       then prompts the user to specify new info, related to the other book fields (e.g. bookTitle, setting, etc.),
       adjusts the score based on the new info and returns the top 3 books according to the new score
    Args:
        encoded_query (list): a textual query, encoded in integer
        inverted_idx2 (dict): the inverted index with tfidf scores
        squared_tfidf_per_document (dict): |d| of the cosine similarity formula (before sqrt) 
        uncoded_query (list): the same textual query, not encoded in integers

    Returns:
        [dict]: the top k documents ranked by the new adjusted score
    """

    # apply the second search engine (plot only)
    plot_result = search_engine_2(encoded_query, inverted_idx2,
                                  squared_tfidf_per_document, 10)

    additional_info = []

    # maps each additional field to their position in the .tsv files
    field_to_idx = {
        'booktitle': 0,
        'bookseries': 1,
        'bookauthors': 2,
        'publishingdate': 8,
        'characters': 9,
        'setting': 10
    }

    # prompts the user to insert additional information
    while True:
        try:
            info = input(
                'please insert additional_info:\n Insert field name followed by ":" and the value\n Type "end" when you are done\n'
            ).lower()

            if info == 'end':
                break

            info = info.split(':')

            if info[0] in field_to_idx:
                additional_info.append(info)
            else:
                print('field not found, please try again\n')

        except:
            print('field not found, please try again\n')

    final_score = {}

    # Iterates over each book from the second search engine output
    for doc, score in plot_result:
        total_score = score

        with open('.\\tsvs\\article_' + str(doc) + '.tsv',
                  'r',
                  encoding='utf-8') as f:
            all_fields = f.readlines()[2].split('\t')
            all_fields = [preprocess_text(field) for field in all_fields]

            # iterates over each additional info and if it matches, adjusts the score
            for item in additional_info:
                if item[1] in all_fields[field_to_idx[item[0]]]:
                    total_score += total_score * 1 / 2

        # final score for each document
        final_score[doc] = total_score

    # return the top 3 documents based on the new scoring
    return get_top_k(final_score, 3)
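The adjustment in the loop above amounts to multiplying the plot score by 1.5 for every additional field that matches. A small worked sketch with made-up numbers:

base_score = 0.40        # cosine similarity from search_engine_2
matching_fields = 2      # e.g. 'bookauthors' and 'setting' both matched
adjusted = base_score
for _ in range(matching_fields):
    adjusted += adjusted * 1 / 2
print(adjusted)          # 0.40 * 1.5 ** 2 = 0.90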
Example #13
    encoded_files_folder = "\\encoded_files\\"

    #create_inverted_idx(cwd, encoded_files_folder)
    #create_inverted_idx_2(cwd, encoded_files_folder)

    with open('inverted_idx.pickle', 'rb') as h:
        inverted_idx = pickle.load(h)

    with open('inverted_idx2.pickle', 'rb') as h:
        inverted_idx2 = pickle.load(h)

    with open('vocabulary.pickle', 'rb') as q:
        vocabulary = pickle.load(q)

    #store_squared_tfidf_per_document(inverted_idx2)

    with open('squared_tfidf_per_document.pickle', "rb") as q:
        squared_tfidf_per_document = pickle.load(q)

    query = input('enter your query:\n')
    preprocessed_query = preprocess_text(query)
    encoded_query = encode_query(preprocessed_query, vocabulary)

    print_search_engine_result(search_engine(encoded_query, inverted_idx))
    print_search_engine_2_result(
        search_engine_2(encoded_query, inverted_idx2,
                        squared_tfidf_per_document, 5))
    print_search_engine_2_result(
        search_engine_3(encoded_query, inverted_idx2,
                        squared_tfidf_per_document, preprocessed_query))
Example #14
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from poems import poems
from text_preprocessing import preprocess_text

processed_poems = [preprocess_text(poem) for poem in poems]

vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_poems)

corpus_index = [f"Poem {i+1}" for i in range(len(poems))]
feature_names = vectorizer.get_feature_names()
try:
    df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(),
                             index=feature_names,
                             columns=corpus_index)
    print(df_tf_idf)
except:
    pass
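A self-contained variant of the snippet above, using a made-up two-document corpus instead of the poems module; note that recent scikit-learn releases expose get_feature_names_out() in place of get_feature_names():

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["the rose is red", "the violet is blue"]
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(docs)

df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(),
                         index=vectorizer.get_feature_names_out(),
                         columns=[f"Doc {i+1}" for i in range(len(docs))])
print(df_tf_idf)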
Example #15
        titles.append(article.title)       # collect the article title
        texts.append(article.text)         # collect the full article text
        summaries.append(article.summary)  # collect the article summary
        #print(article.keywords)           # keywords of the article, if needed
        counter += 1
        if counter >= articles_examined:
            break

    except newspaper.article.ArticleException:
        continue

#########################PREPARE REUTERS CORPUS AND TRAIN CLASSIFIER######################################################
preprocessed_corpus = []

for fid in reuters.fileids():
    preprocessed_corpus.append(preprocess_text(reuters.words(fid)))

cleaned_preprocessed_corpus = []
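# NOTE: nothing in this excerpt fills cleaned_preprocessed_corpus, so the
# bag-of-words model below is fit on an empty list as written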

# creating the bag of words model
bag_of_words_creator = CountVectorizer()
bag_of_words = bag_of_words_creator.fit_transform(cleaned_preprocessed_corpus)

# creating the tf-idf model
tfidf_creator = TfidfVectorizer(min_df=0.2)
tfidf = tfidf_creator.fit_transform(preprocessed_corpus)

documents = [(list(reuters.words(fileid)), category)
             for category in reuters.categories()
             for fileid in reuters.fileids(category)]
random.shuffle(documents)
Example #16
def pretrain_actorCritic():
    if USE_SAVED_PREPROCESSED_INPUT:
        #sentences = pickle.load(open(PRETRAINING_PREPROCESSED_INPUT_FILE, 'r'))[:5000]
        sentences = pickle.load(open(PRETRAINING_PREPROCESSED_INPUT_FILE, 'rb'))[:5000]
    else:
        print("Loading data")
        sentences = load_data(PRETRAINING_DATA_FILE)
        print("Data loaded")
        sentences = preprocess_text(sentences)[:5000]
    print("shape of sentences", sentences.shape)

    print("Training w2v model")
    w2v_model = train_w2v_model(sentences)
    print("w2v model trained")

    token_sequences, output_sequences, token_to_index_dic = tokenize_and_pad_sentences(sentences)
    index_to_word_dic = get_index_to_word_dic(token_to_index_dic)
    token_sequences = np.asarray(token_sequences)
    output_sequences = np.asarray(output_sequences)
    print("input shape", token_sequences.shape)

    #token_sequences = token_sequences[:1000, :]
    #output_sequences[:1000, :]

    output_sequences = [one_hot(seq, len(token_to_index_dic)) for seq in output_sequences]
    print("Tokenization done. %d sequences" % len(token_sequences), "shape ", token_sequences.shape)
    #token_to_index_dic = get_word_to_index_dic(w2v_model, token_sequences)
    print("preprocessing done")
    train_x, train_y, val_x, val_y, test_x, test_y  = get_train_val_test_data(token_sequences, output_sequences)

    autoencoder = Autoencoder(w2v_model, token_to_index_dic)


    print("Creating NN model")
    autoencoder.create_nn_model()
    print("NN model created")

    if LOAD_WEIGHTS:
        print("Loading saved weights from %s" % PRETRAINING_ACTOR_WEIGHTS_FILE)
        autoencoder.load_weights(PRETRAINING_ACTOR_WEIGHTS_FILE)

    if TRAIN_ACTOR:
        print("Training actor")
        autoencoder.train(train_x, train_y,  val_x, val_y)

        if SAVE_WEIGHTS:
            print("Saving actor weights")
            autoencoder.save(PRETRAINING_ACTOR_WEIGHTS_FILE)

    print("Predicting using actor")
    output = autoencoder.predict(train_x)
    for seq in output:
        print(index_to_sentence(index_to_word_dic, [np.argmax(ele) for ele in seq]))

    print("Initializing actorCritic")
    actor_critic = ActorCriticAutoEncoder(w2v_model=w2v_model,
            token_to_index_dic=token_to_index_dic,
            actor=autoencoder.autoencoder)
    print("Creating critic model")
    actor_critic.create_critic_model()
    print("Critic model created")

    critic_train_x = output
    critic_train_y = [one_hot(seq, len(token_to_index_dic)) for seq in train_x]
    if TRAIN_CRITIC:
        print("Training critic")
        actor_critic.train_critic(critic_train_x, critic_train_y)
        print("Critic trained")

        if SAVE_WEIGHTS:
            print("Saving critic")
            actor_critic.save_critic(PRETRAINING_CRITIC_MODEL_FILE)
            print("Critic saved")