Code example #1
File: bag_of_words.py Project: knights207210/nlp
def text_to_bow_vector(some_text, features_dictionary):
    bow_vector = [0] * len(features_dictionary)
    tokens = preprocess_text(some_text)
    for token in tokens:
        feature_index = features_dictionary[token]
        bow_vector[feature_index] += 1
    return bow_vector, tokens
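Note: every snippet on this page calls a preprocess_text helper that none of the examples define, and its return type varies by project (a token list here, a cleaned string in the tf-idf examples, and a tuple in run-basic-search.py). As a hedged reference only, a minimal sketch of the token-list variant used by this bag_of_words.py example might look like the following; the NLTK-based implementation is an assumption, not the project's actual code.

# Hypothetical sketch of a preprocess_text helper that returns a token list;
# the real implementations used by the projects on this page are not shown here.
# Requires the NLTK tokenizer and WordNet data to be downloaded.
import re

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

lemmatizer = WordNetLemmatizer()

def preprocess_text(some_text):
    # lowercase and drop everything except letters and whitespace
    cleaned = re.sub(r'[^a-z\s]', ' ', some_text.lower())
    # tokenize and lemmatize each remaining token
    return [lemmatizer.lemmatize(token) for token in word_tokenize(cleaned)]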
Code example #2
 def value_for_keys_with_item(self, item, *keys):
     """
     Returns the concatenated preprocessed values for keys in the
     item dictionary.
     """
     val = ' '.join([item.get(k, '') for k in keys])
     return preprocessing.preprocess_text(val)
Code example #3
File: run-basic-search.py Project: rokcej/wier-2021
def search(query_words, document_names):
    search_results = []
    for document_name in document_names:
        page_path = config.INPUT_PATH + "/" + document_name
        with open(page_path, "r", encoding="utf-8") as f:
            # Extract text from page
            page_html = f.read()
            page_text = preprocessing.extract_text(page_html)

            # Process words
            page_words, page_indexes, page_strings = preprocessing.preprocess_text(
                page_text)

            # Find query matches
            frequency = 0
            indexes = []
            for page_word, page_index in zip(page_words, page_indexes):
                if page_word in query_words:
                    indexes.append(page_index)
                    frequency += 1

            if frequency > 0:
                # Get snippets
                snippets_str = searching.extract_snippets(
                    indexes, page_strings)

                # Add to search results
                search_results.append((frequency, document_name, snippets_str))

    return search_results
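Each tuple returned by search() starts with the match count, so a caller can rank documents by sorting on that first element. A small hedged sketch of such a caller (the ranked variable and the output format are illustrative, not part of the project):

# Hypothetical ranking of the tuples returned by search(), highest frequency first.
ranked = sorted(search_results, key=lambda result: result[0], reverse=True)
for frequency, document_name, snippets_str in ranked[:10]:
    print(f"{frequency:5d}  {document_name}")
    print(f"       {snippets_str}")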
Code example #4
def process_init(text):
    try:
        #  Convert text to lower
        lower_text = text.lower()
        # Handle Emojis
        emojis_text = prep.handle_emojis(lower_text)
        # Cleaning text
        processed_text = prep.preprocess_text(emojis_text)

        # tokenize
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(processed_text)

        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]

        # filter out stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if w not in stop_words]

        # tokens = [word for word in tokens if len(word) > 1]

        return tokens

    except Exception:
        # on any preprocessing error, return None for this text
        return None
Code example #5
File: evaluate.py Project: amieo-ra/nmt-tensorflow
def evaluate(text):
    with open('input_tokenizer.pickle', 'rb') as handle:
        input_tokenizer = pickle.load(handle)
        
    with open('output_tokenizer.pickle', 'rb') as handle:
        output_tokenizer = pickle.load(handle)
        
    input_vocab_size = len(input_tokenizer.word_index) + 1
    output_vocab_size = len(output_tokenizer.word_index) + 1
    
    text = preprocess_text(text)  
    seq = input_tokenizer.texts_to_sequences([text])
    inputs = tf.keras.preprocessing.sequence.pad_sequences(seq, truncating='post', padding='post')
    inputs = tf.convert_to_tensor(inputs)
    
    result = ""
    
    encoder = Encoder(input_vocab_size, constants.embedding_dim, constants.units, constants.BATCH_SIZE)
    decoder = Decoder(output_vocab_size, constants.embedding_dim, constants.units, constants.BATCH_SIZE)
    
    checkpoint_dir = './checkpoints'
    checkpoint = tf.train.Checkpoint(encoder=encoder, decoder=decoder)
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    
    enc_outputs, enc_hidden = encoder(inputs)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([output_tokenizer.word_index['<start>']], 0)
    
    result = beam_search(constants.beam_width, decoder, dec_input, dec_hidden, 
                         enc_outputs, output_tokenizer.word_index['<end>'], output_vocab_size)
    result = output_tokenizer.sequences_to_texts([result])
    print(result[0])
Code example #6
File: search.py Project: Evevon/IRalbumsearch
def term_based_search():
    query = 'best albums of 2019'
    query = preprocess_text(query)

    n = {url['doc_id'] for item in index.values() for url in item}
    n = len(n)

    query_tfidf = [
        (max(float(0), 1 + log10(count))) * (log10(n / len(index[word])))
        for word, count in Counter(query).items()
    ]

    dict_docs = dict()
    for j, word in enumerate(query):
        for i in index[word]:
            if i['doc_id'] in dict_docs:
                dict_docs[i['doc_id']][j] = i['tfidf']
            else:
                dict_docs[i['doc_id']] = [0] * len(query)
                dict_docs[i['doc_id']][j] = i['tfidf']

    search_results = dict()
    for key, vector in dict_docs.items():
        search_results[key] = (1 -
                               spatial.distance.cosine(query_tfidf, vector))

    sorted_x = sorted(search_results.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
    print(sorted_x[:20])
Code example #7
def text_to_bow(some_text):
    bow_dictionary = {}
    tokens = preprocess_text(some_text)
    for token in tokens:
        if token in bow_dictionary:
            bow_dictionary[token] += 1
        else:
            bow_dictionary[token] = 1
    return bow_dictionary
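For comparison, the same word counting can be done with collections.Counter, which code example #31 below uses directly. A minimal equivalent (the text_to_bow_counter name is made up for illustration):

# Equivalent bag-of-words counting with collections.Counter;
# assumes preprocess_text returns a list of tokens, as in example #7 above.
from collections import Counter

def text_to_bow_counter(some_text):
    return Counter(preprocess_text(some_text))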
Code example #8
 def tag_glossary(self, tag):
     """
     Returns the glossary text of a tag if set. Otherwise returns an
     empty string.
     """
     glossary_id = self.tag(tag)['glossary']
     if glossary_id:
         return preprocessing.preprocess_text(self.item(glossary_id)['text'])
     return ''
Code example #9
File: bag_of_words.py Project: knights207210/nlp
def create_features_dictionary(documents):
    features_dictionary = {}
    merged = " ".join(documents)
    tokens = preprocess_text(merged)
    index = 0
    for token in tokens:
        if token not in features_dictionary:
            features_dictionary[token] = index
            index += 1
    return features_dictionary, tokens
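Together with text_to_bow_vector from code example #1, the two bag_of_words.py helpers can be wired up roughly as follows. The training documents and test sentence are made-up placeholders, and the test sentence deliberately reuses only training vocabulary because text_to_bow_vector raises a KeyError for unseen tokens:

# Hypothetical usage of create_features_dictionary (#9) and text_to_bow_vector (#1).
training_documents = [
    "Five fantastic fish flew off to find faraway functions.",
    "Maybe find another five fantastic fish?",
]

features_dictionary, _ = create_features_dictionary(training_documents)

# The test sentence only reuses words from the training documents, since
# text_to_bow_vector raises KeyError for tokens missing from the dictionary.
bow_vector, tokens = text_to_bow_vector(
    "Another five fish find another faraway fish.", features_dictionary)
print(bow_vector)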
Code example #10
def test():
    input_text = request.form["tweet"]
    input_button = request.form["button"]

    print(input_text)
    print(input_button)

    text = preprocess_text(input_text)
    pred = model.predict(text)

    return render_template("index.html", pred=str(pred))
Code example #11
def bot():
    incoming_msg = request.values.get('Body', '').lower()
    resp = MessagingResponse()
    msg = resp.message()
    responded = False

    hello_list = ['hello', 'hey', 'start', 'hi']
    global hello_flag

    # --------------------------
    # First Time Welcome Message
    # --------------------------
    if any(hello in incoming_msg for hello in hello_list) and hello_flag == 0:
        set_global_flag(value=1)

        hello_message = """_Hi, 
        I am *COVID19 Mythbuster*_ 👋🏻

        ◻️ _In these crazy hyperconnected times, there is a lot of FAKE NEWS spreading about the NOVEL CORONAVIRUS._

        ◻️ _I Can Help You In Differentiating the Fake News From The Real News_ 📰

        ◻️ _All you need to do is send me the news you get to verify if it Real or not._ 

        _It's that simple 😃
        Try it for yourself, simply send me a News About COVID19 and I'll try to tell if it is Fake Or Real_ ✌🏻✅
        """

        msg.body(hello_message)
        responded = True

    else:
        text = preprocess_text(incoming_msg)
        pred = model.predict(text)[0][0]

        output = ''

        if pred > 0.5:
            output = "The given news is real"
            responded = True
        elif pred < 0.5:
            output = "The given news is fake"
            responded = True

        msg.body(output)

    if not responded:
        msg.body(
            """That didn't quite work! Try some other text, or send a
            Hello to get started if you haven't already""")

    return str(resp)
Code example #12
    def process_item(self, album, spider):

        dir_ = os.path.dirname(os.path.abspath(__file__))

        #sentiment and entity extraction
        blob = album['description']

        sentiment_blob = TextBlob(blob)
        nlp = spacy.load('en')
        entity_blob = nlp(blob)
        entities = {X.text for X in entity_blob.ents if X.label_ == 'PERSON'}
        entities = list(entities)

        sentiment_dict = {
            'sentiment': sentiment_blob.sentiment[0],
            'polarity': sentiment_blob.sentiment[1],
            'entities': entities
        }

        with open(
                dir_ + '/sentiment/' + spider.name + str(spider.count) +
                '_sentiment.json', 'w') as outfile:
            json.dump(sentiment_dict, outfile)

        # text preprocessing
        album['description'] = preprocessing.preprocess_text(
            album['description'])
        album['name'] = preprocessing.preprocess_text(album['name'],
                                                      specialchars=False,
                                                      stopwords=False,
                                                      stem=False)

        # write scraped data to json file
        write_to_json(spider.name + str(spider.count) + '.json', spider.count,
                      dict(album))

        return album
Code example #13
File: search.py Project: Evevon/IRalbumsearch
def basic_search():
    query = 'best albums of 2019'
    query = preprocess_text(query)

    dict_docs = dict()
    for j, word in enumerate(query):
        for i in index[word]:
            if i['doc_id'] in dict_docs:
                dict_docs[i['doc_id']][j] = i['count']
            else:
                dict_docs[i['doc_id']] = [0] * len(query)
                dict_docs[i['doc_id']][j] = i['count']

    search_results = dict()
    for key, vector in dict_docs.items():
        search_results[key] = sum(vector)

    sorted_x = sorted(search_results.items(),
                      key=operator.itemgetter(1),
                      reverse=True)
    print(sorted_x[:20])
Code example #14
def main():
    preprocessed_text = preprocessing.preprocess_text(
        preprocessing.query_emailbody())
    documents_to_tokens = [
        tokenizer.tokenize(sentence) for sentence in preprocessed_text
    ]
    message_id = preprocessing.query_messageid()
    final_df = unsupervised_learning(preprocessed_text, documents_to_tokens,
                                     message_id)

    # Create a temporary table to store the results of unsupervised learning
    final_df.to_sql('unsupervised_temp', con, if_exists='replace')

    # Update folder_directory in emails table and delete temporary table for unsupervised learning
    cur.executescript("""UPDATE emails_main
                         SET folder_directory = (
                            SELECT Topic 
                            FROM unsupervised_temp 
                            WHERE message_id = emails_main.message_id)
                         WHERE emails_main.folder_directory IS NULL;

                         DROP TABLE unsupervised_temp;""")
    con.commit()
Code example #15
import codecademylib3_seaborn
import pandas as pd
import numpy as np
from articles import articles
from preprocessing import preprocess_text

# import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# view article
#print(articles[4])

# preprocess articles
processed_articles = []
for i in range(0,10):
  processed_articles.append(preprocess_text(articles[i]))
#print(processed_articles[4])

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(processed_articles)

# convert counts to tf-idf
transformer = TfidfTransformer(norm=None)
tfidf_scores_transformed = transformer.fit_transform(counts)

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_articles)

# check if tf-idf scores are equal
Code example #16
import codecademylib3_seaborn
import pandas as pd
import numpy as np
from articles import articles
from preprocessing import preprocess_text

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# check one of the articles
#print(articles[0])

# preprocess articles
processed_articles = [preprocess_text(article) for article in articles]
print(processed_articles[0])

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(processed_articles)

# convert counts to tf-idf
transformer = TfidfTransformer(norm=None)
tfidf_scores_transformed = transformer.fit_transform(counts)

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_articles)

# check if tf-idf scores are equal
if np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense()):
  print(pd.DataFrame({'Are the tf-idf scores the same?':['YES']}))
else:
  print(pd.DataFrame({'Are the tf-idf scores the same?':['NO']}))
Code example #17
File: exercise_6.py Project: piotrkawa/data-mining
def get_chapter_cloud(chapters, chapter):
    chapter_cloud_data = []
    unique_words = set(chapter)
    for word in unique_words:        
        weight = utility.tf_idf(word, chapters, chapter)
        chapter_cloud_data.append((word, int(weight * 100)))
    chapter_cloud_data.sort(key=lambda element: element[1], reverse=True)
    return chapter_cloud_data


if __name__ == '__main__':
    book = utility.get_text_file_as_list('shrek.txt')
    chapters = utility.split_by_delimiter(book, "#" * 10)
    preprocessed_chapters = [preprocess_text(chapter) for chapter in chapters]
    cloud_data = prepare_word_cloud_data(preprocessed_chapters)
    
    for i, data in enumerate(cloud_data): 
        wc = WordCloud(background_color="white", max_words=2000, contour_width=3, contour_color='steelblue')
        wc.generate_from_frequencies(dict(data[5:]))
        wc.to_file(f'clouds/shrek_cloud{i}.png')

    # subexercise 5
    preprocessed_book = preprocess_text(book)
    cloud = get_chapter_cloud(preprocessed_book, preprocessed_book)
    wc = WordCloud(background_color="white", max_words=2000, contour_width=3, contour_color='steelblue')
    wc.generate_from_frequencies(dict(cloud[15:]))
    wc.to_file('clouds/book_tf_idf.png')

Code example #18
import pandas as pd
import numpy as np
from articles import articles
from preprocessing import preprocess_text

##Note, this code was built with a codecademy course. There are modifications to pull the top n words instead of 1 which the cours did

# import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# preprocess articles
processed_articles = [preprocess_text(story) for story in articles]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(processed_articles)

# convert counts to tf-idf
transformer = TfidfTransformer(norm=None)

# initialize and fit TfidfVectorizer
tfidf_scores_transformed = transformer.fit_transform(counts)

vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_articles)
# check if tf-idf scores are equal
if np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense()):
    print(pd.DataFrame({'Are the tf-idf scores the same?': ['YES']}))
else:
    print(
        pd.DataFrame({'Are the tf-idf scores the same?': ['NO']}))
Code example #19
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from preprocessing import preprocess_text

response_a = "Every dress style is cut from a polyester blend for a stretchy fit."
response_b = "The 'Elosie' dress runs large. I suggest you take your regular size or smaller."
response_c = "The 'Elosie' dress comes in green, lavender, and orange."
user_message = "Hello! What is the fit of the 'Elosie' dress? My shoulders are broad, so I often size up for a comfortable fit. Do dress sizes run large or small?"

documents = [response_a, response_b, response_c, user_message]

# preprocess responses and user_message
processed_docs = [preprocess_text(doc) for doc in documents]

# create tfidf vectorizer
vectorizer = TfidfVectorizer()

# fit and transform vectorizer on processed docs
tfidf_vectors = vectorizer.fit_transform(processed_docs)

# compute cosine similarity between the user message tf-idf vector and each response tf-idf vector
cosine_similarities = cosine_similarity(tfidf_vectors[-1], tfidf_vectors)
# get the index of the most similar response to the user message
# (the highest score is the message compared with itself, so take the second-highest)
similar_response_index = cosine_similarities.argsort()[0][-2]

best_response = documents[similar_response_index]
print(best_response)
Code example #20
import os

from torchtext_sentiment import analyse_sentiments
from utils import get_model_name


# INPUTS
############
PROCESS_DATASETS = False
CREATE_EMBEDDINGS = False
TRAINING_MODULE = True
training_mode = True

if PROCESS_DATASETS:
    dataset_path = os.path.normpath(os.getcwd() + os.sep + os.pardir)
    dataset_path = os.path.join(dataset_path, "data")
    dataset_path = os.path.join(dataset_path, "training.1600000.processed.noemoticon.csv")
    preprocess_text(dataset_path, stem=False)


if CREATE_EMBEDDINGS:
    # TODO CREATE OWN EMBEDDINGS
    embedding_params = [{
        'min_count': [1],  # pick a fixed value here
        'max_vocab_size': [1000e3],  # pick a fixed value here, e.g. 50k
        'window_size': [7],  # should we test [5, 10]? for skip-gram usually around 10, for CBOW around 5
        'vector_size': [100],  # should we test [10, 100, 300]?
        'noise_words': [20],  # for large datasets between 2-5; pick one
        'use_skip_gram': [1],  # 1 for skip-gram, 0 for CBOW, test with both?
        'cbow_mean': [0],  # if using cbow
        'w2v_iters': [10]  # is this enough?
    }]
Code example #21
Not one of all the purple host
Who took the flag to-day
Can tell the definition,
So clear, of victory,

As he, defeated, dying,
On whose forbidden ear
The distant strains of triumph
Break, agonized and clear!'''

# define clear_count:
clear_count = 2

# preprocess text
processed_poem = preprocess_text(poem)

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform([processed_poem])

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()

# print(feature_names)
# create pandas DataFrame with term frequencies
try:
    df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(),
                                       index=feature_names,
                                       columns=['Term Frequency'])
    print(df_term_frequencies)
except Exception:
    # ignore errors if the DataFrame cannot be built or printed
    pass
Code example #22
import hickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import numpy as np
from preprocessing import preprocess_text

X_train, X_test, y_train, y_test, vocab_size, embedding_matrix, maxlen, tokenizer = hickle.load(
    "preprocessed.hickle")

test_texts = [
    "f*****g shit movie so bad omg why is it this shit dont like",
    "i really liked the style of the movie, moreover the style is really fresh and nice"
]

for i, text in enumerate(test_texts):
    test_texts[i] = preprocess_text(text)

tokenized = tokenizer.texts_to_sequences(test_texts)
tokenized = pad_sequences(tokenized, padding='post', maxlen=maxlen)

model = load_model("model.h5")
model.summary()
predict = model.predict(tokenized)

print(predict)
Code example #23
File: run-basic-search.py Project: rokcej/wier-2021
    return search_results


if __name__ == "__main__":
    # Parse parameters
    if len(sys.argv) < 2:
        print(f"Error: Missing search parameter!")
        sys.exit(1)
    query_text = sys.argv[1]

    # Use perf_counter instead of process_time when multiprocessing
    time_start = time.perf_counter()  # Start timer

    # Preprocess query
    query_words, _, _ = preprocessing.preprocess_text(query_text)
    query_words = searching.remove_duplicates(query_words)

    search_results = []

    # Get list of all documents
    document_names = []
    for site in config.INPUT_SITES:
        site_path = config.INPUT_PATH + "/" + site
        padding = (max([len(x) for x in config.INPUT_SITES]) -
                   len(site)) * " "  # Add spaces to align progress bars
        for page in os.listdir(site_path):
            # Only process html files with the same name as site
            if page.startswith(site) and page.endswith(".html"):
                document_names.append(site + "/" + page)
Code example #24
import codecademylib3_seaborn
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from raven import the_raven_stanzas
from preprocessing import preprocess_text

# view first stanza

print(the_raven_stanzas[0])
vectorizer = TfidfVectorizer(norm=None)

# preprocess documents
processed_stanzas = [preprocess_text(stanza) for stanza in the_raven_stanzas]

# initialize and fit TfidfVectorizer

tfidf_scores = vectorizer.fit_transform(processed_stanzas)

# get vocabulary of terms

feature_names = vectorizer.get_feature_names()

# get stanza index
stanza_index = [f"Stanza {i+1}" for i in range(len(the_raven_stanzas))]

# create pandas DataFrame with tf-idf scores
try:
    df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(),
                             index=feature_names,
                             columns=stanza_index)
    print(df_tf_idf)
except Exception:
    # ignore errors if the DataFrame cannot be built or printed
    pass
Code example #25
File: term_frequency.py Project: rwilmar/lnp_tests
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from preprocessing import preprocess_text
from poems import poems

# preprocess text
processed_poems = [preprocess_text(poem) for poem in poems]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(processed_poems)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()

# get corpus index
corpus_index = [f"Poem {i+1}" for i in range(len(poems))]

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(),
                                   index=feature_names,
                                   columns=corpus_index)
Code example #26
import codecademylib3_seaborn
import pandas as pd
import numpy as np
from articles import articles
from preprocessing import preprocess_text

# import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

# view article
print(articles[0])

# preprocess articles
processed_articles = [preprocess_text(document) for document in articles]

# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(processed_articles)

# convert counts to tf-idf
transformer = TfidfTransformer(norm=None)
tfidf_scores_transformed = transformer.fit_transform(counts)

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(processed_articles)

# check if tf-idf scores are equal
if np.allclose(tfidf_scores_transformed.todense(), tfidf_scores.todense()):
    print(pd.DataFrame({'Are the tf-idf scores the same?': ['YES']}))
else:
    print(pd.DataFrame({'Are the tf-idf scores the same?': ['NO']}))
Code example #27
File: nlp_tf_idf.py Project: adelrio89/codecademy
import nltk, re
from sherlock_holmes import bohemia_ch1, bohemia_ch2, bohemia_ch3, boscombe_ch1, boscombe_ch2, boscombe_ch3
from preprocessing import preprocess_text
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# preparing the text
corpus = [
    bohemia_ch1, bohemia_ch2, bohemia_ch3, boscombe_ch1, boscombe_ch2,
    boscombe_ch3
]
preprocessed_corpus = [preprocess_text(chapter) for chapter in corpus]

# Update stop_list:
stop_list = [
    'man', 'say', 'upon', 'could', 'one', 'see', 'think', 'know', 'come', 'yes'
]


# filtering topics for stop words
def filter_out_stop_words(corpus):
    no_stops_corpus = []
    for chapter in corpus:
        no_stops_chapter = " ".join(
            [word for word in chapter.split(" ") if word not in stop_list])
        no_stops_corpus.append(no_stops_chapter)
    return no_stops_corpus


filtered_for_stops = filter_out_stop_words(preprocessed_corpus)
Code example #28
import codecademylib3_seaborn
from preprocessing import preprocess_text
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# sample documents
document_1 = "This is a sample sentence!"
document_2 = "This is my second sentence."
document_3 = "Is this my third sentence?"

# corpus of documents
corpus = [document_1, document_2, document_3]

# preprocess documents
processed_corpus = [preprocess_text(doc) for doc in corpus]

# initialize and fit TfidfVectorizer
vectorizer = TfidfVectorizer(norm=None)
tf_idf_scores = vectorizer.fit_transform(processed_corpus)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()
corpus_index = [n for n in processed_corpus]

# create pandas DataFrame with tf-idf scores
df_tf_idf = pd.DataFrame(tf_idf_scores.T.todense(),
                         index=feature_names,
                         columns=corpus_index)
print(df_tf_idf)
Code example #29
    for folder in os.listdir(text_dir):
        all_text[folder] = []
        for file_ in os.listdir(text_dir / folder):
            with open(text_dir / folder / file_) as f:
                all_text[folder].append(f.read().strip())
    return all_text

if __name__ == "__main__":
    
    #reading files and getting the data as a dict
    all_text_list = get_all_text()

    #cleaning and preprocessing the data
    all_text_list_clean = {}
    for i, class_ in enumerate(all_text_list):
        all_text_list_clean[class_] = []
        for j, _ in enumerate(all_text_list[class_]):
            all_text_list_clean[class_].append(preprocess_text(all_text_list[class_][j]))
    
    #saving text as minimal processed doc
    all_text = ''
    for class_ in all_text_list_clean:
        for value in all_text_list_clean[class_]:
            if value.strip() != '':
                all_text += f'__label__{class_} {value}\n'
                # all_text += f'{value}\n'

    with open('./data/raw_text.txt', 'w') as f:
        f.write(all_text.strip())

Code example #30
# {key=word: val=frequency}
bow_vectorizer = CountVectorizer()

# Define friends_vectors:
friends_vectors = bow_vectorizer.fit_transform(friends_docs)

# Define friends_classifier:
friends_classifier = MultinomialNB()

# train test split
X_train, X_test, y_train, y_test = train_test_split(friends_vectors, friends_labels, test_size=0.33, random_state=42)

# never-before-seen message for which the classifier will predict the sender
# mystery_message = "big boi manipulation lol"
mystery_message = '''lmao well since you're up, ill update and say i will never not buy anything off of amazon ever again literally found the same shoe for 40 dollars less than their sale?'''
mystery_message = preprocess_text(mystery_message)

# Define mystery_vector:
mystery_vector = bow_vectorizer.transform([mystery_message])

# Train the classifier:
# friends_classifier.fit(friends_vectors, friends_labels)
friends_classifier.fit(X_train, y_train)


# Change prediction back to a name:
predictions = friends_classifier.predict(mystery_vector)
confidence = friends_classifier.predict_proba(mystery_vector) #technically the probability.
score = friends_classifier.score(X_test, y_test)
score1 = friends_classifier.score(X_train, y_train)
Code example #31
from preprocessing import preprocess_text
from nltk.util import ngrams
from collections import Counter

text = "It's exciting to watch flying fish after a hard day's work. I don't know why some fish prefer flying and other fish would rather swim. It seems like the fish just woke up one day and decided, 'hey, today is the day to fly away.'"
tokens = preprocess_text(text)

# Bigram approach:
bigrams_prepped = ngrams(tokens, 2)
bigrams = Counter(bigrams_prepped)
print(
    "Three most frequent word sequences and the number of occurrences according to Bigrams:"
)
print(bigrams.most_common(3))

# Bag-of-Words approach:
# Define bag_of_words here:
bag_of_words = Counter(tokens)
most_common_three = bag_of_words.most_common(3)
print(
    "\nThree most frequent words and number of occurrences according to Bag-of-Words:"
)
print(bag_of_words)

print(most_common_three)
Code example #32
File: exercise_8.py Project: piotrkawa/data-mining
    word_occurence_indices = [
        index for index, element in enumerate(book) if element.strip() == word
    ]
    return [
        book[index + 2] for index in word_occurence_indices
        if index + 2 < len(book)
    ]


def generate_random_paragraph(start_word, words, length=50):
    next_word = words[start_word][random.randint(0,
                                                 len(words[start_word]) - 1)]
    paragraph = [start_word, next_word]

    current_word = next_word

    for _ in range(length - 2):
        random_index = random.randint(0, len(words[current_word]) - 1)
        next_word = words[current_word][random_index]
        current_word = next_word
        paragraph.append(next_word)
    return ' '.join(paragraph)


if __name__ == '__main__':
    book = utility.get_text_file_as_list('shrek.txt')
    book = preprocessing.preprocess_text(book)
    words_with_successors = get_words_with_most_common_successors(book)
    paragraph = generate_random_paragraph('donkey', words_with_successors)
    pdb.set_trace()