def __init__(self,
                 X_train: list,
                 Y_train: list,
                 embed_path: str,
                 embed_dim: int,
                 stop_words=[],
                 X_test=[],
                 Y_test=[],
                 max_len=None,
                 epochs=3,
                 batch_size=256):

        # Preprocessing the text
        X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
        Y_train = np.asarray(Y_train)

        # Tokenizing the text
        tokenizer = Tokenizer()
        tokenizer.fit_on_texts(X_train)

        # Saving the tokenizer
        self.tokenizer = tokenizer

        # Creating the embedding matrix
        embedding = Embeddings(embed_path, embed_dim)
        embedding_matrix = embedding.create_embedding_matrix(
            tokenizer, len(tokenizer.word_counts))

        # Creating the padded input for the deep learning model
        if max_len is None:
            max_len = np.max([len(text.split()) for text in X_train])
        TextToTensor_instance = TextToTensor(tokenizer=tokenizer,
                                             max_len=max_len)
        X_train = TextToTensor_instance.string_to_tensor(X_train)

        # Creating the model
        rnn = RnnModel(embedding_matrix=embedding_matrix,
                       embedding_dim=embed_dim,
                       max_len=max_len)
        rnn.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)

        self.model = rnn.model

        # If X_test is provided we make predictions with the created model
        if len(X_test) > 0:
            X_test = [clean_text(text, stop_words=stop_words) for text in X_test]
            X_test = TextToTensor_instance.string_to_tensor(X_test)
            yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]

            self.yhat = yhat

            # If true labels are provided we calculate the accuracy of the model
            if len(Y_test) > 0:
                yhat_binary = [1 if x > 0.5 else 0 for x in yhat]
                self.acc = accuracy_score(Y_test, yhat_binary)
                self.f1 = f1_score(Y_test, yhat_binary)
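
# ---------------------------------------------------------------------------
# Hypothetical usage sketch for the constructor above. The class name
# `Pipeline`, the embedding file name and the toy data are assumptions made
# for illustration; only the keyword arguments come from the signature above.
# ---------------------------------------------------------------------------
train_texts = ["this movie was great", "terrible plot and acting"]
train_labels = [1, 0]
test_texts = ["great acting", "awful movie"]
test_labels = [1, 0]

pipeline = Pipeline(X_train=train_texts,
                    Y_train=train_labels,
                    embed_path="glove.6B.100d.txt",  # assumed embedding file
                    embed_dim=100,
                    stop_words=["a", "the", "is"],
                    X_test=test_texts,
                    Y_test=test_labels,
                    epochs=3,
                    batch_size=256)

# acc, f1 and yhat are only set when X_test / Y_test are provided
print(pipeline.acc, pipeline.f1)
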
def crawler(x):
    import os
    import sys
    import urllib.parse
    import urllib.request
    from bs4 import BeautifulSoup as bts
    from text_preprocessing import clean_text
    from lst_pick import lst_pick
    client_id = "IFvtovuLLdeQi6K6jywv"
    client_secret = "_51j6auaOC"
    encText = urllib.parse.quote(x)
    start = 1
    str_big = []
    while start < 1000:
        url = "https://openapi.naver.com/v1/search/news.xml?query=" + encText + \
            "&display=30" +  "&sort=date" + "&start=" + str(start)  # xml 결과
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)
        response = urllib.request.urlopen(request)
        rescode = response.getcode()
        if rescode == 200:
            response_body = response.read()
            a = response_body.decode('utf-8')
        else:
            print("Error Code: " + str(rescode))
            break

        html = bts(a, "html.parser")
        news_titles = html.find_all("title")

        for title in news_titles:
            title_str = str(title.string)
            str_big.append(title_str.strip())
        start += 30

    data = {}
    try:
        pre_str_big = [clean_text(i) for i in str_big]
        for row in pre_str_big:
            for text in row.split():
                data[text] = data.get(text, 0) + 1
    except Exception as e:
        print("Exception occurred:", e)
    return data
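
# Hypothetical usage sketch for crawler(); the search term is an assumption.
# The returned dict maps each word found in the crawled news titles to its
# count, so sorting by value gives the most frequent words.
if __name__ == "__main__":
    word_counts = crawler("machine learning")
    top_words = sorted(word_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]
    print(top_words)
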
Example #3
df.index = df.Date
df = df.sort_index()

# create a series of held out documents to be tested - This will be 3 months of documents from Jan 18 - March 18
df_heldout = df.truncate(before=datetime.date(year=2018, month=1, day=1),
                         after=datetime.date(year=2018, month=3, day=31))

# cut the dataframe so that it only contains values from 2014 through the end of 2017
df = df.truncate(before=datetime.date(year=2014, month=1, day=1),
                 after=datetime.date(year=2017, month=12, day=31))
df = df.drop(['DateTime', 'Date', 'Time'], axis=1)

# lemmatize the data
lemmatized_data = []
for post in df.Content:
    lemmatized_data.append(clean_text(post))
print('posts have been lemmatized')

# set up stopwords
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['from', 'subject', 're', 'edu', 'use'])
STOPWORDS.extend(['s', 'https', 'www', 'http', 'com', 't'])

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(lemmatized_data)
print(dictionary)
dictionary.save(
    'C:/Users/tliu/Documents/4YP/Outputs/dictionary_2014-2018.dict')

# Build the corpus
corpus = [dictionary.doc2bow(text) for text in lemmatized_data]
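
# The STOPWORDS list built above is not applied anywhere in this snippet; a
# minimal sketch of one way to use it, filtering tokens before building the
# dictionary (this assumes clean_text returns a list of tokens, which is what
# corpora.Dictionary expects):
stop_set = set(STOPWORDS)
filtered_data = [[token for token in doc if token not in stop_set]
                 for doc in lemmatized_data]
filtered_dictionary = corpora.Dictionary(filtered_data)
filtered_corpus = [filtered_dictionary.doc2bow(text) for text in filtered_data]
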
Example #4
from text_preprocessing import clean_text
from vocabulary import build_vocabulary, known_words, WORDS_NLTK
from spell_checker import SpellChecker


# Load training text
filepath = ''  # set this to the path of the text file holding your training data
with open(filepath) as f:
    text = f.read()

# Clean text
text = clean_text(text)

# build vocabulary
WORDS = build_vocabulary(text)
KNOWN_WORDS = known_words(list(WORDS.keys()))

# Instantiate SpellChecker
spellchecker = SpellChecker(words=WORDS, known_words=KNOWN_WORDS)
spellchecker.spell_checking('helo')
Example #5
        seen.add(b)  # tail of a loop that drops duplicate documents while building newarr
print('Number of Documents (no repeats): ' + str(len(newarr)))

# Initialise a dataframe with dates
df = pd.DataFrame(newarr, columns=['DateTime', 'Content'])
df.to_pickle('C:/Users/tliu/Documents/4YP/Outputs/Pickles/Initial_DF_all.pkl')

#  sort the information
# df['DateTime'] = [datetime.datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in df['DateTime']]
# df['Date'] = [datetime.datetime.date(d) for d in df['DateTime']]
# df['Time'] = [datetime.datetime.time(d) for d in df['DateTime']]

posts = [i[1] for i in newarr]
tokenized_data = []
for text in posts:
    tokenized_data.append(clean_text(text))

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Filter out the extremes
dictionary.filter_extremes(no_below=50)
print(dictionary)

# Save the dictionary
dictionary.save('C:/Users/tliu/Documents/4YP/Outputs/dictionary_all.dict')

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Save the corpus
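# A minimal sketch (not the original script's code) of one common way to
# persist the bag-of-words corpus and reload both it and the dictionary later
# with gensim; the .mm output path is an assumption, only the dictionary path
# comes from the script above.
from gensim import corpora

corpora.MmCorpus.serialize('C:/Users/tliu/Documents/4YP/Outputs/corpus_all.mm', corpus)

loaded_corpus = corpora.MmCorpus('C:/Users/tliu/Documents/4YP/Outputs/corpus_all.mm')
loaded_dictionary = corpora.Dictionary.load(
    'C:/Users/tliu/Documents/4YP/Outputs/dictionary_all.dict')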