def __init__(self, X_train: list, Y_train: list, embed_path: str, embed_dim: int,
             stop_words=[], X_test=[], Y_test=[], max_len=None, epochs=3, batch_size=256):
    # Preprocessing the text
    X_train = [clean_text(text, stop_words=stop_words) for text in X_train]
    Y_train = np.asarray(Y_train)

    # Tokenizing the text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train)

    # Saving the tokenizer
    self.tokenizer = tokenizer

    # Creating the embedding matrix
    embedding = Embeddings(embed_path, embed_dim)
    embedding_matrix = embedding.create_embedding_matrix(
        tokenizer, len(tokenizer.word_counts))

    # Creating the padded input for the deep learning model
    if max_len is None:
        max_len = np.max([len(text.split()) for text in X_train])
    TextToTensor_instance = TextToTensor(tokenizer=tokenizer, max_len=max_len)
    X_train = TextToTensor_instance.string_to_tensor(X_train)

    # Creating the model
    rnn = RnnModel(embedding_matrix=embedding_matrix,
                   embedding_dim=embed_dim, max_len=max_len)
    rnn.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs)
    self.model = rnn.model

    # If X_test is provided we make predictions with the created model
    if len(X_test) > 0:
        # Apply the same cleaning (including stop word removal) as for the training texts
        X_test = [clean_text(text, stop_words=stop_words) for text in X_test]
        X_test = TextToTensor_instance.string_to_tensor(X_test)
        yhat = [x[0] for x in rnn.model.predict(X_test).tolist()]
        self.yhat = yhat

        # If true labels are provided we calculate the accuracy of the model
        if len(Y_test) > 0:
            self.acc = accuracy_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
            self.f1 = f1_score(Y_test, [1 if x > 0.5 else 0 for x in yhat])
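
# A small, self-contained sketch of the evaluation step used at the end of the
# constructor above: sigmoid outputs are thresholded at 0.5 and then passed to
# sklearn's accuracy_score / f1_score. The probability and label values below are
# made-up illustration data, not output from the trained model.
from sklearn.metrics import accuracy_score, f1_score

yhat_demo = [0.91, 0.12, 0.58, 0.34]   # hypothetical predicted probabilities
y_true_demo = [1, 0, 1, 1]             # hypothetical ground-truth labels
y_pred_demo = [1 if p > 0.5 else 0 for p in yhat_demo]
print(accuracy_score(y_true_demo, y_pred_demo))  # 0.75
print(f1_score(y_true_demo, y_pred_demo))        # 0.8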
def crawler(x):
    import os
    import sys
    import urllib.parse
    import urllib.request
    from bs4 import BeautifulSoup as bts
    from text_preprocessing import clean_text
    from lst_pick import lst_pick

    client_id = "IFvtovuLLdeQi6K6jywv"
    client_secret = "_51j6auaOC"
    encText = urllib.parse.quote(x)

    start = 1
    str_big = []
    while start < 1000:
        # Request the results in XML format, 30 items at a time, sorted by date
        url = "https://openapi.naver.com/v1/search/news.xml?query=" + encText + \
            "&display=30" + "&sort=date" + "&start=" + str(start)
        request = urllib.request.Request(url)
        request.add_header("X-Naver-Client-Id", client_id)
        request.add_header("X-Naver-Client-Secret", client_secret)
        response = urllib.request.urlopen(request)
        rescode = response.getcode()

        if rescode == 200:
            response_body = response.read()
            a = response_body.decode('utf-8')
            # Collect the news titles from the XML response
            html = bts(a, "html.parser")
            news_titles = html.find_all("title")
            for title in news_titles:
                title_str = str(title.string)
                str_big.append(title_str.strip())
        else:
            print("Error Code: " + str(rescode))
        start += 30

    # Clean each title and count word frequencies
    data = {}
    try:
        pre_str_big = [clean_text(i) for i in str_big]
        for row in pre_str_big:
            for text in row.split():
                data[text] = data.get(text, 0) + 1
    except Exception as e:
        print("Exception occurred:", e)
    return data
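
# A hedged usage sketch for crawler(): the query below is an illustrative placeholder,
# and running it requires valid Naver API credentials and network access. It shows how
# the returned word-frequency dict can be sorted to inspect the most frequent terms.
if __name__ == '__main__':
    word_counts = crawler('검색어')  # hypothetical search keyword ("search term")
    top_terms = sorted(word_counts.items(), key=lambda kv: kv[1], reverse=True)[:20]
    for term, count in top_terms:
        print(term, count)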
df.index = df.Date
df = df.sort_index()

# Create a series of held-out documents to be tested - this will be 3 months of documents from Jan 18 - March 18
df_heldout = df.truncate(before=datetime.date(year=2018, month=1, day=1),
                         after=datetime.date(year=2018, month=3, day=31))

# Cut the dataframe so that it only contains values from 2014 up to the end of 2017
df = df.truncate(before=datetime.date(year=2014, month=1, day=1),
                 after=datetime.date(year=2017, month=12, day=31))
df = df.drop(['DateTime', 'Date', 'Time'], axis=1)

# Lemmatize the data
lemmatized_data = []
for post in df.Content:
    lemmatized_data.append(clean_text(post))
print('posts have been lemmatized')

# Set up stopwords
STOPWORDS = stopwords.words('english')
STOPWORDS.extend(['from', 'subject', 're', 'edu', 'use'])
STOPWORDS.extend(['s', 'https', 'www', 'http', 'com', 't'])

# Build a Dictionary - association of word to numeric id
dictionary = corpora.Dictionary(lemmatized_data)
print(dictionary)
dictionary.save(
    'C:/Users/tliu/Documents/4YP/Outputs/dictionary_2014-2018.dict')

# Build the corpus
corpus = [dictionary.doc2bow(text) for text in lemmatized_data]
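
# A hedged follow-on sketch, assuming the dictionary and corpus above feed a gensim
# topic model: the LdaModel call and its num_topics / passes values are illustrative
# choices, not the settings used in this project.
from gensim.models import LdaModel

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=5, random_state=1)
for topic_id, topic in lda.print_topics(num_topics=10, num_words=5):
    print(topic_id, topic)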
from text_preprocessing import clean_text
from vocabulary import build_vocabulary, known_words, WORDS_NLTK
from spell_checker import SpellChecker

# Load training text
filepath = ''  # complete with the filepath where your data are
with open(filepath) as f:
    text = f.read()

# Clean text
text = clean_text(text)

# Build vocabulary
WORDS = build_vocabulary(text)
KNOWN_WORDS = known_words(list(WORDS.keys()))

# Instantiate SpellChecker
spellchecker = SpellChecker(words=WORDS, known_words=KNOWN_WORDS)
spellchecker.spell_checking('helo')
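
# A self-contained sketch of the idea behind a frequency-based spell checker of this
# kind (Norvig-style): generate edit-distance-1 candidates and keep the one with the
# highest count in the vocabulary. WORDS_DEMO, edits1 and correct are illustrative
# stand-ins; the project's SpellChecker may implement the lookup differently.
from collections import Counter

WORDS_DEMO = Counter({'hello': 120, 'help': 80, 'hero': 15})

def edits1(word):
    letters = 'abcdefghijklmnopqrstuvwxyz'
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [L + R[1:] for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
    inserts = [L + c + R for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def correct(word):
    candidates = [w for w in edits1(word) if w in WORDS_DEMO] or [word]
    return max(candidates, key=lambda w: WORDS_DEMO[w])

print(correct('helo'))  # -> 'hello'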
        seen.add(b)

print('Number of Documents (no repeats): ' + str(len(newarr)))

# Initialise a dataframe with dates
df = pd.DataFrame(newarr, columns=['DateTime', 'Content'])
df.to_pickle('C:/Users/tliu/Documents/4YP/Outputs/Pickles/Initial_DF_all.pkl')

# Sort the information
# df['DateTime'] = [datetime.datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in df['DateTime']]
# df['Date'] = [datetime.datetime.date(d) for d in df['DateTime']]
# df['Time'] = [datetime.datetime.time(d) for d in df['DateTime']]

posts = [i[1] for i in newarr]
tokenized_data = []
for text in posts:
    tokenized_data.append(clean_text(text))

# Build a Dictionary - association of word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Filter out the extremes
dictionary.filter_extremes(no_below=50)
print(dictionary)

# Save the dictionary
dictionary.save('C:/Users/tliu/Documents/4YP/Outputs/dictionary_all.dict')

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Save the corpus
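
# A hedged sketch for the trailing "Save the corpus" step, assuming corpora is gensim's
# corpora module (as used above) and that Matrix Market serialisation is the intended
# format; the output path is illustrative, chosen to mirror the dictionary path.
from gensim import corpora

corpora.MmCorpus.serialize('C:/Users/tliu/Documents/4YP/Outputs/corpus_all.mm', corpus)
# Reload later with: corpus = corpora.MmCorpus('C:/Users/tliu/Documents/4YP/Outputs/corpus_all.mm')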