def _prepare_data(self, data_path, test_size, random_state):
    """Loads the data and prepares it for training.

    Args:
        data_path (str): File path to the pickled data
        test_size (float): Fraction of the data to hold out for the test set
        random_state (int): Seed for the random train/test split
    """
    ct = CleanText()
    df = pd.read_pickle(data_path)
    df = df[df['issue'] != '']
    df['clean_text'] = df['ticket_text'].apply(lambda x: ct.prepare_text(x))
    weights = self._weights_helper(df['issue'])
    trainLines, trainLabels = df['clean_text'], df['issue']
    labels = pd.get_dummies(trainLabels)
    X_train, X_test, y_train, y_test = train_test_split(
        trainLines, labels, test_size=test_size,
        random_state=random_state, stratify=labels)
    encoder = EncodeText()
    length = encoder.max_length(X_train)
    vocab_size = encoder.vocab_size(X_train)
    X_train = encoder.encode_text(X_train)
    X_test = encoder.encode_text(X_test, test_data=True)
    self.weights = weights
    self.labels = labels
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test
    self.length = length
    self.vocab_size = vocab_size
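# The method above relies on a _weights_helper that is not shown in this snippet.
# A minimal sketch of what such a helper might do, using sklearn's balanced class
# weights (an assumption, not the original implementation):
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

def _weights_helper(self, issues):
    # Map each class index to a weight inversely proportional to its frequency,
    # in the dict form that Keras accepts as class_weight.
    classes = np.unique(issues)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=issues)
    return dict(zip(range(len(classes)), weights))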
def __init__(self):
    # Load the sentiment-analysis dataset and clean the review text.
    self.data = pd.read_csv('../data/sa_dataset.csv', index_col=0)
    self.data['Reviews'] = CleanText().fit_transform(self.data['Reviews'])
    self.X, self.y = self.classify_data(self.data)
    self.target_names = [
        '#GENERAL', '#FEATURE', '#PRICE', '#CAMERA', '#DESIGN#SCREEN'
    ]
    # Bag-of-words features fed into a logistic-regression classifier.
    self.pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('clf', LogisticRegression())
    ])
    self.model = []
    # Bookkeeping lists, presumably accuracy, precision, recall and F1.
    self.a = []
    self.p = []
    self.r = []
    self.f = []
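# Hypothetical companion method (not part of the original class): fit the pipeline on a
# train/test split and append accuracy, precision, recall and F1 to the bookkeeping lists.
def evaluate(self, test_size=0.2, random_state=0):
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

    X_train, X_test, y_train, y_test = train_test_split(
        self.X, self.y, test_size=test_size, random_state=random_state)
    fitted = self.pipeline.fit(X_train, y_train)
    pred = fitted.predict(X_test)
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, pred, average='macro')
    self.model.append(fitted)
    self.a.append(accuracy_score(y_test, pred))
    self.p.append(precision)
    self.r.append(recall)
    self.f.append(f1)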
def __init__(self):
    # Load the aspect-classification dataset and build a single multi-label target column.
    self.data = pd.read_csv('../data/ac_dataset.csv', index_col=0)
    self.data['Target'] = self.convert_targets(self.data.drop(['Reviews'], axis=1))
    self.X = self.data['Reviews']
    self.y = self.data['Target']
    self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
        self.X, self.y, train_size=0.8, random_state=0)
    # Text cleaning, uni/bi-gram counts, and a one-vs-rest logistic regression.
    self.pipeline = Pipeline([
        ('preprocess', CleanText()),
        ('vect', CountVectorizer(ngram_range=(1, 2))),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])
    self.model = None
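# Hypothetical training step (not in the original snippet): fit the multi-label pipeline
# on the training split and check mean accuracy on the held-out validation data.
def train(self):
    self.model = self.pipeline.fit(self.X_train, self.y_train)
    print('validation accuracy:', self.model.score(self.X_val, self.y_val))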
from encode_text import EncodeText
from clean_text import CleanText
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, MaxPooling1D, Conv1D, concatenate
from tensorflow.keras import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from numpy import array
import pandas as pd

# Load the ticket data and keep only the three issue types of interest.
df = pd.read_pickle('./data.pkl')
ct = CleanText()
encoder = EncodeText()
df = df[df.issue.str.contains('cant_add_bank|refund_e_|transactions_not_importing')]
df['clean_text'] = df['ticket_text'].apply(lambda x: ct.prepare_text(x))

# One-hot encode the labels and build a stratified train/test split.
trainLines, trainLabels = df['clean_text'], df['issue']
lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)
X_train, X_test, y_train, y_test = train_test_split(
    trainLines, transformed_labels, test_size=.2, random_state=42,
    stratify=transformed_labels)

length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
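# The snippet above stops after encoding X_train. Mirroring the _prepare_data method
# earlier on this page, a likely next step (an assumption, not shown in the original)
# is to encode the held-out split with the encoder fitted on the training data:
X_test = encoder.encode_text(X_test, test_data=True)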
class Tokenizer:
    def __init__(self, n_gram, tokenizer, stop_en=None, stop_th=None, keyword=None):
        import re
        import os
        from nltk.tokenize import TreebankWordTokenizer
        from clean_text import CleanText

        self.cleaner = CleanText(stop_en=stop_en, stop_th=stop_th, keyword=keyword)
        # Thai sample sentence used when tokenize() is called with '-test'.
        self.test_text = 'ตัวอย่างความต้องการใช้ตัวอย่างความต้องการลีนุ๊กซ์การใช้ยากลำบาก'
        self.eng_tokenizer = TreebankWordTokenizer()
        self.n_gram = n_gram
        self.tokenizer = tokenizer
        # Detects a lowercase letter immediately followed by an uppercase one, i.e.
        # two sentences collided without whitespace (the original pattern had a stray ']').
        self.pattern_sentence_collide = re.compile('[a-z][A-Z]')
        self.pattern_thai_char = re.compile(u'[\u0e00-\u0e7f]')
        if keyword:
            keyword_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dict', keyword)
            with open(keyword_path, 'rt', encoding='utf-8') as keyword_file:
                self.keyword = set(keyword_file.read().split('\n'))
        else:
            self.keyword = set()

    # Renamed from `tokenizer` to `tokenize`: the instance attribute assigned in __init__
    # would otherwise shadow the method and make it unreachable.
    def tokenize(self, text=None, cleaning=False):

        def n_gram_compile(tokens, n):
            # Concatenate runs of n Thai tokens into a single n-gram token.
            tokens = tokens[:]
            n_tokens = []
            if n <= 1:
                return tokens
            for j, token in enumerate(tokens[:-(n - 1)]):
                new_token = ''
                for word in tokens[j:j + n]:
                    if self.pattern_thai_char.search(word) and len(word) > 1:
                        new_token += word
                    else:
                        new_token = ''
                        break
                if new_token:
                    n_tokens.extend([new_token])
            return n_tokens

        def n_grams_compile(tokens, n):
            # Expand the token list with every n-gram order from 2 up to n.
            if n < 2:
                return tokens
            n_tokens = []
            for j in range(2, n + 1):
                n_tokens.extend(n_gram_compile(tokens, j))
            n_tokens = tokens + n_tokens
            return n_tokens

        if not text:
            return
        elif text == '-test':
            text = self.test_text
        if cleaning:
            text = self.cleaner.clean_text(text)
        print(text)
        text_split = text.split('|')
        print(text_split)

        # First pass: split chunks where two sentences collided (e.g. 'endStart'),
        # unless the chunk is a known keyword.
        first_pass = []
        for i, item in enumerate(text_split):
            if self.pattern_sentence_collide.search(item) and item not in self.keyword:
                c_text = self.pattern_sentence_collide.search(item)
                first_pass.extend([c_text.string[:c_text.span()[0] + 1],
                                   c_text.string[c_text.span()[1] - 1:]])
            else:
                first_pass.append(item)

        # Second pass: run Thai chunks through the supplied tokenizer and expand n-grams;
        # everything else is lower-cased as-is.
        second_pass = []
        for i, chunk in enumerate(first_pass):
            if self.pattern_thai_char.search(chunk) and len(chunk) > 1:
                new_chunk = self.tokenizer(chunk)
                new_chunk = n_grams_compile(new_chunk, self.n_gram)
                second_pass.extend(new_chunk)
            else:
                second_pass.append(chunk.lower())
        return second_pass
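# Minimal usage sketch for the Tokenizer class above, assuming the project's clean_text
# module is importable. The class expects a callable word-level Thai tokenizer;
# pythainlp's word_tokenize is used here purely as an assumed stand-in, it is not
# referenced anywhere in the original code.
from pythainlp.tokenize import word_tokenize

tok = Tokenizer(n_gram=2, tokenizer=word_tokenize)
# '-test' makes the class tokenize its built-in Thai sample sentence.
print(tok.tokenize('-test'))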
import pandas as pd  # needed for pd.read_pickle below
from clean_text import CleanText
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
import numpy as np
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.layers import Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import AUC
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight

df = pd.read_pickle('./data.pkl')
clean = CleanText()
df['clean_text'] = df['ticket_text'].apply(lambda x: clean.prepare_text(x))


def create_tokenizer(lines):
    # Fit a Keras tokenizer on the training lines.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def max_length(lines):
    # Longest document, in tokens, used as the padding length.
    return max([len(s.split()) for s in lines])


def encode_text(tokenizer, lines, length):
    # Body missing in the original snippet; the usual completion is to
    # integer-encode the lines and pad them to a fixed length.
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')
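# A short, assumed usage of the three helpers above (the original snippet is cut off
# before they are called): fit the tokenizer on the cleaned text, then encode it.
lines = df['clean_text']
tokenizer = create_tokenizer(lines)
length = max_length(lines)
vocab_size = len(tokenizer.word_index) + 1
X = encode_text(tokenizer, lines, length)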
from gensim.utils import lemmatize
from pattern.en import parse
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from clean_text import CleanText
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
from time import time

ct = CleanText()
wnl = WordNetLemmatizer()


def gensimTest(text):
    # Lemmatize with gensim (which wraps pattern) and time the run.
    print('gensim')
    start = time()
    lemmas = lemmatize(text)
    for lemma in lemmas:
        # gensim returns utf8-encoded byte strings such as b'word/NN'
        lemma = lemma.decode('utf-8').split('/')
        print(lemma[0], lemma[1])
    end = time()
    print('gensim time:', (end - start))
    print("********************************")


def replacePos(pos):
    # Map a Penn Treebank tag to the WordNet POS set (n, a, v, r); default to noun.
    pos = pos[0].lower().replace('j', 'a')
    if pos in ['n', 'a', 'v', 'r']:
        return pos
    return 'n'


def nltkTest(text):
    # Body missing in the original snippet; a likely completion mirroring gensimTest:
    # tokenize, POS-tag, then lemmatize with WordNet using the mapped tags.
    print('nltk')
    start = time()
    for word, pos in pos_tag(word_tokenize(text)):
        print(wnl.lemmatize(word, pos=replacePos(pos)), pos)
    end = time()
    print('nltk time:', (end - start))
    print("********************************")
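# The imports above pull in ThreadPoolExecutor and cpu_count, but the snippet never
# reaches them. A minimal, assumed usage would be to fan the benchmark out over a
# list of documents (the `documents` list here is purely illustrative):
documents = ['first example text to lemmatize', 'second example text to lemmatize']
with ThreadPoolExecutor(max_workers=cpu_count()) as executor:
    list(executor.map(nltkTest, documents))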
import pandas as pd
import numpy as np
from clean_text import CleanText
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import keras
from keras.models import Model, Sequential
from keras.layers import LSTM
from keras.layers import Flatten, Dense, Dropout, Activation, Input, BatchNormalization
from keras.optimizers import Adam
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Load the airline tweets and clean the raw text.
tweets = pd.read_csv('Tweets.csv')
tweets = tweets[['text', 'airline_sentiment']]
clean = CleanText()
tweets['text'] = tweets['text'].apply(lambda x: clean.clean(x))
docs = tweets['text']
labels = tweets['airline_sentiment']

# Encode the labels: Neutral: 1, Positive: 2, Negative: 0
le = LabelEncoder()
labels_en = le.fit_transform(labels)
labels_en = keras.utils.to_categorical(np.asarray(labels_en))

# Tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1

# Encode the documents
encoded_docs = t.texts_to_sequences(docs)
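# The snippet stops after integer-encoding. An assumed continuation, using the
# pad_sequences and train_test_split imports already present: pad every tweet to a
# common length and split into train and test sets. The maxlen of 40 mirrors the
# padding length used in the Streamlit snippet further down, not this snippet itself.
max_len = 40
padded_docs = pad_sequences(encoded_docs, maxlen=max_len, padding='post')
X_train, X_test, y_train, y_test = train_test_split(
    padded_docs, labels_en, test_size=0.2, random_state=42)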
# This excerpt starts after the raw tweet CSV has been read into `tweets` (not shown).
tweets.columns = ['sentiment', 'id', 'time', 'query', 'name', 'text']
tweets = tweets[['text', 'sentiment']]
tweets = tweets.sample(frac=0.1, random_state=8)

'''tweets_df = pd.read_csv('data/tweets_1600000.csv', encoding = 'latin')
tweets_df.columns = ['sentiment','id','time','query','name','tweet']
tweets_df = tweets_df[['tweet','sentiment']]
tweets_df['clean_tweet'] = clean.clean(tweets_df['tweet'])
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].apply(lambda x: clean.tokenize(x))
docs2 = tweets_df['clean_tweet']
t2 = Tokenizer()
t2.fit_on_texts(docs2)
vocab_size2 = len(t2.word_index) + 1
#encode the documents
encoded_docs2 = t2.texts_to_sequences(docs2)'''

clean = CleanText()
# clean() removes URLs, emoticons and hashtags
tweets['text'] = clean.clean(tweets['text'])
# remove punctuation and stopwords, lemmatize, and split the sentences into tokens
tweets['text'] = tweets['text'].apply(lambda x: clean.tokenize(x))
docs = tweets['text']
labels = tweets['sentiment']

# Encode the labels: Negative: 0, Positive: 1
le = LabelEncoder()
labels_en = le.fit_transform(labels)
labels_en = keras.utils.to_categorical(np.asarray(labels_en))

# Tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
__author__ = "Ciprian-Octavian Truică" __copyright__ = "Copyright 2015, University Politehnica of Bucharest" __license__ = "GNU GPL" __version__ = "0.1" __email__ = "*****@*****.**" __status__ = "Production" from clean_text import CleanText from pattern.fr import parse as parseFR from pattern.en import parse as parseEN from nltk import word_tokenize from nltk import pos_tag from nltk.stem import WordNetLemmatizer ct = CleanText() wnl = WordNetLemmatizer() #TO_DO modify this class to accept french also class LemmatizeText: class Word: def __init__(self): self.word = "" self.wtype = [] self.count = 0 self.tf = 0.0 #mode: 0 - fast but not accurate, 1 - slow but accurate (works only for english) def __init__(self, rawText, language='EN', mode=0): self.wordList = []
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from encode_text import EncodeText
from clean_text import CleanText

# Load the held-out test set and the fitted encoder.
df = pd.read_pickle('./test_data.pkl')
X_test, y_test = df['clean_text'], df.loc[:, df.columns != 'clean_text']
encoder = EncodeText()
encoder.load_encoder('./encoder_files/encoder.pkl')
encoder.load_encoder_variables('./encoder_files/encoder_variables.json')
X_test = encoder.encode_text(X_test, test_data=True)

# Load the three trained classifiers.
cnn = load_model('./model_files/cnn_classification_model.h5')
rnn = load_model('./model_files/rnn_classification_model.h5')
hybrid = load_model('./model_files/hybrid_attention_classification_model.h5')

# Score a single raw ticket with each model and map the argmax back to a label column.
clean = CleanText()
test_text = ['''I cant get my morgan stanley account to connect to EveryDollar. If I cant get it to connect, Im going to need to get a refund. Its the only value I get from the app''']
tt = [clean.prepare_text(t) for t in test_text]
tt = encoder.encode_text(tt, test_data=True)

cnn_res = cnn.predict(tt)
print(y_test.columns[np.argmax(cnn_res)])

rnn_res = rnn.predict(tt)
print(y_test.columns[np.argmax(rnn_res)])

hybrid_res = hybrid.predict(tt)
print(y_test.columns[np.argmax(hybrid_res)])
st.title("Sentiment Analysis of Tweets") date = st.sidebar.date_input('Enter Date Range:',[datetime.date(2019, 7, 6), datetime.date(2019, 7, 8)]) limit = st.sidebar.slider('Enter number of Tweets to scrape:',0,1000) lang = 'english' if st.button('Scrape Tweets'): with st.spinner('Scraping Tweets...'): tweets = query_tweets('videogames', begindate = date[0], enddate = date[1], limit = limit, lang = lang) df = pd.DataFrame(t.__dict__ for t in tweets) df = df[['timestamp','text','likes','retweets']] df = df.drop_duplicates(subset=['likes']) clean = CleanText() df['clean_text'] = clean.clean(df['text']) df['clean_text'] = df['clean_text'].apply(lambda x: clean.tokenize(x)) docs = df['clean_text'] #tokenizer t = Tokenizer() t.fit_on_texts(docs) vocab_size = len(t.word_index) + 1 #encode the documents encoded_docs = t.texts_to_sequences(docs) #pad docs to max length padded_docs = pad_sequences(encoded_docs, maxlen = 40, padding = 'post')