    def _prepare_data(self, data_path, test_size, random_state):
        """Loads data and prepares for training

        Args:
            data_path (str): File path to the data
            test_size (float): Fraction of the data (0-1) to hold out for the test set
            random_state (int): Seed for randomly splitting data for train and test sets
        """
        ct = CleanText()

        df = pd.read_pickle(data_path)
        df = df[df['issue'] != '']

        df['clean_text'] = df['ticket_text'].apply(
            lambda x: ct.prepare_text(x))

        weights = self._weights_helper(df['issue'])

        trainLines, trainLabels = df['clean_text'], df['issue']
        labels = pd.get_dummies(trainLabels)

        X_train, X_test, y_train, y_test = train_test_split(
            trainLines,
            labels,
            test_size=test_size,
            random_state=random_state,
            stratify=labels)

        encoder = EncodeText()
        length = encoder.max_length(X_train)
        vocab_size = encoder.vocab_size(X_train)
        X_train = encoder.encode_text(X_train)
        X_test = encoder.encode_text(X_test, test_data=True)

        self.weights = weights
        self.labels = labels
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.length = length
        self.vocab_size = vocab_size
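
# A minimal usage sketch for _prepare_data (assumptions: the surrounding class is
# called ClassificationModel here, and a pickled DataFrame with 'ticket_text' and
# 'issue' columns lives at './data.pkl'):
#
#   model = ClassificationModel()
#   model._prepare_data(data_path='./data.pkl', test_size=0.2, random_state=42)
#   print(model.vocab_size, model.length, model.X_train.shape, model.y_train.shape)
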
    def __init__(self):
        self.data = pd.read_csv('../data/sa_dataset.csv', index_col=0)
        self.data['Reviews'] = CleanText().fit_transform(self.data['Reviews'])
        self.X, self.y = self.classify_data(self.data)
        self.target_names = [
            '#GENERAL', '#FEATURE', '#PRICE', '#CAMERA', '#DESIGN#SCREEN'
        ]
        self.pipeline = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, 1))),
            ('clf', LogisticRegression())
        ])
        self.model = []

        self.a = []
        self.p = []
        self.r = []
        self.f = []
    def __init__(self):
        self.data = pd.read_csv('../data/ac_dataset.csv', index_col=0)
        self.data['Target'] = self.convert_targets(
            self.data.drop(['Reviews'], axis=1))

        self.X = self.data['Reviews']
        self.y = self.data['Target']

        self.X_train, self.X_val, self.y_train, self.y_val = train_test_split(
            self.X, self.y, train_size=0.8, random_state=0)

        self.pipeline = Pipeline([
            ('preprocess', CleanText()),
            ('vect', CountVectorizer(ngram_range=(1, 2))),
            ('clf', OneVsRestClassifier(LogisticRegression()))
        ])
        self.model = None
from encode_text import EncodeText
from clean_text import CleanText
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, MaxPooling1D, Conv1D, concatenate
from tensorflow.keras import metrics
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from numpy import array
import pandas as pd

df = pd.read_pickle('./data.pkl')
ct = CleanText()
encoder = EncodeText()

df = df[df.issue.str.contains('cant_add_bank|refund_e_|transactions_not_importing')]

df['clean_text'] = df['ticket_text'].apply(lambda x: ct.prepare_text(x))

trainLines, trainLabels = df['clean_text'], df['issue']

lb = LabelEncoder()
transformed_labels = lb.fit_transform(trainLabels)
transformed_labels = to_categorical(transformed_labels)

X_train, X_test, y_train, y_test = train_test_split(
    trainLines, transformed_labels, test_size=0.2, random_state=42, stratify=transformed_labels)


length = encoder.max_length(X_train)
vocab_size = encoder.vocab_size(X_train)
X_train = encoder.encode_text(X_train)
class Tokenizer:

    def __init__(self, n_gram, tokenizer, stop_en=None, stop_th=None, keyword=None):

        import re
        import os
        from nltk.tokenize import TreebankWordTokenizer
        from clean_text import CleanText

        self.cleaner = CleanText(stop_en=stop_en, stop_th=stop_th, keyword=keyword)
        self.test_text = 'ตัวอย่างความต้องการใช้ตัวอย่างความต้องการลีนุ๊กซ์การใช้ยากลำบาก'
        self.eng_tokenizer = TreebankWordTokenizer()
        self.n_gram = n_gram
        self.tokenizer = tokenizer
        self.pattern_sentence_collide = re.compile('[a-z][A-Z]')
        self.pattern_thai_char = re.compile(u'[\u0e00-\u0e7f]')
        if keyword:
            with open(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'dict', keyword), 'rt', encoding='utf-8') as keyword_file:
                self.keyword = set(keyword_file.read().split('\n'))
        else:
            self.keyword = set()

    def tokenize(self, text=None, cleaning=False):
        # Split pipe-delimited text into chunks, break Thai chunks into tokens with the
        # injected word tokenizer, and expand those tokens into n-grams up to self.n_gram.
        # (Named ``tokenize`` so it does not shadow the ``self.tokenizer`` attribute.)

        def n_gram_compile(tokens, n):

            tokens = tokens[:]
            n_tokens = []
            if n <= 1:
                return tokens
            for j, token in enumerate(tokens[:-(n - 1)]):
                new_token = ''
                for word in tokens[j:j + n]:
                    if self.pattern_thai_char.search(word) and len(word) > 1:
                        new_token += word
                    else:
                        new_token = ''
                        break
                if new_token:
                    n_tokens.extend([new_token])
            return n_tokens

        def n_grams_compile(tokens, n):

            if n < 2:
                return tokens
            n_tokens = []
            for j in range(2, n + 1):
                n_tokens.extend(n_gram_compile(tokens, j))
            n_tokens = tokens + n_tokens
            return n_tokens

        if not text:
            return
        elif text == '-test':
            text = self.test_text

        if cleaning:
            text = self.cleaner.clean_text(text)
        print(text)
        text_split = text.split('|')
        print(text_split)

        first_pass = []
        for item in text_split:
            if self.pattern_sentence_collide.search(item) and item not in self.keyword:
                c_text = self.pattern_sentence_collide.search(item)
                first_pass.extend([c_text.string[:c_text.span()[0]+1], c_text.string[c_text.span()[1]-1:]])
            else:
                first_pass.append(item)
        second_pass = []
        for chunk in first_pass:
            if self.pattern_thai_char.search(chunk) and len(chunk) > 1:
                new_chunk = self.tokenizer(chunk)
                new_chunk = n_grams_compile(new_chunk, self.n_gram)
                second_pass.extend(new_chunk)
            else:
                second_pass.append(chunk.lower())

        return second_pass
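
# A minimal usage sketch (assumption: pythainlp's word_tokenize as the injected
# Thai word tokenizer; the stop-word and keyword files stay optional):
#
#   from pythainlp.tokenize import word_tokenize
#   tk = Tokenizer(n_gram=2, tokenizer=word_tokenize)
#   print(tk.tokenize('-test'))
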
from clean_text import CleanText
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from numpy import array
import numpy as np
import pandas as pd
from tensorflow.keras.layers import Input, Dense, Flatten, Dropout, Embedding, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.layers import Bidirectional, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.metrics import AUC
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight

df = pd.read_pickle('./data.pkl')

clean = CleanText()

df['clean_text'] = df['ticket_text'].apply(lambda x: clean.prepare_text(x))


def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def max_length(lines):
    return max([len(s.split()) for s in lines])


def encode_text(tokenizer, lines, length):
    # Integer-encode the texts and pad them to a fixed length; a minimal sketch that
    # assumes the Keras Tokenizer produced by create_tokenizer above.
    encoded = tokenizer.texts_to_sequences(lines)
    return pad_sequences(encoded, maxlen=length, padding='post')

from gensim.utils import lemmatize
from pattern.en import parse
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from clean_text import CleanText
from multiprocessing import cpu_count
from concurrent.futures import ThreadPoolExecutor
from time import time

ct = CleanText()
wnl = WordNetLemmatizer()

def gensimTest(text):
    print('gensim')
    start = time()
    lemmas = lemmatize(text)
    for lemma in lemmas:
        # gensim's lemmatize() yields utf-8 byte strings such as b'word/NN'
        word, pos = lemma.decode('utf-8').split('/')
        print(word, pos)
    end = time()
    print('gensim time:', end - start)
    print("********************************")

def replacePos(pos):
    pos = pos[0].lower().replace('j', 'a')
    if pos in ['n', 'a', 'v', 'r']:
        return pos
    return 'n'
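
# replacePos maps a Penn Treebank tag to a WordNet POS code. For example:
# replacePos('JJ') -> 'a' (adjective), replacePos('VBD') -> 'v', replacePos('RB') -> 'r',
# and any unrecognised tag such as 'DT' falls back to 'n'.
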

def nltkTest(text):
    # A minimal sketch mirroring gensimTest: tokenize, POS-tag, then lemmatize
    # with WordNet, using replacePos to map the tag set.
    print('nltk')
    start = time()
    for word, pos in pos_tag(word_tokenize(text)):
        print(wnl.lemmatize(word, pos=replacePos(pos)), pos)
    end = time()
    print('nltk time:', end - start)
    print("********************************")

import numpy as np
import pandas as pd

from clean_text import CleanText
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import keras
from keras.models import Model, Sequential
from keras.layers import LSTM
from keras.layers import Flatten, Dense, Dropout, Activation, Input, BatchNormalization
from keras.optimizers import Adam
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tweets = pd.read_csv('Tweets.csv')
tweets = tweets[['text', 'airline_sentiment']]

clean = CleanText()

tweets['text'] = tweets['text'].apply(lambda x: clean.clean(x))

docs = tweets['text']
labels = tweets['airline_sentiment']
le = LabelEncoder()
labels_en = le.fit_transform(labels)  #Neutral: 1, Positive: 2, Negative: 0
labels_en = keras.utils.to_categorical(np.asarray(labels_en))

#tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
vocab_size = len(t.word_index) + 1
#encode the documents
encoded_docs = t.texts_to_sequences(docs)
tweets.columns = ['sentiment', 'id', 'time', 'query', 'name', 'text']
tweets = tweets[['text', 'sentiment']]
tweets = tweets.sample(frac=0.1, random_state=8)
'''tweets_df = pd.read_csv('data/tweets_1600000.csv', encoding = 'latin')
tweets_df.columns = ['sentiment','id','time','query','name','tweet']
tweets_df = tweets_df[['tweet','sentiment']]
tweets_df['clean_tweet'] = clean.clean(tweets_df['tweet'])
tweets_df['clean_tweet'] = tweets_df['clean_tweet'].apply(lambda x: clean.tokenize(x))
docs2 = tweets_df['clean_tweet']
t2 = Tokenizer()
t2.fit_on_texts(docs2)
vocab_size2 = len(t2.word_index) + 1
#encode the documents
encoded_docs2 = t2.texts_to_sequences(docs2)'''

clean = CleanText()

#clean() removes urls, emoticons and hashtags
tweets['text'] = clean.clean(tweets['text'])
#remove punctuations, stopwords, lemmatize and splits the sentences into tokens
tweets['text'] = tweets['text'].apply(lambda x: clean.tokenize(x))
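
# A hedged illustration of the two steps above (the exact output depends on
# CleanText's rules, so treat these values as an assumption):
#
#   raw:        "Loving the new update!! http://t.co/abc #happy"
#   clean():    "Loving the new update!!"
#   tokenize(): ['love', 'new', 'update']
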

docs = tweets['text']
labels = tweets['sentiment']
le = LabelEncoder()
labels_en = le.fit_transform(labels)  #Negative: 0, Positive: 1
labels_en = keras.utils.to_categorical(np.asarray(labels_en))

#tokenizer
t = Tokenizer()
t.fit_on_texts(docs)
__author__ = "Ciprian-Octavian Truică"
__copyright__ = "Copyright 2015, University Politehnica of Bucharest"
__license__ = "GNU GPL"
__version__ = "0.1"
__email__ = "*****@*****.**"
__status__ = "Production"

from clean_text import CleanText
from pattern.fr import parse as parseFR
from pattern.en import parse as parseEN
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

ct = CleanText()
wnl = WordNetLemmatizer()


# TODO: modify this class to also accept French
class LemmatizeText:
    class Word:
        def __init__(self):
            self.word = ""
            self.wtype = []
            self.count = 0
            self.tf = 0.0

    #mode: 0 - fast but not accurate, 1 - slow but accurate (works only for english)
    def __init__(self, rawText, language='EN', mode=0):
        self.wordList = []
import numpy as np
import pandas as pd
from tensorflow.keras.models import load_model
from clean_text import CleanText
from encode_text import EncodeText

df = pd.read_pickle('./test_data.pkl')

X_test, y_test = df['clean_text'], df.loc[:, df.columns != 'clean_text']

encoder = EncodeText()

encoder.load_encoder('./encoder_files/encoder.pkl')
encoder.load_encoder_variables('./encoder_files/encoder_variables.json')

X_test = encoder.encode_text(X_test, test_data=True)

cnn = load_model('./model_files/cnn_classification_model.h5')
rnn = load_model('./model_files/rnn_classification_model.h5')
hybrid = load_model('./model_files/hybrid_attention_classification_model.h5')

clean = CleanText()

test_text = ['''I cant get my morgan stanley account to connect to EveryDollar. If I cant get it to connect, 
    Im going to need to get a refund. Its the only value I get from the app''']

tt = [clean.prepare_text(t) for t in test_text]
tt = encoder.encode_text(tt, test_data=True)

cnn_res = cnn.predict(tt)
print(y_test.columns[np.argmax(cnn_res)])

rnn_res = rnn.predict(tt)
print(y_test.columns[np.argmax(rnn_res)])

hybrid_res = hybrid.predict(tt)
print(y_test.columns[np.argmax(hybrid_res)])
    st.title("Sentiment Analysis of Tweets")    
    date = st.sidebar.date_input('Enter Date Range:',[datetime.date(2019, 7, 6), datetime.date(2019, 7, 8)])
    limit = st.sidebar.slider('Enter number of Tweets to scrape:',0,1000)
    lang = 'english'
    
    
    if st.button('Scrape Tweets'):
        with st.spinner('Scraping Tweets...'):
            tweets = query_tweets('videogames', begindate = date[0], enddate = date[1], limit = limit, lang = lang)
        
        
        df = pd.DataFrame(t.__dict__ for t in tweets)
        df = df[['timestamp','text','likes','retweets']]
        df = df.drop_duplicates(subset=['likes'])
        clean = CleanText()
        df['clean_text'] = clean.clean(df['text']) 
        df['clean_text'] = df['clean_text'].apply(lambda x: clean.tokenize(x)) 
        
        docs = df['clean_text']
        
        #tokenizer
        t = Tokenizer()
        t.fit_on_texts(docs)
        vocab_size = len(t.word_index) + 1
        
        #encode the documents
        encoded_docs = t.texts_to_sequences(docs)
        
        #pad docs to max length
        padded_docs = pad_sequences(encoded_docs, maxlen = 40, padding = 'post')