import os

import numpy
from nltk.tokenize.casual import casual_tokenize
from sklearn.preprocessing import LabelBinarizer

# project-local helpers assumed to be provided elsewhere in the package:
# load_word_vectors, createCategories2, read_xml2_train3, vectorize


def data():
    # get current directory
    path = os.getcwd()
    # get one directory up
    path = os.path.dirname(path)

    WORD_VECTORS = "../embeddings/word2vec.txt"
    WORD_VECTORS_DIMS = 300

    TRAIN_DATA = path + "/datasets/ABSA16_Restaurants_Train_SB1_v2.xml"
    VAL_DATA = path + "/datasets/EN_REST_SB1_TEST.xml"
    max_length = 80
    # load word embeddings
    print("loading word embeddings...")
    word2idx, idx2word, embeddings = load_word_vectors(WORD_VECTORS,
                                                       WORD_VECTORS_DIMS)
    print("loading categories")
    entity_attribute_pairs = createCategories2()
    # load raw data
    print("loading datasets...")
    train_review, train_ent_attrib = \
        read_xml2_train3(entity_attribute_pairs, TRAIN_DATA)

    gold_review, gold_ent_attrib = \
        read_xml2_train3(entity_attribute_pairs, VAL_DATA)

    y_train = train_ent_attrib
    y_test = gold_ent_attrib
    print("Tokenizing...")
    # nltk tokenizer
    X_train = [
        casual_tokenize(x,
                        preserve_case=False,
                        reduce_len=True,
                        strip_handles=False) for x in train_review
    ]
    X_test = [
        casual_tokenize(x,
                        preserve_case=False,
                        reduce_len=True,
                        strip_handles=False) for x in gold_review
    ]
    print("Vectorizing...")
    X_train = numpy.array(
        [vectorize(x, word2idx, max_length) for x in X_train])
    X_test = numpy.array([vectorize(x, word2idx, max_length) for x in X_test])
    print("Turning test and train data to numpy arrays")
    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train)
    X_test = numpy.array(X_test)
    y_test = numpy.array(y_test)
    label_encoder = LabelBinarizer()
    y_train_res = label_encoder.fit_transform(y_train)
    y_test = label_encoder.fit_transform(y_test)
    # Everything to numpy
    X_train = numpy.array(X_train)
    y_train = numpy.array(y_train_res)
    y_test = numpy.array(y_test)
    return embeddings, X_train, X_test, y_train, y_test, max_length
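vectorize above is a project helper that is not shown; the following is a minimal sketch of what such an index-and-pad step typically looks like (vectorize_sketch, its defaults and the toy vocabulary are assumptions, not the original implementation):

import numpy
from nltk.tokenize.casual import casual_tokenize


def vectorize_sketch(tokens, word2idx, max_length, unk_idx=0):
    # map tokens to vocabulary indices, truncate to max_length, pad with zeros
    ids = [word2idx.get(tok, unk_idx) for tok in tokens[:max_length]]
    ids += [0] * (max_length - len(ids))
    return numpy.array(ids)


tokens = casual_tokenize("The fish was greattttt!!!",
                         preserve_case=False,
                         reduce_len=True)
print(vectorize_sketch(tokens, {"the": 1, "fish": 2, "was": 3}, max_length=8))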
Example 2
    def emotions_with_decay(self, s, N, M, D):
        # (s)tring, (N)umber of slices,
        # slices in (M)emory, (D)ecay factor
        tokens = casual_tokenize(s)
        len_tokens = len(tokens)
        base_window = dict(zip(self.emotions, [0] * len(self.emotions)))
        summary = [base_window] * N
        priors = [base_window] * M
        carry = 0
        done = 0
        for i in range(N):
            w_size = int((len_tokens + carry) / N)
            # carry tokens not yet assigned to a slice over to the next one
            carry = (len_tokens + carry) - w_size * N
            w = tokens[done:min(done + w_size, len_tokens)]
            done += w_size
            summ = self.do_sentiments(w)
            # add decayed values of the prior windows to this slice's summary
            summ_mem = summ.copy()
            for j in range(M):
                summ_mem = self.add_dict(summ_mem, priors[j])
            summary[i] = summ_mem
            # shift prior windows
            priors.pop(0)
            priors.append(summ)
            # decay prior windows
            priors = self.decay(priors, D)
        return pd.DataFrame(summary)
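add_dict and decay are methods of the same class and are not shown above; a minimal sketch of what they plausibly do, treated purely as an assumption (key-wise addition and scaling by the decay factor D):

    def add_dict(self, a, b):
        # key-wise sum of two emotion-score dicts (assumed behaviour)
        return {emo: a.get(emo, 0) + b.get(emo, 0) for emo in set(a) | set(b)}

    def decay(self, priors, D):
        # scale every score in every prior window by the decay factor D (assumed behaviour)
        return [{emo: score * D for emo, score in window.items()}
                for window in priors]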
Example 3
def process_text(text, adds=None, removals=None):
    if adds is None:
        adds = set([])
    if removals is None:
        removals = set([])
    words = casual_tokenize(text, preserve_case=False)
    filtered = set([])
    go_words = set([])
    normed_go_words = set([])
    for x in words:
        if x in sw:
            filtered.add(x)
        else:
            go_words.add(x)

    for x in go_words:
        nw = stemmer.stem(x)

        stem_record.setdefault(nw, set([]))
        stem_record[nw].add(x)

        if nw in sw:
            filtered.add(nw)
        else:
            normed_go_words.add(nw)
    normed_go_words = (normed_go_words | adds) - removals
    return normed_go_words
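process_text relies on the module-level names sw, stemmer and stem_record; a minimal setup sketch under the assumption that they are the NLTK English stopwords, a Snowball stemmer and a plain dict (requires nltk.download('stopwords')):

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize.casual import casual_tokenize

sw = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')
stem_record = {}

print(process_text("The striped bats were hanging on their feet"))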
Example 4
def getSentencePositivity(sentence):
    """
    Returns positivity of the given sentence from -1.0 (very negative) to 1.0 (very positive).
    May return None if no classifier exists to perform sentiment analysis.
    """
    classifier = __getClassifier()
    if classifier is None:
        return None

    #prepare for classifier
    tokenized = list(
        map(lambda x: 'I' if x == 'i' else x, casual_tokenize(sentence)))
    custom_tokens = __remove_noise(tokenized)

    #classify and get probability
    probdist = classifier.prob_classify(
        dict([token, True] for token in custom_tokens))
    pos = probdist.prob('Positive')
    normalized_pos = pos * 2 - 1

    #handle negation
    negation_count = len(
        list(
            filter(lambda x: x[1] == 'RB' and x[0] in ("not", "n't"),
                   pos_tag(tokenized))))
    # invert with lower magnitude if negation is detected in the sentence
    normalized_pos *= (-0.2) ** negation_count

    #return result
    return normalized_pos
Example 5

    def tokenize(self,
                 text,
                 a_preserve_case=True,
                 a_reduce_len=False,
                 a_strip_handles=False):

        return casual_tokenize(text,
                               preserve_case=a_preserve_case,
                               reduce_len=a_reduce_len,
                               strip_handles=a_strip_handles)
Example 6
def tokenize(text):
    """Use Twitter aware casual tokenizer followed by WordNetLemmatizer on extracted tokens"""

    # Implementation of casual_tokenize at www.nltk.org/_modules/nltk/tokenize/casual.html
    tokens = casual_tokenize(text.lower())

    lemmatizer = WordNetLemmatizer()
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)
    return clean_tokens
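A tokenizer like this is typically plugged into a bag-of-words vectorizer; a minimal usage sketch, assuming the NLTK wordnet data is available (the scikit-learn wiring is an illustration, not part of the original snippet):

from sklearn.feature_extraction.text import CountVectorizer

# use the casual+lemmatizing tokenizer above instead of scikit-learn's default
vect = CountVectorizer(tokenizer=tokenize, lowercase=False)
X = vect.fit_transform(["Best day everrrrrrr at Monticello :*)",
                        "The cats were running around"])
print(sorted(vect.vocabulary_))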
Example 7
def prepare_text_data(text_array, word2idx, MAX_LENGTH):
    "Method used for all the necessary text prepossessing before it enters the model"
    print("Tokenizing...")
    X_train = [
        casual_tokenize(x,
                        preserve_case=False,
                        reduce_len=True,
                        strip_handles=False) for x in text_array
    ]
    print("Vectorizing...")
    X_train = numpy.array(
        [vectorize(x, word2idx, MAX_LENGTH) for x in X_train])
    return X_train
Example 8
    def parse_sentence(s):
        '''
        Returns the tagged and tokenized sentence in the form of a (token, tag) list.

        If a (token, tag) list is given, it returns itself. This allows for redundant calls to make sure the sentence is tokenized.
        '''
        if isinstance(s, list):
            return s

        global IDENTITY
        if IDENTITY is None:
            import positivity
            IDENTITY = positivity.Sentience.getIdentity()

        s = s.replace('@' + IDENTITY, IDENTITY)
        tokens = list(map(lambda x: 'I' if x == 'i' else x, casual_tokenize(s,reduce_len=True)))
        tagged_tokens = list(map(lambda x: (x[0], 'NN') if Understanding.matches_target(x[0]) else x, pos_tag(tokens)))
        return tagged_tokens
Example 9
def check(cell):
    '''Spell check one markdown cell'''
    lines = cell['source']
    for i, line in enumerate(lines):
        # using casual tokenize to get rid of urls and emojis
        words = casual_tokenize(line)
        words = [word for word in words if not word.startswith('http')]
        newline = ' '.join(words)
        # this handles e.g. contractions: you're, etc
        words = tokenizer.tokenize(newline)
        misspelled = spell.unknown(words)
        if len(misspelled):
            print_highlight(line, misspelled)
            for word in misspelled:
                new_word = handle_word(word)
                # learnt_words.append(new_word)
                if new_word:
                    line = update_line(line, word, new_word)
            cell['source'][i] = line
Example 10
def preprocess(text):
    if not text or type(text) != str:
        return ''

    text = text.lower()
    text = re.sub(r"https?://[^\s]+", '', text) # hyperlinks
    text = re.sub(r"\@\w+", '', text) # mentions
    text = re.sub(r"#", '', text) # hashtags
    text = re.sub(r"\d+\w*", '', text) # numbers
    text = re.sub(r"'s", '', text) # possesive
    text = re.sub(r"n't", ' not', text) # contractions
    text = re.sub(r"'m", ' am', text)
    text = re.sub(r"'s", ' is', text)
    text = re.sub(r"'re", ' are', text)
    
    words = [word for word in casual_tokenize(text) if word not in stops]
    words = [
        lemmatizer.lemmatize(word, tag_for_lemmatizer(tag))
        for word, tag in pos_tag(words)
    ]
    text = ' '.join(words)
    return text
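preprocess depends on module-level stops, lemmatizer and a tag_for_lemmatizer helper that are not shown; a minimal setup sketch, assuming the relevant NLTK data packages are downloaded and that tag_for_lemmatizer (hypothetical here) maps Penn Treebank tags to WordNet POS constants:

import re
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.casual import casual_tokenize

stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def tag_for_lemmatizer(tag):
    # map Penn Treebank tags to the WordNet POS constants the lemmatizer expects
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN


print(preprocess("I'm not loving the 3 new #updates from @vendor :("))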
Example 11

    def ingest(self, document, weight=1.0, purge_list=PURGE_LIST):
        doc_lower = UnicodeDammit(document).unicode_markup.lower()
        for word in BLACKLIST:
            if word in doc_lower:
                return
        word_stream = [
            item for item in casual_tokenize(document, reduce_len=True)
            if "/" not in item and item not in purge_list
        ]
        if len(word_stream) < 3:
            return
        word_stream = [BEGIN_SYMBOL] * self.n_back + word_stream
        word_stream += [END_SYMBOL] * self.n_back
        for current_position, word in enumerate(word_stream):
            if current_position < self.n_back:
                # advance until we have enough words in memory to consider
                # (even if only begin symbols)
                continue
            prior_ngram = word_stream[(current_position -
                                       self.n_back):current_position]
            result_word = word_stream[current_position]
            self.chain = record_chain_link(prior_ngram, result_word,
                                           self.chain, weight)
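record_chain_link and the module-level constants (PURGE_LIST, BLACKLIST, BEGIN_SYMBOL, END_SYMBOL) are not shown; a minimal sketch of the chain-updating helper, assuming the chain is a dict mapping prior n-grams to weighted next-word counts:

def record_chain_link(prior_ngram, result_word, chain, weight):
    # accumulate the transition weight for prior_ngram -> result_word (assumed structure)
    key = tuple(prior_ngram)
    chain.setdefault(key, {})
    chain[key][result_word] = chain[key].get(result_word, 0.0) + weight
    return chain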
Example 12
from nltk.tokenize.casual import casual_tokenize
from nltk.util import ngrams
import re

message = "RT @TJMonticello Best day everrrrrrr at Monticello.... Awesommmmmmeeeeeeee day :*) "

cas_tok = casual_tokenize(message)

print(cas_tok)

cas_tok2 = casual_tokenize(message, reduce_len=True, strip_handles=True)
print(cas_tok2)

print('------------------')

sentence = "Albiona Hoti filloi punen si software engineer ne moshen 22 vjecare."

pattern = re.compile(r"([-\s.,;!?])+")

tokens = pattern.split(sentence)
tokens = [x for x in tokens if x and x not in '- \t\n.,;!?']
print(tokens)

print('---------------')

two_grams = list(ngrams(tokens, 2))

print(two_grams)

print('---------------')
Example 13
eric.babble_and_evaluate_one()

print(results["acc"])

eric.babble_and_evaluate_one()
vec, features = eric.fit_tfidf()

# eric.amnesia(3)
from nltk.tokenize.casual import casual_tokenize

for k, v in eric.observational_memory.items():
    try:
        print("{}".format(v["text"]))
    except Exception:
        print("***")
        print(casual_tokenize(v["text"]))

print(u'\U0001f98b')

for k, v in eric.observational_memory.items():
    eric.observational_memory[k]["stemmed_text"] = stem_and_tokenize(
        v["text"])[3]
eric.pickle_me()

for item in vec.get_feature_names():
    print(item)

from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
reduced = svd.fit_transform(features)
Example 14
def annotate_string(string):
    return liwc_inst.annotate_doc(casual_tokenize(string, reduce_len=True))
Example 15
def summarize_string(string):
    return liwc_inst.summarize_doc(casual_tokenize(string, reduce_len=True))
Example 16
def get_string_tokens(text: str) -> list:
    """Returns word tokens of input string."""
    return casual_tokenize(text)
Example 17
from nltk.tokenize.casual import casual_tokenize
message = "RT @TJMonticello Best day everrrrrrr at Monticello." \
          "Awesommmmmmeeeeeeee day :*)"
print(casual_tokenize(message))
print(casual_tokenize(message, reduce_len=True, strip_handles=True))
Example 18
    def generate_response(s, autoanswer_level=2):
        '''
        Generates a response for the given message with the autoanswer_level (default 1).

        autoanswer_level:
        - 0: Do not answer except for debug calls
        - 1: Only respond rarely
        - 2: Respond sometimes if confident
        - 3: Respond whenever possible
        - 4: Always respond
        '''

        debug_out = Responder.process_debug_output(s)
        if debug_out:
            return debug_out

        #
        # SENTENCE PARSING
        #

        words = casual_tokenize(s.lower(), reduce_len=True)
        parsed_result = Understanding.parse_queries(s, merge_results=True)

        subject_call = parsed_result["subject_call"]
        queries      = parsed_result["queries"]
        statements   = parsed_result["statements"]
        tofu_tagged           = Understanding.is_target_tagged(s)
        tofu_targeted         = parsed_result["target_summoned"] or autoanswer_level >= 3
        someone_else_targeted = not tofu_targeted and subject_call

        query_types = set(map(lambda x: x[1], queries))
        too_complicated = len(query_types) > 1 or len(queries) > 4

        Sentience.exposeToMessage(s)
        mood = Sentience.getPrimaryMood()

        if autoanswer_level == 0:
            return None

        #
        # Greetings
        #
        if queries == [] and statements == [] and tofu_targeted:
            #greeting likely
            now = datetime.datetime.now()

            if mood > 0.3:
                if (6 <= now.hour <= 11) and 'morning' in words:
                    return random.choice(['good morning', 'morning', 'おはよう']) + ('!' if mood > 0.75 else ('.' if mood < 0.5 else ''))
                if (19 <= now.hour <= 23 or now.hour <= 2) and ('night' in words or 'gn' in words):
                    return random.choice(['good night', 'gn', 'おやすみ']) + ('.' if mood < 0.5 else '')
                if 'hello' in words or 'hi' in words:
                    if mood > 0.7 or autoanswer_level >= 4:
                        return random.choice(['hello!', 'hi!', 'こんにちは!'])

            if mood <= 0.3:
                return random.choice(['bleh', 'o', 'meh', 'hmph'])

        #
        # Query answering
        #

        if too_complicated:
            if tofu_targeted:
                return random.choice([
                    "i'm confused",
                    "interesting question",
                    "uh.. i am confused",
                    "i don't understand what you mean",
                    "this sentence is too complicated for me to understand",
                    "this question is too confusing for me"
                    "hmm",
                ])
            return None

        if 'STD_QN' in query_types and tofu_targeted:
            return random.choice([
                "sorry, this question is not within my capabilities to answer",
                "i can't answer that yet oops",
                "sorry, the question is too open-ended for me",
                "i don't know how to answer that, am weak in FRQs sry",
                "i'm not smart enough to know how to answer that",
                "that sounds like an interesting question",
                "hmm",
            ])

        if 'YN_QN' in query_types and ((autoanswer_level >= 2 and not someone_else_targeted) or tofu_targeted):
            filtered_queries = list(filter(lambda x: x[1] == 'YN_QN', queries))
            if len(filtered_queries) == 1:
                yes_opt = random.choice([
                    "perhaps",
                    "i believe yes",
                    "yeah",
                    "yes",
                    "my deductions indicate yes",
                    "maybe",
                    "i think so",
                    "very likely",
                    "most definitely",
                    "yes indeed",
                    "i'd say yes"
                ])
                no_opt = random.choice([
                    "maybe not",
                    "my sources say no",
                    "no",
                    "nah",
                    "i don't think so",
                    "doubt it",
                    "probably not",
                    "most definitely not",
                    "i think no",
                    "not at all"
                ])
                rnd_opt = random.choice([
                    "i'm not sure about that",
                    "bleh",
                    "interesting question",
                    "i don't wanna tell you right now",
                    "i don't have a clue",
                    "hmmm",
                    "my sources cannot be trusted"
                ])

                chosen = Sentience.decideResponseAgree(filtered_queries[0][0])
                if chosen is None:
                    return rnd_opt
                return yes_opt if chosen else no_opt

            if len(filtered_queries) == 2:
                opt_1 = random.choice([
                    "first option",
                    "go with the first",
                    "the former"
                ])
                opt_2 = random.choice([
                    "second option",
                    "on second thought, your second option",
                    "the latter"
                ])
                opt_nil = random.choice([
                    "why not both",
                    "i can't find the answer to that",
                    "i think neither",
                    "can't decide, so i'll say yes"
                ])
                subj, pred1 = Understanding.parse_sentence_subject_predicate(filtered_queries[0][0])
                _   , pred2 = Understanding.parse_sentence_subject_predicate(filtered_queries[1][0])
                chosen = Sentience.decideResponseOptionsIndex(subj, [pred1, pred2])
                if chosen == 0:
                    return opt_1
                if chosen == 1:
                    return opt_2
                return opt_nil

            if len(filtered_queries) > 2 and tofu_targeted:
                subject = None
                options = []
                for query, _ in filtered_queries:
                    res = Understanding.parse_sentence_subject_predicate(query)
                    if subject is None:
                        subject = res[0]
                    options.append(res[1])
                chosen = Sentience.decideResponseOptionsIndex(subject, options)
                if chosen is None:
                    return random.choice([
                        "i can't decide",
                        "am a little confused here",
                        "not sure which one"
                    ])
                return random.choice([
                    "option %d it is",
                    "i'll pick option %d",
                    "i think option %d",
                    "option %d"
                ]) % (chosen+1)

        #
        # Misc responses
        #
        if mood > 0.5 and Sentience.getExposedPositivity() >= 0 and autoanswer_level >= 1:
            if not tofu_targeted and (IDENTITY.lower() in words or IDENTITY.lower() == s.lower()) and random.random() <= 0.1:
                return random.choice(['hmm i heard my name', 'hmmmm', 'interesting', 'hm'])
            if len(words) <= 5:
                combos = _get_message_combos()

                words_copy = words.copy()
                random.shuffle(words_copy)

                for word in words_copy:
                    for w in [word, Understanding.remove_repeated_chars_word(word)]:
                        if w in combos:
                            w_response, w_response_chance = combos[w]

                            #increase chances at higher autoanswer levels
                            if autoanswer_level >= 3:
                                w_response_chance **= 0.25

                            #lower chances at lower autoanswer levels
                            if autoanswer_level <= 1:
                                w_response_chance **= 3

                            if (tofu_tagged or random.random() <= w_response_chance):
                                return random.choice(w_response)
                            break

        roll = random.random()
        if Sentience.isExposedPositivityOverloaded():
            roll **= 2
        if autoanswer_level >= 4 or (autoanswer_level >= 2 and roll > 0.95) or (tofu_targeted and roll > 0.75):
            if mood >= 0.3:
                x = Sentience.determineMessagePositivity(s)
                if x >= 0.6:
                    return random.choice([
                        'ay',
                        'nice',
                        ':D',
                        'yay',
                        'heh',
                        'haha',
                        'lol',
                    ])

                if x < 0:
                    return random.choice([
                        'oof',
                        'ono',
                        'uh',
                        'oops',
                        'sad',
                        ':(',
                        '.-.',
                    ])

                return random.choice([
                    'hmm',
                    'ah',
                    'hm',
                    'oof',
                    'interesting',
                ])


            return random.choice(['o', 'meh', 'm', '.'])

        return None
Example 19
from nltk.tokenize import TreebankWordTokenizer

# sample sentence (assumed; not defined in the original snippet)
sentence = """Monticello wasn't designated as UNESCO World Heritage Site until 1987."""

tokenizer = TreebankWordTokenizer()

tokenized_sen = tokenizer.tokenize(sentence)

# print("TreebankWordTokenizer")
# print(tokenized_sen)

# Tokenize informal text from social networks such as Twitter and Facebook
from nltk.tokenize.casual import casual_tokenize

message = """RT @TJMonticello Best day everrrrrrr at Monticello. Awesommmmmmeeeeeeee day :*)"""

tokenized_sen = casual_tokenize(message)

# print("casual_tokenize")
# print(tokenized_sen)

tokenized_sen = casual_tokenize(message, reduce_len=True, strip_handles=True)

# print("casual_tokenize reduce_len")
# print(tokenized_sen)

#  Stop Words
import nltk

nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
Example 20
import numpy as np
import pandas as pd
import seaborn as sns
import re
import nltk
from nltk.tokenize.casual import casual_tokenize
from nltk.util import ngrams
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

sen = "I know he gets knocked on a lot.But I like being un-sober and watching robots beat each other up. :)"
tokens = casual_tokenize(sen, reduce_len=True, strip_handles=True)
print(list(ngrams(tokens, 2)))  # ngrams() returns a generator; list() materializes it
analyzer = SentimentIntensityAnalyzer()
#print(analyzer.lexicon)
print(analyzer.polarity_scores(text=sen))
Example 21
def tokenize_text(text):
    tokens = casual_tokenize(text)
    tokens = [word for word in tokens if word.isalpha() or word.isdigit()]
    return [word for word in tokens if word not in stopwords.words('english')]
    def transform(self, X):  # enclosing class not shown; method name assumed
        res = []
        for doc in X:
            res.append(self._doc_transform(doc))
        return res


def avg_glove(df):
    vectors = []
    for t in tqdm(df.content.values):
        vectors.append(np.average(GloveStruct.glove.query(word_tokenize(t)), axis=0))
    return np.array(vectors)


def tfidf_glove(df, idf_dict):
    vectors = []
    for title in tqdm(df.content.values):
        glove_vectors = GloveStruct.glove.query(word_tokenize(title))
        weights = [idf_dict.get(word, 1) for word in word_tokenize(title)]
        vectors.append(np.average(glove_vectors, axis=0, weights=weights))
    return np.array(vectors)
"""

if __name__ == '__main__':
    __nltk_corpus_data_downloader()
    snowball = SnowballTokenizer()
    casual_tokenize('to be fair.aaa aaa aaa')
    print(snowball('to be fair. $3.11 beeb. aaa aaa aaa 80% 3453'))
    lemma = LemmaTokenizer()
    lemma('test')
    stem = StemTokenizer()
    stem('test')
Example 23
def casualTokenize(raw_sentence, preserve_case=False):
    return casual_tokenize(raw_sentence,
                           preserve_case=preserve_case,
                           reduce_len=True)
Example 24
from nltk.tokenize import RegexpTokenizer
sentence = """Monticello wasn't designated as UNESCO World Heritage Site until 1987."""
# note the escaped \$ so dollar amounts are captured as single tokens
tokenizer = RegexpTokenizer(r"\w+|\$[0-9.]+|\S+")
print(tokenizer.tokenize(sentence))
print(
    "----------------------------------------------------------------------------------"
)
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(sentence))
print(
    "------------------------------------------------------------------------------------"
)
from nltk.tokenize.casual import casual_tokenize
message = """RT @TJMonticello Best day everrrrrrrr at Monticello. Awesommmmmmmmmeeeeeeeee day :*)"""
print(casual_tokenize(message))
print(casual_tokenize(message, reduce_len=True, strip_handles=True))
print(
    "-----------------------------------------------------------------------------------------"
)
sentence = "Thomas Jefferson began bulding Monticello at the age of 26."
from nltk.util import ngrams
print(list(ngrams(tokenizer.tokenize(sentence), 2)))
print(list(ngrams(tokenizer.tokenize(sentence), 3)))
two_grams = list(ngrams(tokenizer.tokenize(sentence), 2))
print([" ".join(x) for x in two_grams])
stop_words = ["a", "an", "the", "on", "of", "off", "this", "is"]
tokens = ["the", "house", "is", "on", "fire"]
tokens_without_stopwords = [x for x in tokens if x not in stop_words]
print(tokens_without_stopwords)
Example 25
from three_step_decoding import *
from nltk.tokenize.casual import casual_tokenize

tsd = ThreeStepDecoding('lid_models/hinglish',
                        htrans='nmt_models/rom2hin.pt',
                        etrans='nmt_models/eng2eng.pt')

dataset = []
dataset_t = []

with open('/home/devanshg27/cm_parallel_data/en-hi-codemixed-corpus/s-enhi.txt'
          ) as f:
    for line in f:
        line = line.rstrip()
        line = casual_tokenize(line,
                               preserve_case=True,
                               reduce_len=True,
                               strip_handles=False)
        dataset.append(line)

with open('/home/devanshg27/cm_parallel_data/en-hi-codemixed-corpus/t-en.txt'
          ) as f:
    for line in f:
        line = line.rstrip()
        line = casual_tokenize(line,
                               preserve_case=True,
                               reduce_len=True,
                               strip_handles=False)
        dataset_t.append(line)

valid_idx = [
    i for i in range(len(dataset)) if dataset[i] != [] and dataset_t[i] != []
]
Example 26
    def __init__(self,
                 csv_path,
                 tokenizer_name,
                 use_stopwords=True,
                 use_preprocessor=False,
                 min_df=10,
                 max_df=0.75,
                 max_ngram=3):

        # Where data is stored
        self.csv_path = csv_path

        #Read data directly
        self.dataframe = pd.read_csv(self.csv_path)

        # Choose tokenizer
        if tokenizer_name == 'casual_std':
            func = lambda x: casual_tokenize(
                x, preserve_case=True, reduce_len=False, strip_handles=False)
            self.tokenizer = func
        elif tokenizer_name == 'casual_reduce':
            func = lambda x: casual_tokenize(
                x, preserve_case=False, reduce_len=True, strip_handles=True)
            self.tokenizer = func
        elif tokenizer_name == 'words':
            self.tokenizer = tokenize_words
        elif tokenizer_name == 'orig':
            self.tokenizer = tokenize
        else:
            raise NotImplementedError('Unknown tokenizer')

        # Stopwords
        if use_stopwords:
            # list.extend() returns None, so concatenate instead of assigning its result
            self.stopwords = nltk.corpus.stopwords.words("english") + [
                "#ff", "ff", "rt"
            ]
        else:
            self.stopwords = None

        # Preprocessor
        if use_preprocessor:
            self.preprocessor = preprocess
        else:
            self.preprocessor = None

        # Some hyperparameters
        self.min_df = min_df
        self.max_df = max_df
        self.max_ngram = max_ngram

        # Vectorizer
        self.vectorizer = TfidfVectorizer(
            tokenizer=self.tokenizer,  #casual_tokenize_specified,
            preprocessor=self.preprocessor,
            ngram_range=(1, self.max_ngram),
            stop_words=self.stopwords,
            use_idf=True,
            smooth_idf=False,
            norm=None,
            decode_error='replace',
            max_features=10000,
            min_df=self.min_df,
            max_df=self.max_df)
        # PosVectorizer
        self.pos_vectorizer = TfidfVectorizer(
            tokenizer=None,
            lowercase=False,
            preprocessor=None,
            ngram_range=(1, self.max_ngram),
            stop_words=None,
            use_idf=False,
            smooth_idf=False,
            norm=None,
            decode_error='replace',
            max_features=5000,
            min_df=5,
            max_df=0.75,
        )

        #Construct tfidf matrix and get relevant scores
        self.tfidf = self.vectorizer.fit_transform(
            self.dataframe['tweet']).toarray()
        self.vocab = {
            v: i
            for i, v in enumerate(self.vectorizer.get_feature_names())
        }
        self.idf_vals = self.vectorizer.idf_
        self.idf_dict = {i: self.idf_vals[i] for i in self.vocab.values()}
        print(f'A vocab was created. It consists of {len(self.vocab)} entries')

        # POS-tagging
        self.tweet_tags = [
            pos_tag_tweet(tweet, self.tokenizer, print_tweet=False)
            for tweet in self.dataframe['tweet']
        ]
        self.pos = self.pos_vectorizer.fit_transform(pd.Series(
            self.tweet_tags)).toarray()
        self.pos_vocab = {
            v: i
            for i, v in enumerate(self.pos_vectorizer.get_feature_names())
        }

        # Other features: this is untouched
        self.feats = get_feature_array(self.dataframe['tweet'])

        #Now join them all up
        self.features = np.concatenate([self.tfidf, self.pos, self.feats],
                                       axis=1)
        self.feature_names = [k for k, _ in self.vocab.items()] + [
            k for k, _ in self.pos_vocab.items()
        ] + [
            "FKRA", "FRE", "num_syllables", "avg_syl_per_word", "num_chars",
            "num_chars_total", "num_terms", "num_words", "num_unique_words",
            "vader neg", "vader pos", "vader neu", "vader compound",
            "num_hashtags", "num_mentions", "num_urls", "is_retweet"
        ]

        self.labels = self.dataframe['class']

        print(
            f'\n Data has been processed and is now available. Feature dim: {self.features.shape}'
        )
Example 27
from nltk.tokenize.casual import casual_tokenize

message = """RT TJMonticello Best day everrrrrrr at Monticello. Awesommmmmmeeeeeeee day"""

tokens = casual_tokenize(message)
print(tokens)

tokens = casual_tokenize(message, preserve_case=False, reduce_len=True, strip_handles=True)
print(tokens)
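For reference, the two calls differ roughly as follows (sketched from NLTK's documented casual_tokenize behaviour, not captured from a run):

# first call: tokens kept as typed, e.g.
#   ['RT', 'TJMonticello', 'Best', 'day', 'everrrrrrr', 'at', 'Monticello', '.', 'Awesommmmmmeeeeeeee', 'day']
# second call: lowercased, with character runs longer than three reduced to three, e.g.
#   ['rt', 'tjmonticello', 'best', 'day', 'everrr', 'at', 'monticello', '.', 'awesommmeee', 'day']
# strip_handles has no effect here because the message contains no @mentions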