import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from tokenizer import tokenizer


def relevant_days(dates, query, n_days):
    # Rank each date by how often the query tokens occur in that day's tweets
    # and return the n_days most relevant dates. `df` and `tokenizer_bow` are
    # assumed to be defined at module level.
    tknzr = tokenizer.TweetTokenizer(preserve_handles=False, preserve_hashes=False,
                                     preserve_case=False, preserve_url=False)
    args = {"tknzr": tknzr, "lemmatize": True}
    query_tokens = tokenizer_bow(query, tknzr, lemmatize=True)
    chosen_scores = []
    for date in dates:
        date_tweets = df.loc[df['created_at'] == date].get("text").values
        score = 0
        if len(date_tweets) > 0:
            vectorizer = CountVectorizer(stop_words="english", binary=False,
                                         tokenizer=lambda text: tokenizer_bow(text, **args))
            tf = vectorizer.fit_transform(date_tweets)
            # Token frequencies summed over all tweets of the day
            freqs = list(zip(vectorizer.get_feature_names(), np.ravel(tf.sum(axis=0))))
            for t in query_tokens:
                try:
                    token, freq = next((token, freq) for (token, freq) in freqs if token == t)
                    score += freq
                except StopIteration:
                    # Query token does not appear on this date
                    pass
        # Append a score for every date so indices stay aligned with `dates`
        chosen_scores.append(score)
    order = np.flip(np.argsort(chosen_scores))
    return dates[order][:n_days]
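# tokenizer_bow is referenced above but defined elsewhere in the project. A
# minimal sketch of what it might look like, assuming it simply runs the
# TweetTokenizer and optionally lemmatizes each token with NLTK; the real
# helper may do more (POS-aware lemmatization, extra filtering).
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()


def tokenizer_bow(text, tknzr, lemmatize=False):
    tokens = tknzr.tokenize(text)
    if lemmatize:
        tokens = [_lemmatizer.lemmatize(t) for t in tokens]
    return tokens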
def __init__(self):
    self.T = tokenizer.TweetTokenizer(preserve_handles=False, preserve_url=False,
                                      preserve_len=False, preserve_hashes=False,
                                      preserve_emoji=False, preserve_case=True,
                                      regularize=True)
import collections

from tokenizer import tokenizer


def train(input_text):
    # Build forward and reverse Markov models from the tokenized input.
    # add_to_model and add_to_reverse_model are project helpers defined elsewhere.
    T = tokenizer.TweetTokenizer()
    mc = collections.defaultdict(dict)
    mr = collections.defaultdict(dict)
    for i in input_text:
        tokens = T.tokenize(i)
        tokens.insert(0, 'START')
        tokens.append('END')
        mc = add_to_model(tokens, mc)
        mr = add_to_reverse_model(tokens, mr)
    return mc, mr
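# add_to_model and add_to_reverse_model are not shown in this snippet. A minimal
# sketch, assuming the model simply counts bigrams in nested dicts
# (model[current][next] -> count); the reverse model counts the same bigrams
# with the token sequence walked right to left.
def add_to_model(tokens, model):
    for prev, curr in zip(tokens, tokens[1:]):
        model[prev][curr] = model[prev].get(curr, 0) + 1
    return model


def add_to_reverse_model(tokens, model):
    return add_to_model(list(reversed(tokens)), model)


# Usage: mc, mr = train(["good morning world", "good night twitter"])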
def features_bow(X_BOW, _dataTexts, _lemmatize=False, _mdf=3, _metric="cosine", _k=10,
                 _handles=False, _hashes=False, _case=False, _url=False):
    # Build the bag-of-words features only if they were not passed in already.
    X_BOW_VEC = None  # keep the second return value defined when X_BOW is reused
    if X_BOW is None:
        tknzr = tokenizer.TweetTokenizer(preserve_handles=_handles, preserve_hashes=_hashes,
                                         preserve_case=_case, preserve_url=_url)
        X_BOW_VEC, X_BOW = init_bow(_dataTexts,
                                    {"tknzr": tknzr, "lemmatize": _lemmatize},
                                    _mdf)
    return X_BOW, X_BOW_VEC
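# init_bow is defined elsewhere in the project. A plausible sketch, assuming it
# fits a scikit-learn CountVectorizer over the raw texts with the given
# tokenizer arguments and minimum document frequency, returning the fitted
# vectorizer together with the document-term matrix.
from sklearn.feature_extraction.text import CountVectorizer


def init_bow(texts, tokenizer_args, min_df):
    vectorizer = CountVectorizer(stop_words="english", min_df=min_df,
                                 tokenizer=lambda text: tokenizer_bow(text, **tokenizer_args))
    bow = vectorizer.fit_transform(texts)
    return vectorizer, bow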
import re
from string import punctuation

from tokenizer import tokenizer


def tokenize(tweet):
    # Remove e-mail addresses
    tweet = re.sub(r'\S*@\S*\s?', '', tweet)
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = tokenizer.TweetTokenizer(preserve_case=False, preserve_handles=False,
                                     preserve_hashes=False, regularize=True,
                                     preserve_emoji=True).tokenize(tweet)
    # Emoji processing (str2emoji is defined elsewhere)
    tweet = list(map(lambda x: str2emoji(x), tweet))
    tweet = ' '.join(tweet)
    # Expand contractions (contraction_removal is defined elsewhere)
    tweet = contraction_removal(tweet)
    # Remove punctuation and drop empty tokens
    tweet = re.sub('[' + punctuation + ']', '', tweet).split(' ')
    tweet = list(filter(lambda x: x != u'', tweet))
    return tweet
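# Example call, assuming str2emoji and contraction_removal are available. The
# exact tokens depend on those helpers, but the result is a list of lowercase
# tokens with the handle, URL, punctuation and empty strings stripped and the
# emoji replaced by its text alias.
sample = "Thanks @united!! Flight was great \U0001F60A http://t.co/abc"
print(tokenize(sample))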
import re
from string import punctuation

import nltk
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tokenizer import tokenizer

# Keep negations that would otherwise be discarded with the stopwords
notstopwords = set(('not', 'no', 'mustn', "mustn't"))
stopwords = set(stopwords.words('english')) - notstopwords
lemmatizer = WordNetLemmatizer()
T = tokenizer.TweetTokenizer(preserve_handles=False, preserve_hashes=False,
                             preserve_case=False, preserve_url=False, regularize=True)


def data_preprocessing(path_tweets):
    tweets = pd.read_csv(path_tweets, encoding='utf-8', sep=',')
    tweets['text'] = tweets['text'].apply(lambda x: standardization(x))
    # Map airline sentiment labels to integers: negative=0, neutral=1, positive=2
    tweets['sentiment'] = tweets['airline_sentiment'].apply(
        lambda x: 0 if x == 'negative' else (1 if x == 'neutral' else 2))
    return tweets['text'], tweets['sentiment']


def data_preprocessing(path_tweets, corpora):
    data = pd.read_csv(path_tweets, encoding='utf-8',
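# standardization is referenced above but not shown. A minimal sketch, assuming
# it tokenizes with the module-level TweetTokenizer, drops the reduced stopword
# set and punctuation, and lemmatizes what remains; the project's real helper
# may differ (e.g. POS-aware lemmatization using pos_tag).
def standardization(text):
    tokens = T.tokenize(text)
    tokens = [t for t in tokens if t not in stopwords and t not in punctuation]
    return ' '.join(lemmatizer.lemmatize(t) for t in tokens)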
import flask
import torch
from flask import Flask, render_template, request
from utils import label_full_decoder
import sys
import config
import dataset
import engine
from model import BERTBaseUncased
from tokenizer import tokenizer

T = tokenizer.TweetTokenizer(preserve_handles=True, preserve_hashes=True,
                             preserve_case=False, preserve_url=False)

app = Flask(__name__, static_url_path='', static_folder='app/static',
            template_folder='app/templates/public')

MODEL = None
DEVICE = config.device


def preprocess(text):
    tokens = T.tokenize(text)
    print(tokens, file=sys.stderr)
    ptokens = []
    for index, token in enumerate(tokens):
        if "@" in token:
            if index > 0:
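# The snippet above is cut off inside preprocess. Separately, a hypothetical
# illustration (not part of the original app) of how a preprocessing step like
# this is typically exposed through a Flask route; the endpoint name and
# response shape are assumptions.
@app.route("/tokens", methods=["GET"])
def tokens_view():
    text = request.args.get("text", "")
    return flask.jsonify(tokens=T.tokenize(text))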
#!/usr/bin/python
# -*- coding: utf-8 -*-
from nltk.corpus import stopwords
from tokenizer import tokenizer
import nltk
import os
import json
import re

word_freq = dict()
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
T = tokenizer.TweetTokenizer()


# Filter characters: strip URLs, lowercase, drop @-mentions and count word frequencies
def Filter(input):
    pattern1 = re.compile(r'http[a-zA-Z0-9.?/&=:]*')
    input = pattern1.sub("", input)
    # pattern2 = re.compile(r'[-,$()#+&*!?.":;/–:,。“”‘’=+]')
    # input = pattern2.sub(" ", input)
    r = ""
    words = input.strip().split()
    for word in words:
        word = word.lower()
        if '@' not in word:
            r += (word + ' ')
            if word in word_freq:
                word_freq[word] += 1
            else:
        # (inside create_dataset: store each datapoint and track label counts)
        dataset.append(datapoint)
        label_counter[indexer.get_object(label)] += 1
        count += 1
        if count % 500000 == 0:
            print("created", count, "datapoints")
    return dataset


indexer = Indexer()
label_counter = Counter()
dataset = create_dataset(tweets, indexer, label_counter)
print("length of dataset: ", len(dataset))

from tokenizer import tokenizer as vinay

v = vinay.TweetTokenizer(regularize=True, preserve_len=False)
word_cnts = Counter()


def count_words(text):
    words = v.tokenize(text)
    for word in words:
        word_cnts[word] += 1


for dp in dataset:
    count_words(dp.text)

new_dataset = []
count_of_bad = 0
for i in range(0, len(dataset)):
    data = dataset[i]