def __init__(
    self,
    token_mapping: Mapping[str, int],
    preserve_case: bool = False,
):
    self._token_mapping = token_mapping
    self._tokenizer = TweetTokenizer(preserve_case=preserve_case)
def preprocess_tweet(tweet):
    """
    Preprocess the input tweet.

    Steps:
    1. Lowercase the letters
    2. Reduce character runs longer than 3 to 3 within a word
    3. Replace a URL with the tag <URLURL>
    4. Replace a user mention with the tag <UsernameMention>

    @TODO:
    1. Look for better preprocessing methods on the web
    2. Apply them here
    """
    clean_tweet = tp.clean(tweet)

    # tokenize, then lemmatize
    tokenizer = TweetTokenizer()
    tweet_tokens = tokenizer.tokenize(clean_tweet)
    lemmatized_tweet = lemmatize_tweet(tweet_tokens)

    # remove stopwords
    preprocessed_tweet = remove_stopwords(lemmatized_tweet)
    return preprocessed_tweet
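# The docstring above names URL/mention tagging and character-run reduction, but the
# body delegates cleaning to tp.clean. A minimal sketch of those steps with plain `re`,
# assuming the tag strings from the docstring (illustrative, not the author's code):
import re

def basic_clean(tweet):
    tweet = tweet.lower()                                # 1. lowercase
    tweet = re.sub(r'(.)\1{3,}', r'\1\1\1', tweet)       # 2. cap character runs at 3
    tweet = re.sub(r'https?://\S+', '<URLURL>', tweet)   # 3. replace URLs with a tag
    tweet = re.sub(r'@\w+', '<UsernameMention>', tweet)  # 4. replace mentions with a tag
    return tweet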
def nltk_tweet_tokenizer(s, **tokenizer_kwargs):
    """NLTK TweetTokenizer"""
    kwargs = dict(strip_handles=False, reduce_len=True)
    kwargs.update(**tokenizer_kwargs)
    tokenizer = TweetTokenizer(**kwargs)
    token_list = tokenizer.tokenize(s)
    return token_list
def tokenize_comments(base_dir, comments_file, hashh=None):
    tkd_data = None
    if hashh:
        tkd_data = load_cached_data(hashh)
    if tkd_data is None:
        hash_f = get_cache_path(hashh)
        with open(hash_f, 'wb') as pkl_f:
            tkd_data = defaultdict(dict)
            tk = TweetTokenizer(preserve_case=True, reduce_len=False, strip_handles=False)
            for i, (root, dirs, files) in enumerate(os.walk(base_dir)):
                if comments_file in files:
                    project = root.split('/')[-1]
                    print('Processing %s, number %d' % (project, i))
                    posts = []
                    with open(os.path.join(root, comments_file), 'r') as inf:
                        r = csv.DictReader(inf)
                        for row in r:
                            p = post(' '.join(list(tk.tokenize(row['body']))),
                                     row['login'], row['mention_login'],
                                     row['issue_num'], row['datetime'], project)
                            posts.append(p)
                    tkd_data[project] = posts
            pickle.dump(tkd_data, pkl_f)
    return tkd_data
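# `post`, `load_cached_data`, and `get_cache_path` are defined elsewhere. A minimal
# sketch of `post` as a record type, assuming field names that mirror the positional
# arguments used above (purely illustrative):
from collections import namedtuple

post = namedtuple('post', ['body', 'login', 'mention_login', 'issue_num', 'datetime', 'project'])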
def __init__(self, source_vocabulary, target_vocabulary, max_source_length, max_target_length):
    self.source_vocabulary = source_vocabulary
    self.target_vocabulary = target_vocabulary
    self.max_source_length = max_source_length
    self.max_target_length = max_target_length
    self.tokenizer = TweetTokenizer()
def data_processing(df):
    t = TweetTokenizer()
    emotions = [
        'anger', 'anticipation', 'disgust', 'fear', 'joy',
        'negative', 'positive', 'sadness', 'surprise', 'trust'
    ]
    df['translated_full_text'] = df['translated_full_text'].astype(str).apply(remove_links)
    df['cleaned_text'] = df['translated_full_text'].astype(str).apply(style_text)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(
        lambda x: remove_words(x.split(), stopcorpus))
    df['cleaned_text'] = df['cleaned_text'].apply(collapse_list_to_string)
    df['cleaned_text'] = df['cleaned_text'].astype(str).apply(remove_apostrophes)
    df['tokenized_sents'] = df.apply(lambda row: t.tokenize(row['cleaned_text']), axis=1)
    df['word_count'] = df.apply(lambda row: len(row['tokenized_sents']), axis=1)
    df = df[df.word_count > 0]
    df = text_emotion(df)
    for emotion in emotions:
        df[emotion] = df[emotion] / df['word_count']
    date = datetime.datetime.strptime(df['created_at'].min(), '%Y-%m-%d %H:%M:%S').date()
    df.to_pickle(str(date) + ".pickle")
    df.to_excel(str(date) + ".xlsx")
    return
def parse_XML(file):
    text = ''
    continuation = False
    for line in file:
        # Check whether this line contains a complete tweet
        if not continuation:
            groups = re.search(r'\[CDATA\[(.*)\]\]>', line)
            if groups is not None:
                # Extract the tweet and lowercase it
                tweet = groups.group(1)
                tweet = tweet.lower()
                # Tokenize the tweet and strip non-ASCII characters
                tokens = TweetTokenizer().tokenize(tweet)
                tokenized = ' '.join(s.encode('ascii', 'ignore').decode('ascii') for s in tokens)
                text = text + '\n' + tokenized
            else:
                groups = re.search(r'\[CDATA\[(.*)', line)
                if groups is not None:
                    temp = groups.group(1)
                    continuation = True
        else:
            groups = re.search(r'(.*)\]\]>', line)
            if groups is not None:
                # group(1) excludes the ']]>' terminator, matching the branch above
                tweet = temp + ' ' + groups.group(1)
                tweet = tweet.lower()
                tokens = TweetTokenizer().tokenize(tweet)
                tokenized = ' '.join(s.encode('ascii', 'ignore').decode('ascii') for s in tokens)
                text = text + '\n' + tokenized
                continuation = False
            else:
                temp = temp + ' ' + line.rstrip()
    return text[1:]
def tokenize(self, text):
    # Make a list where each word is an element.
    # Lemmatize each word. Exception: we want "better" to become its lemma "good",
    # but "best" should stay "best". There are nltk methods for this; see
    # https://www.youtube.com/watch?v=uoHVztKY6S4
    # Remove the articles 'a', 'an', 'the'.
    # Also split on punctuation marks so that "I like, fish" becomes
    # ['I', 'like', ',', 'fish'] = token_list
    text_list = text.split(' ')  # unused; TweetTokenizer below does the actual splitting
    tweettokenizer = TweetTokenizer()
    lemmatizer = WordNetLemmatizer()
    token_list = tweettokenizer.tokenize(text)
    # Remove the articles; guard each removal separately so a missing article
    # does not abort removal of the others.
    for article in ('a', 'an', 'the'):
        try:
            token_list.remove(article)
        except ValueError:
            pass
    pos_list = pos_tag(token_list)
    pos_listwordnet = [(word[0], self.get_wordnet_pos(word[1])) for word in pos_list]
    for i in range(len(token_list)):
        token_list[i] = lemmatizer.lemmatize(token_list[i], pos=pos_listwordnet[i][1])
    if len(token_list) == 1:
        token_list.append('.')
    return token_list
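# `self.get_wordnet_pos` is referenced above but not shown. A common way to map Penn
# Treebank tags from pos_tag() to WordNet POS constants for the lemmatizer (an
# assumption, not necessarily the author's helper):
from nltk.corpus import wordnet

def get_wordnet_pos(self, treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN  # default, also covers nouns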
def parse_data_iterator(vocab, filename, delimiter=",", steps=10):
    vocab.add_word('</s>')
    file = open(filename, 'r')
    reader = csv.reader(file, delimiter=delimiter)
    headers = next(reader)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=False)
    for row in reader:
        curr = []
        encoded = []
        label = [row[1]]
        # csv values are strings, so compare against the string '0'
        if row[1] == '0':
            label.append(1)
        else:
            label.append(0)
        words = tokenizer.tokenize(" ".join(row[3:]))
        for i in range(steps):
            if i < len(words):
                words[i] = canon_word(str(words[i]))
                vocab.add_word(words[i])
                curr.append(words[i])
            else:
                # pad short sequences with the sentence-end token
                curr.append('</s>')
        for word in curr:
            encoded.append(vocab.encode(word))
        yield label, curr
def tokenize(sents: list):
    tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True)
    sents_tok = []
    for sent in sents:
        tokens = [token for token in tokenizer.tokenize(sent)
                  if not token.startswith('http')]
        sents_tok.append(' '.join(tokens))
    return sents_tok
def __init__(self):
    with open('model/tokenizer.pickle', 'rb') as handle:
        self.tokenizer = pickle.load(handle)
    with open('model/label_encoder', 'rb') as handle:
        self.y_enc = pickle.load(handle)
    self.tweeter = TweetTokenizer()
    self.lemma = WordNetLemmatizer()
    self.vocab_size = len(self.tokenizer.word_index) + 1
    self.model = tf.keras.Sequential([
        tf.keras.layers.Embedding(self.vocab_size, 50, mask_zero=True),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1024)),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(len(self.y_enc.classes_), activation='softmax')
    ])
    self.model.load_weights('model/chatbot')
    self.responses = self._load_responses()
def tweet_clean(self, tweet):
    # Remove tickers
    sent_no_tickers = re.sub(r'\$\w*', '', tweet)
    tw_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    temp_tw_list = tw_tokenizer.tokenize(sent_no_tickers)
    # Remove stopwords
    list_no_stopwords = [i for i in temp_tw_list
                         if i.lower() not in self._cache_english_stopwords]
    # Remove hyperlinks
    list_no_hyperlinks = [re.sub(r'https?:\/\/.*\/\w*', '', i) for i in list_no_stopwords]
    # Remove hashtags
    list_no_hashtags = [re.sub(r'#', '', i) for i in list_no_hyperlinks]
    # Remove punctuation and split 's, 't, 've with a space for the filter
    list_no_punctuation = [re.sub(r'[' + string.punctuation + ']+', ' ', i)
                           for i in list_no_hashtags]
    # Remove multiple whitespace
    new_sent = ' '.join(list_no_punctuation)
    # Remove any words with 2 or fewer letters
    filtered_list = tw_tokenizer.tokenize(new_sent)
    list_filtered = [re.sub(r'^\w\w?$', '', i) for i in filtered_list]
    filtered_sent = ' '.join(list_filtered)
    cleaned_tweet = re.sub(r'\s\s+', ' ', filtered_sent)
    # Remove any whitespace at the front of the sentence
    cleaned_tweet = cleaned_tweet.lstrip(' ')
    return cleaned_tweet
def get_train_test_data(find_and_concatenate_expressions=False):
    def remove_url(tokens):
        tokens = filter(lambda x: "http" not in x, tokens)
        return list(tokens)

    def remove_hashtags(tokens):
        tokens = map(lambda x: x.replace('#', ''), tokens)
        return list(tokens)

    db = pd.read_excel("Classeur1.xlsx", encoding="utf-8")
    dict_values = {'Not Relevant': -1, 'Relevant': 1, "Can't Decide": 0}
    db["to_predict"] = db.choose_one.map(dict_values)
    db = db[["text", "to_predict"]]
    twtk = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    db["token_retreated_text"] = db["text"].apply(
        lambda x: remove_hashtags(remove_url(twtk.tokenize(x))))
    db["retreated_text"] = db["token_retreated_text"].apply(lambda x: " ".join(x))
    if find_and_concatenate_expressions:
        db["token_retreated_text"] = clean_corpus(db["retreated_text"])
        db["retreated_text"] = db["token_retreated_text"].apply(lambda x: " ".join(x))
    msk = np.random.rand(len(db)) < 0.8
    train = db[msk]
    test = db[~msk]
    return train, test
def set_params(self, **parameters):
    """Set the params."""
    for parameter, value in parameters.items():
        setattr(self, '_{}'.format(parameter), value)
    self._tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
def tokenize(s):
    sentence_splitter = TweetTokenizer()
    tokens = sentence_splitter.tokenize(s)
    result = []
    for word in tokens:
        # Normalize to NFKD, drop non-ASCII characters, and decode back to str
        # so the result is a list of strings rather than bytes.
        result.append(unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('ascii'))
    return result
def __init__(self, data_loader):
    self.data = data_loader
    self.tokenizer = TweetTokenizer()
    self.stemmer = PorterStemmer()
    self.stopwords = stopwords.words('english')
    self.re_url = r'http\S+'
    self.punctuation = string.punctuation
    self.vocab = defaultdict(set)
def preprocess(comments, preprocessors):
    tokenizer = TweetTokenizer()
    html_cleaner = re.compile('<.+?>')
    for comment in comments:
        comment = html_cleaner.sub('', comment)
        tokenized_comment = tokenizer.tokenize(comment)
        for preprocessor in preprocessors:
            tokenized_comment = preprocessor.optimize(tokenized_comment)
        yield tokenized_comment
def __init__(self, tokenizer="tweet", punctuation=True, verbose=1):
    self.contextualizer = Contextualizer()
    self.corrector = Corrector(word2index=self.contextualizer.word2index,
                               index2word=self.contextualizer.index2word)
    self.tokenizer_type = tokenizer
    self.keep_punctuation = punctuation
    if self.tokenizer_type == "tweet":
        self.tokenizer = TweetTokenizer()
    self.verbose = verbose
def tokeniza(chars, keyword=None):
    """Tokenize a string (duplicates keywords if any)."""
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(chars)
    return tokens
def modify_abbrev(tweet):
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    # Expand known abbreviations. Reassigning the loop variable would not change the
    # list, so build the expanded token list explicitly before joining.
    tokens = [abbreviations[w.lower()] if w.lower() in abbreviations else w
              for w in tokens]
    text = ' '.join(tokens)
    return text
def clear_data(tweet):
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(tweet)
    clean_tweet = tokens \
        | remove_urls \
        | process_hashtags \
        | remove_stopwords \
        | remove_numbers \
        | remove_multiple_occurrence
    return ' '.join(clean_tweet)
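# The `|` chaining above implies pipeline-style stages (for example, the `pipe`
# library's Pipe decorator); remove_urls and the other stages are defined elsewhere.
# A hedged sketch of one such stage under that assumption, not the author's code:
from pipe import Pipe

@Pipe
def remove_urls(tokens):
    # drop tokens that look like hyperlinks
    return (t for t in tokens if not t.startswith(('http://', 'https://')))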
def preprocess(text):
    tokenizer = TweetTokenizer()
    # Remove stopwords.
    tokens = tokenizer.tokenize(text)
    tokens = [token for token in tokens
              if token not in ENGLISH_STOPWORDS and token.isalpha()]
    return tokens
def tokenize(text):
    tweet_tokenizer = TweetTokenizer()
    # 1. Tokenize
    text = tweet_tokenizer.tokenize(text)
    # 2. Cleaning
    # Remove punctuation
    text = [t for t in text if t not in string.punctuation]
    # Normalize to lowercase
    text = [t.lower() for t in text]
    return text
def process_tweet_text(tweet):
    if tweet.startswith('@null'):
        return "[Tweet not available]"
    tweet = re.sub(r'\$\w*', '', tweet)               # Remove tickers
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)  # Remove hyperlinks
    tweet = re.sub(r'[' + string.punctuation + ']+', ' ', tweet)  # Remove punctuation like 's
    twtok = TweetTokenizer(strip_handles=True, reduce_len=True)
    tokens = twtok.tokenize(tweet)
    tokens = [i.lower() for i in tokens
              if i not in stopwords and len(i) > 2 and i in english_vocab]
    return tokens
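# `stopwords` and `english_vocab` are defined outside this snippet. english_vocab is
# commonly built from NLTK's words corpus; a sketch of plausible definitions
# (assumptions, not the author's exact setup):
import nltk
from nltk.corpus import stopwords as nltk_stopwords

stopwords = set(nltk_stopwords.words('english'))
english_vocab = set(w.lower() for w in nltk.corpus.words.words())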
def normalize_messages(messages):
    tokenizer = TweetTokenizer(preserve_case=False)
    normalized_messages = []
    for message in messages:
        try:
            tokens = tokenizer.tokenize(message)
            text = [word.lower() for word in Text(tokens)]
            if text:
                normalized_messages.append(text)
        except TypeError:
            pass
    return normalized_messages
def __init__(self,
             preserve_case: Boolean(),
             reduce_len: Boolean(),
             strip_handles: Boolean()):
    self.preserve_case = preserve_case
    self.reduce_len = reduce_len
    self.strip_handles = strip_handles
    NltkTokenizer.__init__(self)
    _TweetTokenizer.__init__(
        self,
        preserve_case=preserve_case,
        reduce_len=reduce_len,
        strip_handles=strip_handles,
    )
def create_matrix(tweets: List, name: str = 'oscar pistorius') -> csr_matrix:
    matrix_loc = Path('data', name, 'tf_idf_matrix.pickle')
    if matrix_loc.exists():
        logger.info("Matrix exists! loading...")
        with matrix_loc.open('rb') as f:
            matrix = pickle.loads(f.read())
        return matrix
    stemmer = PorterStemmer()
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    texts = []
    for tweet in tqdm(tweets, desc="(create_matrix) iterating over tweets..."):
        text = tweet.text
        tokens = tokenizer.tokenize(text)
        text_proc = []
        for token in tokens:
            token = token.strip()
            if len(token) < 3:
                continue
            elif token in stopwords.words('english'):
                continue
            elif nlp_utils.match_url(token):
                continue
            elif token in string.punctuation:
                continue
            # elif token.startswith(("#", "$")):
            #     continue
            token = token.translate({ord(k): "" for k in string.punctuation})
            token = stemmer.stem(token)
            token = token.strip()
            if token == "":
                continue
            text_proc.append(token)
        texts.append(text_proc)
    # Documents are already tokenized, so pass an identity tokenizer and skip lowercasing.
    vectorizer = TfidfVectorizer(analyzer="word", tokenizer=lambda x: x, lowercase=False)
    m = vectorizer.fit_transform(texts)
    logger.info("Saving computed matrix...")
    with matrix_loc.open('wb') as f:
        f.write(pickle.dumps(m))
    return m
def tokenize_text(text):
    """
    Transforms the specified text into tokens using the Twitter tokenizer.

    @params: str
        Input text to tokenize
    @returns: list(str)
        Returns the tokens as a list of lowercased strings.
    """
    tokenizer = TweetTokenizer()
    # tokenizing the text
    tokens = tokenizer.tokenize(text)
    words = [w.lower() for w in tokens]
    return words
def compare(topicsFileName, headlinesFileName):
    """
    Compares a set of detected trending topics to a list of headlines in the JSON
    format provided by NewsAPI. A detected trending topic is considered to match a
    headline if the intersection of the headline terms and the topic terms covers at
    least 40% of the smaller of the two term sets.

    Returns the list of trending topics that are included in the provided headlines,
    as well as:
      recall:    number of matching topics divided by number of headlines
      precision: number of matching topics divided by number of topics
    """
    # load topics from file
    topics = []
    with open(topicsFileName, 'r', encoding='utf-8') as tf:
        topics = [json.loads(line) for line in tf]
    # load headlines from file
    headlines = []
    with open(headlinesFileName, 'r', encoding='utf-8') as hf:
        headlines = [json.loads(line) for line in hf]
    # prepare stemmer and tokenizer
    stemmer = PorterStemmer(mode=PorterStemmer.MARTIN_EXTENSIONS)
    tokenizer = TweetTokenizer()
    # compare every topic with every headline
    matchingTopics = []
    for tIter, topic in enumerate(topics):
        print('\r', tIter + 1, len(topics), end='', file=sys.stderr)
        for headline in headlines:
            # split the headline title (rather than the description) into stemmed terms
            if 'title' not in headline or headline['title'] is None or len(headline['title']) == 0:
                continue
            usedText = headline['title']
            headlineTerms = [
                stemmer.stem(term) for term in tokenizer.tokenize(usedText)
                if term not in stopwords.stopwords + stopwords.moreStopwords
            ]
            # check for inclusion of the topic in the headline
            if len(set(topic['terms'].keys()) & set(headlineTerms)) >= \
                    0.4 * min(len(set(headlineTerms)), len(set(topic['terms'].keys()))):
                matchingTopics.append(topic)
                break
    print(file=sys.stderr)
    precision = len(matchingTopics) / len(topics)
    recall = len(matchingTopics) / len(headlines)
    return matchingTopics, recall, precision
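# Worked illustration of the 40% criterion above, with hypothetical values: for topic
# terms {'vote', 'elect', 'poll'} and headline terms {'elect', 'poll', 'result', 'countri'},
# the intersection has 2 terms and the threshold is 0.4 * min(4, 3) = 1.2, so 2 >= 1.2
# and the topic counts as matching that headline.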
def preprocess(text, sentiments, w2i, maxlen, shuffle=True):
    tokenizer = TweetTokenizer()
    reviews = []
    for t in text:
        tokens = list(tokenizer.tokenize(t))
        token_idx = convert_str_to_idx(tokens, w2i, maxlen)
        reviews.append(token_idx)
    txt, sents = torch.LongTensor(reviews), torch.FloatTensor(sentiments)
    if shuffle:
        txt, sents = _shuffle(txt, sents)
    return txt, sents.unsqueeze(1)
def __init__(self, preserve_case=True, strip_handles=True, reduce_len=True):
    # TweetTokenizer's positional order is (preserve_case, reduce_len, strip_handles),
    # so pass the options by keyword to avoid swapping reduce_len and strip_handles.
    TweetTokenizer.__init__(self,
                            preserve_case=preserve_case,
                            reduce_len=reduce_len,
                            strip_handles=strip_handles)