import numpy as np
import pandas as pd
import sklearn.feature_extraction.text as text_sk
from nltk import tokenize


def bag_of_words(tr_tweets, te_tweets, tr_targets=pd.Series(), te_targets=pd.Series(),
                 per_target=False, max_feats=None, normalise_counts=False, **kwargs):
    """
    Calculate bag-of-words representations of train and test tweets

    :param tr_tweets: pandas Series of strings, raw texts to convert (from train set)
    :param te_tweets: pandas Series of strings, raw texts to convert (from test set)
    :param tr_targets: pandas Series of strings, target classes (from train set)
    :param te_targets: pandas Series of strings, target classes (from test set)
    :param per_target: bool, whether to find a separate BoW representation for each target class
    :param max_feats: int, maximum number of words/n-grams to keep, i.e. the number of
        dimensions in the returned feature matrices
    :param normalise_counts: bool, whether to divide the counts within each tweet by the
        number of tokens (not for Multinomial NB)
    :param kwargs: passed on to sklearn CountVectorizer
    :return: tuple, training feature matrix, test feature matrix, list of feature names
        (with '_bow' appended to each)
    """
    if per_target and not tr_targets.empty and not te_targets.empty:
        # Create a different BoW for each target.
        # Only useful if using max_features, as the most common words/n-grams
        # may occur for only one or two of the targets.
        x_tr = np.zeros((tr_tweets.shape[0], max_feats), dtype=np.int64)
        x_te = np.zeros((te_tweets.shape[0], max_feats), dtype=np.int64)
        for _targ in tr_targets.unique():
            word_bagger = text_sk.CountVectorizer(max_features=max_feats, **kwargs)
            x_tr[(tr_targets == _targ).values] = \
                word_bagger.fit_transform(tr_tweets[(tr_targets == _targ).values].values).toarray()
            x_te[(te_targets == _targ).values] = \
                word_bagger.transform(te_tweets[(te_targets == _targ).values].values).toarray()
    else:
        word_bagger = text_sk.CountVectorizer(max_features=max_feats, **kwargs)
        x_tr = word_bagger.fit_transform(tr_tweets).toarray()
        x_te = word_bagger.transform(te_tweets).toarray()

    if normalise_counts:
        # Normalise counts by the length (token count) of each tweet
        tr_tweet_lens = tr_tweets.apply(tokenize.TweetTokenizer().tokenize).apply(len)
        te_tweet_lens = te_tweets.apply(tokenize.TweetTokenizer().tokenize).apply(len)
        x_tr = np.divide(x_tr, tr_tweet_lens.values[:, np.newaxis])
        x_te = np.divide(x_te, te_tweet_lens.values[:, np.newaxis])

    return x_tr, x_te, [_fn + '_bow' for _fn in word_bagger.get_feature_names()]
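# A minimal usage sketch (not part of the original project): the toy tweets below
# are made up, and the call assumes a scikit-learn version that still provides
# CountVectorizer.get_feature_names().
tr_tweets = pd.Series(["climate change is real", "we must act on climate now"])
te_tweets = pd.Series(["climate action now"])
x_tr, x_te, feat_names = bag_of_words(tr_tweets, te_tweets,
                                      max_feats=10, normalise_counts=True)
print(x_tr.shape, x_te.shape, feat_names)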
from nltk import tokenize


def tokenizer(tweets):
    tokens = list()
    tk = tokenize.TweetTokenizer(strip_handles=True, reduce_len=True,
                                 preserve_case=False)
    for tweet in tweets:
        try:
            element = tk.tokenize(tweet)
        except UnicodeDecodeError:
            # Fall back to an empty token list for tweets that cannot be decoded
            element = []
        tokens.append(element)
    return tokens
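# Hypothetical call (not from the original source): handles are stripped, text is
# lowercased, and elongated words such as "greeeeat" are shortened by reduce_len.
sample_tweets = ["@nltk_org this tokenizer is greeeeat!!! #nlp",
                 "Check https://example.com :-)"]
print(tokenizer(sample_tweets))  # one token list per tweet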
import os
import logging

from gensim.models import Word2Vec
from nltk import tokenize

# WORD2VEC_MODEL_FILE, CRYPTOCURRENCY_SUBREDDITS, NUM_TRAINING_SUBMISSIONS_PER_SUBREDDIT
# and the praw `reddit` client are defined elsewhere in the original module.


def _get_word2vec():
    '''
    Loads the word2vec model from the {WORD2VEC_MODEL_FILE} filepath.
    If the file cannot be found, creates a new model by going through the top
    posts of all ccsr subreddits.

    :return: A gensim word2vec model centered on ccsr subreddits
    '''
    if os.path.isfile(WORD2VEC_MODEL_FILE):
        logging.info('Loading word2vec model from file {0} ...'.format(WORD2VEC_MODEL_FILE))
        return Word2Vec.load(WORD2VEC_MODEL_FILE)

    word_tokenizer = tokenize.TweetTokenizer()
    sentences = []

    def parse_comment(comment, subreddit_sentences):
        subreddit_sentences.append(word_tokenizer.tokenize(comment.body))
        for reply in comment.replies:
            parse_comment(reply, subreddit_sentences)

    for ccsr in CRYPTOCURRENCY_SUBREDDITS:
        logging.info("Compiling submission titles and comments from subreddit '{0}' ...".format(ccsr))
        subreddit = reddit.subreddit(ccsr)
        subreddit_sentences = []
        for submission in subreddit.top(limit=1e10):
            logging.info("Looking at submission '{0}'".format(submission.title))
            subreddit_sentences.append(word_tokenizer.tokenize(submission.title))
            submission.comments.replace_more(limit=0)
            for comment in submission.comments:
                parse_comment(comment, subreddit_sentences)
            logging.info("Collected {0} sentences ... {1} in training set.".format(
                len(subreddit_sentences), len(subreddit_sentences) + len(sentences)))
            if len(subreddit_sentences) > NUM_TRAINING_SUBMISSIONS_PER_SUBREDDIT:
                sentences.extend(subreddit_sentences)
                break

    logging.info('Training model on {0} sentences ...'.format(len(sentences)))
    model = Word2Vec(sentences, size=200, window=5, min_count=5, workers=4)  # gensim < 4.0 API (`size=`)
    model.save(WORD2VEC_MODEL_FILE)
    return model
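# Hypothetical follow-up (not in the original source): building the model from
# scratch requires a configured praw `reddit` client and can take a long time;
# once loaded, standard gensim queries apply for words in the trained vocabulary.
w2v = _get_word2vec()
if 'bitcoin' in w2v.wv:
    print(w2v.wv.most_similar('bitcoin', topn=5))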
def tokenize(self, sentences, task_ids):
    # nltk TweetTokenizer for stance
    tweet_tokenizer = tokenize.TweetTokenizer()
    # nltk WordPunctTokenizer for NLI
    punct_tokenizer = tokenize.WordPunctTokenizer()
    all_sentence = []
    for sentence, task_id in zip(sentences, task_ids):
        if task_id == 0:  # stance
            tokenize_sent = tweet_tokenizer.tokenize(sentence)
        elif task_id == 1:  # NLI
            tokenize_sent = punct_tokenizer.tokenize(sentence)
        all_sentence.append(tokenize_sent)
    return all_sentence
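# A standalone sketch (not from the original class) of why two tokenizers are used:
# TweetTokenizer keeps hashtags and emoticons intact for stance tweets, while
# WordPunctTokenizer splits on punctuation for NLI sentences.
from nltk import tokenize as nltk_tokenize

print(nltk_tokenize.TweetTokenizer().tokenize("Totally agree with @user :-) #ClimateAction"))
print(nltk_tokenize.WordPunctTokenizer().tokenize("A man isn't eating; he's reading."))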
import nltk.tokenize as tk


def get_tweets_with_emoji(tweets, emojis_ours, emojis_theirs, emojis_popular):
    """Get all tweets with emoji in the sets

    Args:
        tweets: List of Tweets
        emojis_ours: Emoji vectors trained on our model
        emojis_theirs: Emoji vectors trained on an external model
        emojis_popular: List of popular emojis

    Returns:
        All tweets containing emoji
    """
    tokenizer = tk.TweetTokenizer(preserve_case=False, reduce_len=True,
                                  strip_handles=True)
    ems = list()
    for tweet in tweets:
        # get_emojis_in_tweet is defined elsewhere in the same module
        if get_emojis_in_tweet(tweet, emojis_ours, emojis_theirs, emojis_popular,
                               tokenizer):
            ems.append(tweet)
    return ems
from nltk import tokenize


def sentenceTransform(sentenceList):
    # Given a list of sentences, return a cleaned, tokenized sentence list with
    # mentions, URLs, numbers and hashtags replaced by placeholder tokens.
    token_sentence_list = []
    tknzr = tokenize.TweetTokenizer()
    for sentence in sentenceList:
        sentence = sentence.lower()          # to lowercase
        sentence = tknzr.tokenize(sentence)  # tokenize
        token_sentence_list.append(sentence)
    for sentence in token_sentence_list:
        for idx, word in enumerate(sentence):
            if word == '@user':
                sentence[idx] = '<user>'
            if word == 'url':
                sentence[idx] = '<url>'
            if word.isdigit():
                sentence[idx] = '<number>'
            if word[0] == '#':
                sentence[idx] = '<hashtag>'
    return token_sentence_list
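# Hypothetical example (not from the original source): the function expects
# mentions and URLs to already be normalised to '@user' and 'url' upstream.
sample = ["@user Check url , 10 people liked this #win"]
print(sentenceTransform(sample))
# e.g. [['<user>', 'check', '<url>', ',', '<number>', 'people', 'liked', 'this', '<hashtag>']]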
import numpy as np
import nltk.tokenize as tk


def prepare_tweet_vector_averages(tweets, p2v):
    """Average the vectors of all tokens in each tweet

    Args:
        tweets: All tweets
        p2v: Phrase2Vec model

    Returns:
        Average vectors for each tweet
        Truth labels
    """
    tokenizer = tk.TweetTokenizer(preserve_case=False, reduce_len=True,
                                  strip_handles=True)
    avg_vecs = list()
    y = list()
    for tweet in tweets:
        tokens = tokenizer.tokenize(tweet.text)
        avg_vecs.append(np.sum([p2v[x] for x in tokens], axis=0) / len(tokens))
        y.append(tweet.label)
    return avg_vecs, y
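# Illustrative only: a dummy embedding lookup stands in for the Phrase2Vec model,
# and a namedtuple stands in for the project's Tweet objects (which expose .text
# and .label).
from collections import defaultdict, namedtuple
import numpy as np

Tweet = namedtuple('Tweet', ['text', 'label'])
dummy_p2v = defaultdict(lambda: np.random.rand(300))  # stand-in for Phrase2Vec

sample = [Tweet("happy days :)", "positive"), Tweet("this is awful :(", "negative")]
avg_vecs, y = prepare_tweet_vector_averages(sample, dummy_p2v)
print(avg_vecs[0].shape, y)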
from nltk import tokenize


def predict_nltk_twitter_tokenizer(sentences: list) -> list:
    """
    Predict all sentences sequentially.

    Parameters
    ----------
    sentences: list
        List of strings with pre-processed sentences.

    Returns
    -------
    list:
        List of predicted tokens for each sentence.
    """
    pred_tokens = []
    tokenizer = tokenize.TweetTokenizer()
    for sentence in sentences:
        pred_tokens.append(tokenizer.tokenize(sentence))
    return pred_tokens
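# Hypothetical call (not from the original source):
print(predict_nltk_twitter_tokenizer(["don't worry , be happy :-)",
                                      "RT @user : tokenization works"]))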
import gensim.models as gsm
from parse_recipes import load_recipes
from config import cfg
import phrase2vec as p2v
import numpy as np
import json
import pickle
import nltk.tokenize as tk

tokenizer = tk.TweetTokenizer(preserve_case=False, reduce_len=True,
                              strip_handles=True)


def create_recipe_vectors(p2v_our_emoji):
    """
    Create recipe vectors by averaging embeddings of ingredients and saving the result
    """
    recipes = load_recipes()
    recipes_clean = {}
    for key, value in recipes.items():
        ingredients_tokens = [
            tokenizer.tokenize(ingredient)
            for ingredient in value['ingredients_clean']
        ]
        ingredients_tokens_flat = [
            ingredient for sublist in ingredients_tokens for ingredient in sublist
        ]
class TweetTokenizer(BaseTokenizer):
    """
    Pre-trained tokenizer for tweets.
    """
    tokenizer = tokenize.TweetTokenizer()
    name = 'Tweet'
import copy, inspect
import os
import re

import pandas as pd
from scipy.spatial.distance import cosine
from nltk.stem.snowball import SnowballStemmer
import nltk.tokenize as nltk_token

stem = SnowballStemmer("english").stem

link_re = re.compile(r"(http(s)?[^\s]*)|(pic\.[s]*)")
hashtag_re = re.compile(r"#[a-zA-Z0-9_]+")
mention_re = re.compile(r"@[a-zA-Z0-9_]+")
pat_type = {'links': link_re, 'hashtags': hashtag_re, 'mentions': mention_re}

tokenizers = {'treebank': nltk_token.TreebankWordTokenizer().tokenize,
              'wordpunct': nltk_token.WordPunctTokenizer().tokenize,
              'tweettokenize': nltk_token.TweetTokenizer().tokenize}


def read_file(path):
    if not os.path.exists(path):
        raise ValueError("Path does not point to existing file: {}".format(path))
    ending = path.split('.')[-1]
    if ending == 'csv':
        return pd.read_csv(path)
    elif ending == 'tsv':
        return pd.read_csv(path, delimiter='\t')
    elif ending == 'pkl':
        return pd.read_pickle(path)
    elif ending == 'json':
        return pd.read_json(path)
import fasttext
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk import tokenize
from sklearn.cluster import AgglomerativeClustering

import config  # project-local module providing get_data_path()

DATA_PATH = config.get_data_path()
STORIES_FILE = DATA_PATH / 'stories_processed.csv'

stories = pd.read_csv(STORIES_FILE, index_col=0,
                      converters={'all_text': eval, 'font_size': eval, 'guids': eval})

ft_model_path = DATA_PATH / '../fasttext' / 'cc.ru.300.bin'
model = fasttext.load_model(str(ft_model_path))
tokenizer = tokenize.TweetTokenizer()

vectors = []
for text in stories['all_text']:
    text = ' '.join(text).lower()
    text = tokenizer.tokenize(text)
    vector = np.array([model[word] for word in text]).mean(axis=0)
    vectors.append(vector)

cluster_model = AgglomerativeClustering(n_clusters=8)
clusters = cluster_model.fit_predict(vectors)

stories['clusters'] = clusters
stories.to_csv(STORIES_FILE)
class TweetTokenizer(BaseTokenizer):
    """
    Pre-trained tokenizer for tweets. Preserves emoticons.

    This example. :-) #simple → (This), (example), (.), (:-)), (#simple)
    """
    tokenizer = tokenize.TweetTokenizer()
    name = '推特分词'  # display name: "Tweet tokenization"
VERSION_STR = 'v1.0.0'

import db
import time
from error import Error
from flask import Blueprint, request, jsonify, json, g

blueprint = Blueprint(VERSION_STR, __name__)

import nltk.data
from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

VADER_SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
WORD_TOKENIZER = tokenize.TweetTokenizer()
SENT_TOKENIZER = nltk.data.load('tokenizers/punkt/english.pickle')
PARA_TOKENIZER = tokenize.BlanklineTokenizer()


def score_word(word):
    return VADER_SENTIMENT_ANALYZER.lexicon.get(word, 0.0)


def compute_sentiment_record(text):
    sentiment_record = {'text': text}
    sentiment_record.update(VADER_SENTIMENT_ANALYZER.polarity_scores(text))
    return sentiment_record


def r_remove_key(o, keys_to_remove):
    if hasattr(o, 'iteritems'):
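# Illustrative only (not part of the blueprint above): score_word looks a single
# token up in VADER's lexicon; compute_sentiment_record adds the polarity scores.
# Requires the NLTK data packages vader_lexicon and punkt to be downloaded.
print(score_word('great'))  # positive lexicon valence
print(compute_sentiment_record("I love this library, but the docs are thin."))
# -> {'text': ..., 'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}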
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 9 17:25:19 2018

@author: miaoji
"""
import nltk.tokenize as nt
from textblob import TextBlob
import time

start_time = time.time()

in_file = open("/data/zhangbin/caozhaojun/true_procress_data/daodao_en.txt", 'r')
out_file = open("handle_daodao_en.txt", 'a+')

tokenizer = nt.TweetTokenizer()

line_id = 0
for line in in_file.readlines():
    line_id += 1
    if line_id % 1000 == 0:
        print(line_id)
    correct_line = TextBlob(line.lower().replace('...', ' ').strip())  # .correct()
    token_line = correct_line.tokenize(tokenizer)
    final_line = ' '.join([word for word in token_line])
    out_file.write(final_line + '\n')

in_file.close()
out_file.close()

end_time = time.time()