def get_utterances(utterances, line, category, wgram, cgram):
    tknzr = TweetTokenizer()
    # WORD GRAMS
    if wgram == 1:  # unigram
        wgram_list = tknzr.tokenize(line)
    elif wgram == 2:  # uni + bigram
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        finder = BigramCollocationFinder.from_words(tokens)
        scored = finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, score in scored)
        wgram_list = tknzr.tokenize(line) + bigram_list
    elif wgram == 3:  # uni + bi + trigram
        tokens = nltk.wordpunct_tokenize(line)
        # bigram list
        bi_finder = BigramCollocationFinder.from_words(tokens)
        bi_scored = bi_finder.score_ngrams(bigram_measures.raw_freq)
        bigram_list = sorted(bigram for bigram, biscore in bi_scored)
        # trigram list
        tri_finder = TrigramCollocationFinder.from_words(tokens)
        tri_scored = tri_finder.score_ngrams(trigram_measures.raw_freq)
        trigram_list = sorted(trigram for trigram, triscore in tri_scored)
        wgram_list = tknzr.tokenize(line) + bigram_list + trigram_list

    # CHAR GRAMS
    cgram_list = []
    if cgram == 1:    # uni-chargram
        cgram_list = [line[i:i + 1] for i in range(len(line) - 1)]
    elif cgram == 2:  # bi-chargram
        cgram_list = [line[i:i + 2] for i in range(len(line) - 1)]
    elif cgram == 3:  # tri-chargram
        cgram_list = [line[i:i + 3] for i in range(len(line) - 1)]

    # RESULT: map each category to its numeric label
    # ('QA' = non-task, 'Chat' = chat, everything else = a task category)
    category_labels = {'QA': 0, 'Shopping': 1, 'Travel': 2, 'Hotel': 3, 'Food': 4,
                       'Art': 5, 'Weather': 6, 'Friends': 7, 'Chat': 8}
    if category in category_labels:
        utterances.append((wgram_list + cgram_list, category_labels[category]))
    else:
        print category, "ERROR"
def load_data_and_labels_semeval():
    # load the entire semeval dataset
    old_dataset = list(open("./input/2013-dev"))
    old_dataset.extend(list(open("./input/2013-devtest")))
    old_dataset.extend(list(open("./input/2013-train")))
    old_dataset.extend(list(open("./input/2014-devtest")))

    new_dataset = list(open("./input/2016-train"))
    new_dataset.extend(list(open("./input/2016-dev")))
    new_dataset.extend(list(open("./input/2016-devtest")))

    # filter out invalid tweets from new dataset
    new_dataset = [entry for entry in new_dataset
                   if entry.split('\t')[2] != 'Not Available\n']

    # generate x from old
    tk = TweetTokenizer(reduce_len=True)  # handles punctuations
    x_text = [entry.split('\t')[3] for entry in old_dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate x from new
    x_text_new = [entry.split('\t')[2] for entry in new_dataset]
    x_text_new = [clean_str(tweet) for tweet in x_text_new]
    x_text_new = [tk.tokenize(tweet) for tweet in x_text_new]

    # concat x and x_new
    x_text.extend(x_text_new)

    # generate y from old
    y = [entry.split('\t')[2] for entry in old_dataset]
    for idx, label in enumerate(y):
        if label == 'positive':
            y[idx] = [1, 0, 0]
        elif label == 'neutral':
            y[idx] = [0, 1, 0]
        elif label == 'negative':
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # generate y from new
    y_new = [entry.split('\t')[1] for entry in new_dataset]
    for idx, label in enumerate(y_new):
        if label == 'positive':
            y_new[idx] = [1, 0, 0]
        elif label == 'neutral':
            y_new[idx] = [0, 1, 0]
        elif label == 'negative':
            y_new[idx] = [0, 0, 1]
        else:
            print 'wrong label in semeval: ' + label

    # concat y and y_new
    y.extend(y_new)
    return [x_text, y]
def custom_tokenizer(text, bigrams=None):
    # split on hyphens first, then tokenize each chunk
    chunks = text.split('-')
    tokenizer = TweetTokenizer(reduce_len=True, preserve_case=False)
    tokens = [subchunk for chunk in chunks
              for subchunk in tokenizer.tokenize(chunk)]
    # keep alphabetic tokens only
    tokens = [token for token in tokens if token.isalpha()]
    if bigrams:
        tokens = mwe_tokenize(tokens, bigrams)
    stemmer = SnowballStemmer('english', ignore_stopwords=True)
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens
def getVocab(): freq = [] vocab = [] length = 0 tknzr = TweetTokenizer() with open(path+'/data/training/training_stances.csv', 'r', encoding='UTF-8') as csvDataFile: csvReader = csv.reader(csvDataFile) first = 1 for row in csvReader: if first == 1: first = 0 else: headline = row[0] tokens = tknzr.tokenize(headline) tokens=[token.lower() for token in tokens if (token.isalpha() and token not in stop_words)] #for word in r.split(headline): length = length + len(tokens) for word in tokens: if word not in vocab: vocab.append(word) freq.append(1) else: ind = vocab.index(word) freq[ind] = freq[ind] + 1 with open(path+'/data/training/train_bodies.csv', 'r', encoding='UTF-8') as csvDataFile: csvReader = csv.reader(csvDataFile) first = 1 for row in csvReader: if first == 1: first = 0 else: body = row[1] tokens = tknzr.tokenize(body) tokens=[token.lower() for token in tokens if (token.isalpha() and token not in stop_words)] length = length + len(tokens) #for word in r.split(headline): for word in tokens: if word not in vocab: vocab.append(word) freq.append(1) else: ind = vocab.index(word) freq[ind] = freq[ind] + 1 return vocab, freq, length #vocab list #vocab, freq, length = getVocab()
def get_classifier(featx):
    tokenizer = TweetTokenizer()
    print "Training Classifier..."
    negstr = [obj["text"] for obj in handle.negative_tweets.find()]
    posstr = [obj["text"] for obj in handle.positive_tweets.find()]
    negfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(negstr[i]))), 'neg')
                for i in range(0, len(negstr) - 1)]
    posfeats = [(featx(tokenizer.tokenize(Twitter.process_tweet(posstr[i]))), 'pos')
                for i in range(0, len(posstr) - 1)]
    trainfeats = negfeats + posfeats
    classifier = NaiveBayesClassifier.train(trainfeats)
    return classifier
def get_features(utterances, ngram, classify_method):
    features = []
    tknzr = TweetTokenizer()
    for utt in utterances:
        utt_content = utt[0]   # text content of the utterance
        utt_category = utt[1]
        if ngram:
            # use bow & ngram as feature
            bow_list = tknzr.tokenize(utt_content)
            # character n-gram lists
            uni_cgram_list = [utt_content[i:i + 1] for i in range(len(utt_content) - 1)]
            bi_cgram_list = [utt_content[i:i + 2] for i in range(len(utt_content) - 1)]
            tri_cgram_list = [utt_content[i:i + 3] for i in range(len(utt_content) - 1)]
            feature_list = bow_list           # add bow tokens
            feature_list += uni_cgram_list    # add unigram character lists
            feature_list += bi_cgram_list     # add bigram character lists
            feature_list += tri_cgram_list    # add trigram character lists
        else:
            # only use bow as feature
            feature_list = tknzr.tokenize(utt_content)

        if classify_method == 'binary':
            if utt_category == 'QA':  # non-task
                features.append((feature_list, 0))
            else:                     # task
                features.append((feature_list, 1))
        elif classify_method == 'multi':
            # map each category to its numeric label
            # ('QA' = non-task, 'Chat' = chat, everything else = a task category)
            category_labels = {'QA': 0, 'Shopping': 1, 'Travel': 2, 'Hotel': 3, 'Food': 4,
                               'Art': 5, 'Weather': 6, 'Friends': 7, 'Chat': 8}
            if utt_category in category_labels:
                features.append((feature_list, category_labels[utt_category]))
            else:
                print utt_category, "ERROR"
    return features
def get_test(infile, NUM_TEST): with codecs.open(infile, 'rb') as csvfile: test = [] pos_tweets = 0 neg_tweets = 0 reader = csv.reader(csvfile) tokenizer = TweetTokenizer(preserve_case=True) for line in reader: if line[0] == "0": sent="Negative" neg_tweets+=1 if neg_tweets < NUM_TEST: text = tokenizer.tokenize(line[5].decode("utf-8")) for i,token in enumerate(text): text[i] = re.sub("@[\S]+", "USERNAME", text[i]) text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i]) newstr = "" for ch in text[i]: if ord(ch)>128: newstr+= "EMOJI_{0}".format(ord(ch)) #print [ch], ord(ch) else: newstr+=(ch) text[i] = newstr test.append((text, sent)) if line[0] == "4": sent = "Positive" pos_tweets+=1 if pos_tweets < NUM_TEST: text = tokenizer.tokenize(line[5].decode("utf-8")) for i,token in enumerate(text): text[i] = re.sub("@[\S]+", "USERNAME", text[i]) text[i] = re.sub("www.[\S]+|https://[\S]+", "URL", text[i]) newstr = "" for ch in text[i]: if ord(ch)>128: newstr+= "EMOJI_{0}".format(ord(ch)) #print [ch], ord(ch) else: newstr+=(ch) text[i] = newstr test.append((text, sent)) return test
def _get_nouns(tweet_text):
    """
    Extract nouns from a tweet.

    Args:
        tweet_text: raw text of the tweet

    Returns:
        list of tokens tagged NN or NNP
    """
    tokenizer = TweetTokenizer()
    nouns = []
    tag = pos_tag(tokenizer.tokenize(tweet_text))
    nouns.extend([t[0] for t in tag if t[1] == 'NN' or t[1] == 'NNP'])
    return nouns
def get_diff(query, event_name):
    tknzr = TweetTokenizer()
    query_strip = tknzr.tokenize(query)
    name_strip = tknzr.tokenize(event_name)
    ratio = 0
    for word in query_strip:
        for word2 in name_strip:
            r = difflib.SequenceMatcher(None, word, word2).ratio()
            rrr = r * r * r
            ratio += rrr
    if ratio >= len(query_strip):
        # doesn't work for some reason
        print ratio, len(name_strip)
        ratio = 100
    return ratio
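# A minimal, self-contained sketch (not part of the original code) of the
# difflib.SequenceMatcher ratio that get_diff() accumulates; the word pairs are made up.
import difflib

for a, b in [("concert", "concerto"), ("jazz", "rock")]:
    r = difflib.SequenceMatcher(None, a, b).ratio()
    print("{} vs {}: ratio={:.3f}, cubed={:.3f}".format(a, b, r, r ** 3))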
def _tag_text(self, tweet_text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(tweet_text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    neList = traverse(entities)
    return neList
def load_data_and_labels_sam():
    # load
    with open("./input/2780_freshmen_tweets.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header

    # filter out tweets with unknown sentiment
    dataset = [entry for entry in dataset if entry[4] != '0']

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[3] for entry in dataset]
    x_text = [clean_str(tweet) for tweet in x_text]
    x_text = [tk.tokenize(tweet) for tweet in x_text]

    # generate y
    y = [entry[4] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':    # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in sam: ' + label

    return [x_text, y]
def load_tweetkeywords():
    """
    Check and see which keywords are used in each tweet, and load the
    association table linking tweets and keywords
    """
    # TweetKeyword.query.delete()

    tweets = Tweet.query.all()
    keyword_query = Keyword.query.all()
    keywords = [word.keyword for word in keyword_query]

    tknzr = TweetTokenizer()
    for tweet in tweets:
        tokenized_tweets = tknzr.tokenize(tweet.text)
        for token in tokenized_tweets:
            if token in keywords:
                tweet_id = Tweet.query.filter(Tweet.tweet_id == tweet.tweet_id).one()
                keyword_id = Keyword.query.filter(Keyword.keyword == token).one()
                tweet_keyword = TweetKeyword(keyword_id=keyword_id.keyword_id,
                                             tweet_id=tweet_id.tweet_id)
                print "Added to TweetKeyword table: {}".format(tweet_keyword.keyword_id)
                db.session.add(tweet_keyword)
                db.session.commit()
def get_tweet_tags(tweet):
    """ Break up a tweet into individual word parts """
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(tweet)
    # replace handles with real names
    for n, tok in enumerate(tokens):
        if tok.startswith('@'):
            handle = tok.strip("@")
            if handle in user.students:
                # If we have a database entry for the mentioned user, we can
                # easily substitute a full name.
                usr = user.NPUser(handle)
                tokens[n] = usr.fullname
            else:
                # If there is no database entry, we use the user's alias. While
                # this is the full name in many cases, it is often not reliable
                usr = api.get_user(handle)
                tokens[n] = usr.name
    tagged = nltk.pos_tag(tokens)
    # In nltk, if a teacher's name is written with a period after an
    # abbreviated prefix, it is awkwardly broken up into 3 tags
    for n, tag in enumerate(tagged):
        # If there is the weird period after the prefix,
        if tag[1] == '.':
            # and it is in fact splitting up a person's name,
            if tagged[n - 1][1] == 'NNP' and tagged[n + 1][1] == 'NNP':
                if tagged[n - 1][0] in ['Mr', 'Ms', 'Mrs', 'Mx']:
                    # combine it into the actual name,
                    tagged[n - 1] = ('{}. {}'.format(tagged[n - 1][0],
                                                     tagged[n + 1][0]), 'NNP')
                    # and then remove the extra tags.
                    del tagged[n + 1]
                    del tagged[n]
    return tagged
def process_tweets(file_name):
    '''
    Person Responsible: Devin Munger

    file_name: filename of tweets as returned from API based on query
    Extract text from file; return dataframe with tweet text, id
    '''
    ## Create empty dataframe
    tweet_df = pd.DataFrame(columns=["text", "id"])
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True)
    ## Read each JSON from file
    with open(file_name) as data_file:
        for entry in data_file.readlines():
            tweet = json.loads(entry)
            tweet_id = str(tweet.get("id", ""))
            text = tweet.get("text", "")
            ## Remove links from text
            text = re.sub(r"http\S+", "", text)
            ## Remove twitter keywords (str.replace is not in-place, so reassign)
            text = text.replace("RT ", "")
            ## Remove handle, punctuation from tweet text
            text_words = filter(lambda x: x not in string.punctuation,
                                tokenizer.tokenize(text))
            ## Add tweet to dataframe
            tweet_df.loc[len(tweet_df)] = [" ".join(text_words), tweet_id]
    return tweet_df
def load_csv():
    with open('Tweets.csv', 'rb') as csvfile:
        reader = csv.DictReader(csvfile)
        count = 1
        reviews = []
        stars = []
        tknzr = TweetTokenizer()
        for row in reader:
            try:
                words = tknzr.tokenize(row['text'])
                label = 'SENT_%s' % count
                # each tweet becomes a TaggedDocument with a unique SENT_<n> label
                reviews.append(TaggedDocument(words, [label]))
                stars.append(row['airline_sentiment'])
                count += 1
            except:
                continue
    print "final count:", count
    return reviews, stars
def nltk_tokenize(text):
    tknzr = TweetTokenizer()
    tokens = tknzr.tokenize(text)
    return tokens
def reasoning(dList):
    reasonList = []
    tokenizer = TweetTokenizer()
    for tweet in dList:
        print tweet
        # tokenize
        words = tokenizer.tokenize(tweet)
        # get POS tag
        pos_tokens = pos_tag(words)
        # get name entities
        tree = ne_chunk(pos_tokens, binary=False)
        # find relations
        pairs = relextract.tree2semi_rel(tree)
        # get interesting name entities
        reason = []
        for s, tree in pairs:
            reasonStr = ("%s") % tree
            reasonStr = reasonStr.split(" ")
            label = reasonStr[0].replace("(", "").strip()
            content = ""
            for wordTag in reasonStr[1:]:
                sp = wordTag.split("/")
                word = sp[0].replace("(", "")
                print word
                content += (word + " ")
            # reason: [(label, content)]
            reason.append({"label": label, "content": content})
        # reasonList: [reason]
        if len(reason) > 0:
            reasonList.append({"reason": reason})
        print str(len(reasonList)) + "/" + str(len(dList))
    return reasonList
def createDataset(filename, MAX_VOCAB_SIZE):
    yaks = []
    tokenizer = TweetTokenizer()
    ids = set()
    numyaks = 0
    for line in open(filename).readlines():
        stuff = line.split(":::")
        id = stuff[0]
        if len(stuff) > 3 and id not in ids:
            numyaks += 1
            sentence = stuff[3]
            ids.add(id)
            tokens = [START_TOKEN]
            tokens.extend(tokenizer.tokenize(sentence.lower()))
            tokens.append(END_TOKEN)
            yaks.append(tokens)
    token_frequency = nltk.FreqDist(itertools.chain(*yaks))
    vocab = token_frequency.most_common(MAX_VOCAB_SIZE - 1)
    i2t = [token[0] for token in vocab]
    i2t.append(UNKNOWN_TOKEN)
    t2i = dict()
    for i, t in enumerate(i2t):
        t2i[t] = i
    yaks = [[t if t in t2i else UNKNOWN_TOKEN for t in yak] for yak in yaks]
    Xtrain = np.asarray([[t2i[token] for token in yak[:-1]] for yak in yaks])
    Ytrain = np.asarray([[t2i[token] for token in yak[1:]] for yak in yaks])
    print "Num unique Yaks: " + str(numyaks)
    return (Xtrain, Ytrain, i2t, t2i)
def main():
    text = sys.stdin.read().decode("utf-8")
    tknzr = TweetTokenizer()
    tok = tknzr.tokenize(text)
    saved_object = construct_dict(tok)
    print json.dumps(saved_object)
def keywords_search(reviews): key_map = {} # for k in open(os.getcwd() + "/KeyWord/keyword_map_general.txt", 'r'): for k in open(keyword_general_path, 'r'): a = k.strip().split(", ") key_map[a[0]] = a[1] special_map = {} # for k in open(os.getcwd() + "/KeyWord/keyword_map_special.txt", 'r'): for k in open(keyword_special_path, 'r'): a = k.strip().split(", ") special_map[a[0]] = a[1] raw = reviews.lower() tokenizer = TweetTokenizer() tokens = tokenizer.tokenize(raw) # remove punctuations no_punc_tokens = [i for i in tokens if (not i in string.punctuation+string.digits) and (not "." in i)] # remove stop words from tokens en_stop = get_stop_words('en') stopped_tokens = [i for i in no_punc_tokens if not i in en_stop] # stem tokens # wordnet_lemmatizer = WordNetLemmatizer() # stemmed_tokens = [wordnet_lemmatizer.lemmatize(i) for i in stopped_tokens ] chosen_key_words = [] # Search in general key word key_words_dict = dict.fromkeys(key_map.values(), 0) # Select keyword use only key word to select # s = set(stemmed_tokens) s = set(stopped_tokens) for t in key_map.keys(): if t in s: key_words_dict[key_map[t]] += 1 for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]: if d[0] > 0: chosen_key_words.append(d[1]) # Search in special keyword special_words_dict = dict.fromkeys(special_map.values(), 0) # Select keyword using wordnet # Select keyword use only key word to select # s = set(stemmed_tokens) s = set(stopped_tokens) for t in special_map.keys(): if t in s: special_words_dict[special_map[t]] += 1 for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]: if d[0] > 0: chosen_key_words.append(d[1]) return ' '.join(chosen_key_words)
def parse(self, text):
    # Tokenize message
    tokenizer = TweetTokenizer()
    words = tokenizer.tokenize(text)
    retweet_term = 'RT'
    urls = []
    users = []
    hash_tags = []
    for word in words:
        if word[0] == '@':
            # user in Twitter
            users.append(word)
        elif word[0] == '#':
            # hash tags
            hash_tags.append(word)
        elif word.find('http:') == 0 or word.find('https:') == 0:
            # url
            urls.append(word)
    for f in urls + users + hash_tags + [retweet_term]:
        if f in words:
            words.remove(f)
    self.words = words
    self.urls = urls
    self.users = users
    self.hash_tags = hash_tags
def format_text(entries, LSTM_shape=True): THIS_FOLDER = str(os.path.dirname(os.path.abspath(__file__))) sentences = [] tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') decoded = base64.b64decode(entries) decoded = str(decoded) decoded = decoded[2:] decoded = decoded[:-1] decoded = decoded.split(".") #print(decoded, "is decoded") for entry in decoded: token_sentences = tokenizer.tokenize(entry) for sentence in token_sentences: sentences.append(sentence) tokenized_sentences = [] #remove_tokens = ['%', ']', '[', '.', ',', '?', '!', '\''] #remove_tokens = string.punctuation remove_tokens = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' stop_words = set(stopwords.words('english')) tweet_tknzr = TweetTokenizer() for sentence in sentences: tokens = tweet_tknzr.tokenize(sentence) tokens = list(filter(lambda a: a not in remove_tokens and a not in stop_words, tokens)) tokenized_sentences.append(tokens) all_ngrams1 = np.load(THIS_FOLDER+'/ngrams1.npy').item() all_ngrams2 = np.load(THIS_FOLDER+'/ngrams2.npy').item() all_ngrams3 = np.load(THIS_FOLDER+'/ngrams3.npy').item() #once the model gets updated with good data, ngrams.py needs to get changed/updated too! X = np.zeros((len(sentences), len(all_ngrams1)+len(all_ngrams2)+len(all_ngrams3))) for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 1) for gram in my_ngrams: if gram in all_ngrams1: index = all_ngrams1[gram] X[i][index] = 1 for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 2) for gram in my_ngrams: if gram in all_ngrams2: index = len(all_ngrams1) + all_ngrams2[gram] X[i][index] = 1 for i in range(len(tokenized_sentences)): sentence = tokenized_sentences[i] my_ngrams = ngrams(sentence, 3) for gram in my_ngrams: if gram in all_ngrams3: index = len(all_ngrams1) + len(all_ngrams2) + all_ngrams3[gram] X[i][index] = 1 if LSTM_shape: X = np.reshape(X, (X.shape[0], 1, X.shape[1])) else: X = np.reshape(X, (X.shape[0], X.shape[1])) return X
def getTweetTokens(classification, toRead, info, tags): i=0 tknzr = TweetTokenizer() with open(toRead) as f: content = f.readlines() c = 0 for item in content: #adapt the list into python dictionary format content[c] = item.replace("null", "None") content[c] = content[c].replace("false", "False") content[c] = content[c].replace("true", "True") c+=1 for i in range(len(content)): tweet = eval(content[i])["text"] tokenTweet = tknzr.tokenize(tweet) j = 0 k = 0 while j < (len(tokenTweet) - k): #print j if tokenTweet[j][0] == "#": tokenTweet[j] = tokenTweet[j][1:] elif tokenTweet[j][0] == "@": del tokenTweet[j] j-=1 k+=1 j+=1 info.append((word_feats(tokenTweet), classification))
def check(): check_id = request.args.get("id") if check_id is not None: check_sentence = Sentence.query.get(check_id) if check_sentence is not None: Word.query.filter_by(sentence_id=check_id).delete() tweet_tokenizer = TweetTokenizer() tokens = tweet_tokenizer.tokenize(check_sentence.text) for token in tokens: url = "http://kateglo.com/api.php?format=json&phrase="+token resp = requests.get(url) exist = False if (resp.ok): try: resp_json = json.loads(resp.content) exist = True except ValueError: exist = False word = Word(check_sentence.id, token, exist) db.session.add(word) db.session.commit() sentences = Sentence.query.all() c = ((sentence.id, sentence.source, sentence.text, ((w.word, w.exist,) for w in sentence.words.all()), ) for sentence in sentences) return render_template('check.html', rows=c)
def preprocess_db(): tkn = TweetTokenizer() photos = pd.read_pickle(r'./data/restaurant_photos_with_labels.pkl') img_path = r'./data/restaurant_photos/' sentid = 1 img_list = [] # Split data in such a way that labels are evenly distributed between 6 folds skf = StratifiedKFold(photos['label'], n_folds=6) folds = [] # Initialize all images to train dataset initially photos['split'] = ['train' for i in range(len(photos))] # Obtain the indices for the test and validation splits and change value appropriately for _, test_ix in skf: folds.append(test_ix) photos.split[folds[0]] = 'test' photos.split[folds[1]] = 'val' # Obtain the information from each picture and move the pictures to the appropriate dir. The images are renamed. for i, photo_id in enumerate(photos.photo_id): img_dict = dict() img_dict['sentids'] = [sentid] img_dict['business_id'] = photo_id.business_id[i] if photos.split[i] in ['train']: img_dict['filepath'] = u'train' img_dict['imgid'] = 0 img_dict['split'] = u'train' shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/train/' + str(sentid).zfill(6) + '.jpg') elif photos.split[i] in ['test']: img_dict['filepath'] = u'test' img_dict['imgid'] = 0 img_dict['split'] = u'test' shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/test/' + str(sentid).zfill(6) + '.jpg') else: img_dict['filepath'] = u'val' img_dict['imgid'] = 0 img_dict['split'] = u'val' shutil.copy(img_path + photo_id + '.jpg', './data/restaurant_photos_split/val/' + str(sentid).zfill(6) + '.jpg') img_dict['label'] = photos.label[i] caption_dict = dict() if photos.caption[i]: # Tokenize the captions caption_dict['tokens'] = tkn.tokenize(photos.caption[i]) caption_dict['raw'] = photos.caption[i] else: caption_dict['tokens'] = 'None' caption_dict['raw'] = 'None' caption_dict['imgid'] = 0 caption_dict['sentid'] = sentid img_dict['sentences'] = [caption_dict] img_dict['photoid'] = sentid img_dict['yelpid'] = photo_id img_list.append(img_dict) sentid += 1 # Store the new dataset as a JSON file with open("./data/image_caption_dataset.json", "w") as outfile: json.dump(img_list, outfile)
def load_data_and_labels_gameforum():
    # load
    with open("./input/gameforum-1000.csv", 'rU') as f:
        rdr = csv.reader(f)
        dataset = list(rdr)[1:]  # remove header
    dataset = [entry for entry in dataset
               if (entry[1] == '1' or entry[1] == '2' or entry[1] == '3')]

    # generate x
    tk = TweetTokenizer(reduce_len=True)
    x_text = [entry[0] for entry in dataset]
    x_text = [clean_str(post) for post in x_text]
    x_text = [tk.tokenize(post) for post in x_text]

    # generate y
    y = [entry[1] for entry in dataset]
    for idx, label in enumerate(y):
        if label == '1':    # positive
            y[idx] = [1, 0, 0]
        elif label == '2':  # neutral
            y[idx] = [0, 1, 0]
        elif label == '3':  # negative
            y[idx] = [0, 0, 1]
        else:
            print 'wrong label in gameforum: ' + label

    return [x_text, y]
def preprocess_tweets(event_date, dt=datetime.timedelta(seconds=30), match=None,
                      tweet_processor=None, match_type='home'):
    import collections
    tknzr = TweetTokenizer()

    dbname = match['dbname']
    collname_home = match['collname_home']
    collname_away = match['collname_away']
    home_team = match['home_team']
    away_team = match['away_team']

    if match_type == 'home':
        coll = client[dbname][collname_home]
    else:
        coll = client[dbname][collname_away]

    # add some padding to the start and end times
    date_start = event_date - dt
    date_end = event_date + dt
    query = {"created_at": {"$gt": date_start, "$lt": date_end}}
    results = coll.find(query)

    clean_tweets = []
    for result in results:
        tweet_id = result['id_str']
        tweet_split = tweet_processor.preprocess(result['text'].encode('ascii', 'ignore'))
        parts = tknzr.tokenize(tweet_split)
        clean = [i for i in parts if i not in stop]
        clean_text = " ".join(clean)
        clean_tweets.append((clean_text, tweet_id))
    return clean_tweets
def get_best_words(): tokenizer = TweetTokenizer() # Analyze frequencies word_fd = FreqDist() label_word_fd = ConditionalFreqDist() negstr = [obj["text"] for obj in handle.negative_tweets.find()] posstr = [obj["text"] for obj in handle.positive_tweets.find()] negwords = [] poswords = [] for i in range(0, len(negstr)-1): for w in tokenizer.tokenize(Twitter.process_tweet(negstr[i])): if w not in stopwords.words("english"): negwords.append(w) for i in range(0, len(posstr)-1): for w in tokenizer.tokenize(Twitter.process_tweet(posstr[i])): if w not in stopwords.words("english"): poswords.append(w) for word in poswords: word_fd[word] += 1 label_word_fd['pos'][word] += 1 for word in negwords: word_fd[word] += 1 label_word_fd['neg'][word] += 1 pos_word_count = label_word_fd['pos'].N() neg_word_count = label_word_fd['neg'].N() total_word_count = pos_word_count + neg_word_count # Score words word_scores = {} for word, freq in word_fd.iteritems(): pos_score = BigramAssocMeasures.chi_sq( label_word_fd['pos'][word], (freq, pos_word_count), total_word_count) neg_score = BigramAssocMeasures.chi_sq( label_word_fd['neg'][word], (freq, neg_word_count), total_word_count) word_scores[word] = pos_score + neg_score # Keep best 10000 words best = sorted( word_scores.iteritems(), key=lambda (w, s): s, reverse=True)[:10000] bestwords = set([w for w, s in best]) return bestwords
def bothTwitterAndMovie(): tknzr = TweetTokenizer(strip_handles=True) onlyWords = re.compile('^[a-zA-Z]+$') f = open('movieTwitter_semtiment_classifier.pickle', 'rb') classifier = pickle.load(f) # type: nltk.classify.naivebayes.NaiveBayesClassifier f.close() # text,created_at tweets = [] onlyWords = re.compile('^[a-zA-Z]+$') labeledTweets = [] for row in csv.DictReader(open('datafiles/trump.csv')): text = row['text'] features = [] for token in tknzr.tokenize(text): if onlyWords.match(token) is not None: features.append(token.lower()) print row['created_at'] tweets.append({ "created_at": row['created_at'], "text": text, "classification": classifier.classify(word_feats(features)) }) classification = open('trumpClassified_both.json', 'w+') classification.write(json.dumps(tweets, indent=2)) classification.close() tweets = [] labeledTweets = [] for row in csv.DictReader(open('datafiles/clinton.csv')): text = row['text'] features = [] for token in tknzr.tokenize(text): if onlyWords.match(token) is not None: features.append(token.lower()) print row['created_at'] tweets.append({ "created_at": row['created_at'], "text": text, "classification": classifier.classify(word_feats(features)) }) classification = open('clintonClassified_both.json', 'w+') classification.write(json.dumps(tweets, indent=2)) classification.close()
def classify(classifier, featx, strings):
    print "Classify request"
    tokenizer = TweetTokenizer()
    mood = []
    for string in strings:
        string = Twitter.process_tweet(string)
        tokenized_text = [word.lower() for word in tokenizer.tokenize(string)]
        mood.append(classifier.classify(featx(tokenized_text)))
    return mood
import re
import sqlite3

import nltk
from nltk.tokenize import MWETokenizer, TweetTokenizer

limit = "limit 5000"  # limit number of results
Q_ALL = "select text from scraped order by scraped.date_posted " + limit

con = sqlite3.connect('baza.db')
cur = con.cursor()
cur.execute(Q_ALL)
data = cur.fetchall()
data = [a[0] for a in data]
data_string = " ".join(data)  # list of texts to string

# TweetTokenizer
tokenizer = TweetTokenizer(preserve_case=False)
tokens = tokenizer.tokenize(data_string)

# remove punctuation and links from tokens
fil = re.compile('.*[A-Za-z0-9].*')
urls = re.compile(
    'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
tokens = [w for w in tokens if fil.match(w)]
tokens = [w for w in tokens if not urls.match(w)]

print("All tokens: %d" % len(tokens))
print("Unique tokens: %d" % len(set(tokens)))
print("Lexical diversity: %f" % (len(set(tokens)) / len(tokens)))

fdist = nltk.FreqDist([t for t in tokens if len(t) > 4])
common = fdist.most_common(20)
if __name__ == '__main__':
    count_lines = 0
    sents_with_ne = 0
    sents_with_comp_super = 0
    sents_with_sentiment = 0
    with open('/home/dasha/Документы/курс/prj-nlp-2020/tasks/02-structural-linguistics/data/examiner-headlines.txt', 'r') as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            count_lines += 1
            tokens = tknzr.tokenize(line)
            cleaned_tokens = [t for t in tokens if t not in string.punctuation]  # and t.lower() not in stop_words
            tagged_tokens = pos_tag(cleaned_tokens)
            if comparative_superlative(tagged_tokens) == 1:
                sents_with_comp_super += 1
            if find_ne(tagged_tokens) == 1:
                sents_with_ne += 1
            if get_sentiment(tagged_tokens) == 1:
                sents_with_sentiment += 1
    print('Sentences with Named Entities: {0}%\nSentences with sentiment: {1}%\nSentences with Adjectives/Adverbs: {2}%'.format(
        (sents_with_ne / count_lines) * 100,
        round((sents_with_sentiment / count_lines) * 100, 2),
        (sents_with_comp_super / count_lines) * 100))

>>> Sentences with Named Entities: 79.64%
>>> Sentences with sentiment: 45.3%
>>> Sentences with Adjectives/Adverbs: 4.06%
def convert_answers_to_words(text):
    tokenizer_words = TweetTokenizer()
    tokens_sentences = [tokenizer_words.tokenize(t)
                        for t in nltk.sent_tokenize(text)]
    print(tokens_sentences)
    return tokens_sentences
text_raw = generate_training(num_train, num_add, path)
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
model_path = "lda_model/model"
print("Total Number of ", len(text_raw), " Training Data Loaded")

# Load stop words
words_stop = stopwords.words('english')
stop_set = set()
for i in range(len(words_stop)):
    stop_set.add(Remove_Symbols(words_stop[i]))

text_split = []
for i in range(len(text_raw)):
    word_nonstop = []
    for word in tknzr.tokenize(text_raw[i]):
        if word not in stop_set:
            if len(word) <= 3:
                continue
            word_nonstop.append(word)
    if len(word_nonstop) > 0:
        text_split.append(word_nonstop)

# Word Dictionary
dic = corpora.Dictionary(text_split)
# Generate Corpus
corpus = [dic.doc2bow(text) for text in text_split]
def tokenizeTweet(s):
    tknzr = TweetTokenizer()
    return tknzr.tokenize(s)
class Preprocessor(object): def __init__(self): self.tokenizer = TweetTokenizer() self.lemmatizer = WordNetLemmatizer() @staticmethod def remove_dates(comment): """ Removes date time and time zone information from the comments """ comment = comment.lower() comment = re.sub( """(jan|january|feb|february|mar|march|apr|april|may|jun|june|jul|july|aug|august|sep|september|oct|october|nov|november|dec|december)\s\d{1,2}\s\d{2,4}""", ' ', comment) comment = re.sub( """\d{1,2}\s(jan|january|feb|february|mar|march|apr|april|may|jun|june|jul|july|aug|august|sep|september|oct|october|nov|november|dec|december)\s\d{2,4}""", ' ', comment) comment = re.sub("""\d{1,2}:\d{1,2}""", ' ', comment) comment = re.sub("""utc""", ' ', comment) comment = " ".join(comment.split()) return comment def clean_text(self, comment): """ This function receives comments and returns clean word-list """ # convert comment to lower case comment = comment.lower() # remove \n (new line characters) comment = re.sub("\\n", " ", comment) # remove URLs comment = re.sub( r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', " ", comment) # remove ip addresses comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", comment) # remove usernames comment = re.sub("\[\[.*\]", " ", comment) # remove date time and time zone comment = self.remove_dates(comment) # remove repeating characters in a word ex: abbbbcd ==> abcd pattern = re.compile(r"(.)\1{2,}", re.DOTALL) comment = pattern.sub(r"\1", comment) # remove repeating words ex: you said that that that ==> you said that comment = re.sub(r'(\W|^)(.+)\s\2', '', comment) # substitute regex patterns for vulgar words ex: f***k ==> f**k for target, patterns in RE_PATTERNS.items(): for pat in patterns: comment = re.sub(pat, target, comment) # remove if there are any extra spaces in comment comment = " ".join(comment.split()) # perform tokenization words = self.tokenizer.tokenize(comment) # (')aphostophe replacement (ie) you're --> you are words = [ APOSTROPHE_MAP[word] if word in APOSTROPHE_MAP else word for word in words ] comment = " ".join(words) # remove special chars comment = re.sub(r"[^a-z0-9!#\$%\^\&\*_\-,\.\'()\/ ]", ' ', comment) # perform lemmatization words = [ self.lemmatizer.lemmatize(word, "v") for word in comment.split() ] # words = [w for w in words if not w in STOPWORDS] clean_sent = " ".join(words) # remove any non alphanum,digit character clean_sent = re.sub("\W+", " ", clean_sent) clean_sent = re.sub(" ", " ", clean_sent) return (clean_sent)
def processTweetText(raw_tweet_dict, sentiment_dict): ## Defining the dictionaries to be used emoticon_bag = defaultdict(dict) hash_tag_bag = defaultdict(dict) user_ref_bag = defaultdict(dict) tweet_dict = defaultdict(dict) tweet_cmplt = defaultdict(dict) sentiment_score_dict = defaultdict(dict) tweetSplitter = TweetTokenizer() sid = SentimentIntensityAnalyzer() ## Downloading Stop words from NLTK nltkStopWords = list(stopwords.words('english')) stopWords = list(get_stop_words('en')) stopWords.extend(nltkStopWords) wordDict = defaultdict() s_token = time.clock() for tweet_id in raw_tweet_dict: tweet = raw_tweet_dict[tweet_id] sentence = tweet.tweet_text sentiment_score = sid.polarity_scores(sentence) wordBag = tweetSplitter.tokenize(sentence.replace('RT ', '')) newWordBag = [] emoticonList = [] hashTagList = [] userRefList = [] for word_case in wordBag: word = word_case.lower() if word not in stopWords: if (word in sentiment_dict): emoticonList.append(word) else: if (word.startswith('@')): userRefList.append(word.replace('@', '')) else: if (word.startswith('#')): print(word) hashTagList.append(word.replace('#', '')) else: if word.isalpha(): if word not in wordDict: wordDict[word] = 1 newWordBag.append(word) if len(newWordBag) > 3: tweet_dict[tweet_id] = newWordBag hash_tag_bag[tweet_id] = hashTagList emoticon_bag[tweet_id] = emoticonList user_ref_bag[tweet_id] = userRefList sentiment_score_dict[tweet_id] = sentiment_score tweet_cmplt[tweet_id] = sentence.replace('\n', ' ') final_dict = defaultdict(dict) final_dict['tweet_dict'] = tweet_dict final_dict['hash_tag_bag'] = hash_tag_bag final_dict['emoticon_bag'] = emoticon_bag final_dict['user_ref_bag'] = user_ref_bag final_dict['sentiment_score_dict'] = sentiment_score_dict final_dict['tweet_cmplt'] = tweet_cmplt final_dict['wordDict'] = wordDict e_token = time.clock() print('processTweetText() Time : ', e_token - s_token) return final_dict
#%%
import spacy
from nltk.tokenize import TweetTokenizer

nlp = spacy.load('en')
text = "Mary, don't slap the green witch"
print([str(token) for token in nlp(text.lower())])

tweet = u"Snow White and the Seven Degrees#MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
print(tokenizer.tokenize(tweet.lower()))
#%%
class AutoToken(object): def __init__(self, trackTokens=False): from nltk.tokenize import TweetTokenizer self.tweetTokenizer = TweetTokenizer(reduce_len=True) self._reMention = re.compile('@\w+') self._numbers = frozenset('1,2,3,4,5,6,7,8,9,0'.split(',')) self.stopWords = self._calculateStopwords() self.tokenTracker = defaultdict(set) if trackTokens else None self.tokenTransformers = OrderedDict([ ('<URL>', self._isUrl), ('<BTC>', self._isBitcoin), ('<ALTCOIN>', self._isAltCoint), ('<INT_NUMBER>', self._isIntNumber), ('<FLOAT_NUMBER>', self._isFloatNumber), ('<MENTION>', self._isMention), ]) def _calculateStopwords(self): engStopWords = set( stopwords.words('english') ) engStopWords.update( ', . : ( ) " | [ ] \' *'.split(' ') ) return engStopWords def __call__(self, tweet): for token in self.tweetTokenizer.tokenize(tweet): if token in self.stopWords: continue normlizedToken = self._normalizeToken( token ) yield normlizedToken def _normalizeToken(self, token): for tokenType, normalizer in self.tokenTransformers.items(): if normalizer( token ): if self.tokenTracker is not None: self.tokenTracker[tokenType].add( token ) return tokenType return token def _isUrl(self, token): return token.startswith('http') or token.startswith('www') or '.com' in token def _isBitcoin(self, token): return token.replace('#', '') in frozenset('btc,bitcoin'.split(',')) def _isMention(self, token): return self._reMention.match(token) def _isAltCoint(self, token): return token.replace('#', '') in frozenset('eth,ltc,ethereum,litecoin,altcoin'.split(',')) def _preProcessNumber(self, token): token = token.replace(',', '').replace('#', '') if token and token[-1] in {'-', '+'} and token[0] in self._numbers: token = token[-1] + token[:-1] return token def _isFloatNumber(self, token): token = self._preProcessNumber(token) try: _ = float(token) return True except ValueError: pass return False def _isIntNumber(self, token): token = self._preProcessNumber(token) try: _ = int(token) return True except ValueError: pass return False
line = tweet['full_text']
# Strip leading and trailing characters
line = line.strip()
# Remove links
line = remove_links(line)
# Remove newlines
line = line.replace('\n', ' ')
# Remove non-ASCII characters
line = NormalizeText.remove_nonascii(line)

# Tokenize text into sentences
tweet_sent = sent_tokenize(line)
# Tokenize text into words
tweet_word = tTokenizer.tokenize(line)
tweet_unique = list(set(tweet_word))  # Eliminate duplicated words

# Analyse sentiment
ss1 = TextBlob(line)
ss2 = sid.polarity_scores(line)

# Update the cumulative sentiment counters
tss1 += ss1.sentiment.polarity
tss2 += ss2['compound']

# Add the results to the JSON record
tweet.update({
    'sentiment': {
        'textblob': ss1.sentiment.polarity,
        'nltk': ss2['compound']
# In[1]:

import sys
import pandas as pd
import nltk
from nltk.tokenize import TweetTokenizer

# In[14]:

# que 1
datafile = pd.read_csv("tweets-dataset.csv")
total_token = list()
total_type = set()
tk = TweetTokenizer()
for sentence in datafile['Sentence']:
    word_list = tk.tokenize(sentence)
    for ele in range(len(word_list)):
        word = word_list[ele]
        total_token.append(word)
        total_type.add(word)

# In[15]:

print("token :", len(total_token))
print("type :", len(total_type))
ttr = len(total_type) / len(total_token)
print("ttr : ", ttr, sep="")

# In[23]:

# que 3
"weren't", 'what', "what's", 'when', "when's", 'where', "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't", 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves' ] terms_all = [] users = [] for user in list(set(tweets_trimmed['query'])): user_sub = tweets_trimmed[tweets_trimmed['query'] == user] tweets = list(set(user_sub['content'])) # unigrams only user_terms = [] for tweet in tweets: terms = tknzr.tokenize(tweet.lower()) for term in terms: if (not any(rr.search(term) for rr in remove_regex)) and ( term not in punctuation) and not (term.startswith('www')): if term not in en_stop: stemmed_term = p_stemmer.stem(term) user_terms.append(stemmed_term.encode('ascii', 'ignore')) terms_all.append(user_terms) users.append(user) dictionary = corpora.Dictionary(terms_all) corpus = [dictionary.doc2bow(term) for term in terms_all] ldamodel = models.ldamodel.LdaModel(corpus, num_topics=200,
def test_remove_handle(self): """ Test remove_handle() from casual.py with specially crafted edge cases """ tokenizer = TweetTokenizer(strip_handles=True) # Simple example. Handles with just numbers should be allowed test1 = "@twitter hello @twi_tter_. hi @12345 @123news" expected = ['hello', '.', 'hi'] result = tokenizer.tokenize(test1) assert result == expected # Handles are allowed to follow any of the following characters test2 = "@n`@n~@n(@n)@n-@n=@n+@n\\@n|@n[@n]@n{@n}@n;@n:@n'@n\"@n/@n?@n.@n,@n<@n>@n @n\n@n ñ@n.ü@n.ç@n." expected = [ '`', '~', '(', ')', '-', '=', '+', '\\', '|', '[', ']', '{', '}', ';', ':', "'", '"', '/', '?', '.', ',', '<', '>', 'ñ', '.', 'ü', '.', 'ç', '.', ] result = tokenizer.tokenize(test2) assert result == expected # Handles are NOT allowed to follow any of the following characters test3 = "a@n j@n z@n A@n L@n Z@n 1@n 4@n 7@n 9@n 0@n _@n !@n @@n #@n $@n %@n &@n *@n" expected = [ 'a', '@n', 'j', '@n', 'z', '@n', 'A', '@n', 'L', '@n', 'Z', '@n', '1', '@n', '4', '@n', '7', '@n', '9', '@n', '0', '@n', '_', '@n', '!', '@n', '@', '@n', '#', '@n', '$', '@n', '%', '@n', '&', '@n', '*', '@n', ] result = tokenizer.tokenize(test3) assert result == expected # Handles are allowed to precede the following characters test4 = "@n!a @n#a @n$a @n%a @n&a @n*a" expected = ['!', 'a', '#', 'a', '$', 'a', '%', 'a', '&', 'a', '*', 'a'] result = tokenizer.tokenize(test4) assert result == expected # Tests interactions with special symbols and multiple @ test5 = "@n!@n @n#@n @n$@n @n%@n @n&@n @n*@n @n@n @@n @n@@n @n_@n @n7@n @nj@n" expected = [ '!', '@n', '#', '@n', '$', '@n', '%', '@n', '&', '@n', '*', '@n', '@n', '@n', '@', '@n', '@n', '@', '@n', '@n_', '@n', '@n7', '@n', '@nj', '@n', ] result = tokenizer.tokenize(test5) assert result == expected # Tests that handles can have a max length of 20 test6 = "@abcdefghijklmnopqrstuvwxyz @abcdefghijklmnopqrst1234 @abcdefghijklmnopqrst_ @abcdefghijklmnopqrstendofhandle" expected = ['uvwxyz', '1234', '_', 'endofhandle'] result = tokenizer.tokenize(test6) assert result == expected # Edge case where an @ comes directly after a long handle test7 = "@abcdefghijklmnopqrstu@abcde @abcdefghijklmnopqrst@abcde @abcdefghijklmnopqrst_@abcde @abcdefghijklmnopqrst5@abcde" expected = [ 'u', '@abcde', '@abcdefghijklmnopqrst', '@abcde', '_', '@abcde', '5', '@abcde', ] result = tokenizer.tokenize(test7) assert result == expected
def f_create_data(self, args): self.m_min_occ = args.min_occ self.m_max_line = 1e5 self.m_data_dir = args.data_dir self.m_data_name = args.data_name self.m_raw_data_file = args.data_file self.m_raw_data_path = os.path.join(self.m_data_dir, self.m_raw_data_file) self.m_vocab_file = self.m_data_name + ".vocab.json" ### to save new generated data self.m_data_file = "tokenized_" + self.m_data_name + ".pickle" data = pd.read_pickle(self.m_raw_data_path) train_df = data["train"] valid_df = data["valid"] tokenizer = TweetTokenizer(preserve_case=False) train_reviews = train_df.review train_item_ids = train_df.itemid train_user_ids = train_df.userid valid_reviews = valid_df.review valid_item_ids = valid_df.itemid valid_user_ids = valid_df.userid vocab_obj = _Vocab() self._create_vocab(vocab_obj, train_reviews) review_corpus = defaultdict(dict) item_corpus = defaultdict(dict) user_corpus = defaultdict(dict) user2uid = defaultdict() stop_word_ids = [ vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in stopwords.words() ] punc_ids = [ vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in string.punctuation ] print("loading train reviews") ss_time = datetime.datetime.now() non_informative_words = stop_word_ids + punc_ids print("non informative words num", len(non_informative_words)) for index, review in enumerate(train_reviews): if index > self.m_max_line: break item_id = train_item_ids.iloc[index] user_id = train_user_ids.iloc[index] words = tokenizer.tokenize(review) word_ids = [ vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words ] review_id = len(review_corpus['train']) review_obj = _Review() review_obj.f_set_review(review_id, word_ids, non_informative_words) review_corpus["train"][review_id] = review_obj if user_id not in user_corpus: user_obj = _User() user_obj.f_set_user_id(user_id) user_corpus[user_id] = user_obj user2uid[user_id] = len(user2uid) uid = user2uid[user_id] user_obj = user_corpus[user_id] user_obj.f_add_review_id(review_id) if item_id not in item_corpus: item_obj = _Item() item_corpus[item_id] = item_obj item_obj.f_set_item_id(item_id) review_obj.f_set_user_item(uid, item_id) item_obj = item_corpus[item_id] item_obj.f_add_review_id(review_obj, review_id) e_time = datetime.datetime.now() print("load training duration", e_time - ss_time) s_time = datetime.datetime.now() user_num = len(user_corpus) vocab_obj.f_set_user(user2uid) save_item_corpus = {} print("item num", len(item_corpus)) print("loading valid reviews") for index, review in enumerate(valid_reviews): if index > self.m_max_line: break item_id = valid_item_ids.iloc[index] user_id = valid_user_ids.iloc[index] words = tokenizer.tokenize(review) word_ids = [ vocab_obj.m_w2i.get(w, vocab_obj.m_w2i['<unk>']) for w in words ] review_id = len(review_corpus["valid"]) review_obj = _Review() review_obj.f_set_review(review_id, word_ids, non_informative_words) review_corpus["valid"][review_id] = review_obj uid = user2uid[user_id] review_obj.f_set_user_item(uid, item_id) item_obj = item_corpus[item_id] # print(len(item_corpus)) item_obj.f_get_RRe(review_obj) save_data = { "item": save_item_corpus, "review": review_corpus, "user": user_num } print("save data to ", self.m_data_file) data_pickle_file = os.path.join(self.m_data_dir, self.m_data_file) f = open(data_pickle_file, "wb") pickle.dump(save_data, f) f.close() vocab = dict(w2i=vocab_obj.m_w2i, i2w=vocab_obj.m_i2w, user2uid=vocab_obj.m_user2uid) with io.open(os.path.join(self.m_data_dir, self.m_vocab_file), 'wb') as vocab_file: data = json.dumps(vocab, 
ensure_ascii=False) vocab_file.write(data.encode('utf8', 'replace'))
def judgeWords(infile): # read word list readWordlist() # tweetlist = ['physics', 'chemistry', 1997, 2000] # If it becomes true then @ occur start = False with open(infile) as f1: # for all tweet twitterList = [] spamreader = csv.reader(infile, delimiter=' ') rows = f1.readlines() for row in rows: row = row.replace(".", '') # row = "feels so lovely to be back in the can and she loves you #usa brothermerl looooooooove good bad lol @user @ Navarro College" tknzr = TweetTokenizer() text = tknzr.tokenize(row) tempSen = nltk.pos_tag(text) splitrow = row.split() # print(splitrow) # tweetlist = splitrow tweetlist = [] length = len(splitrow) # print(length) # enumerate(row, start=0) for index, word in enumerate(splitrow, start=0): checkPos(word) checkNeg(word) # print(word) # length = len(splitrow) # print(index) if ((start == True) and (word != "@")): tempWord = tempWord + " " + word # print(tempWord) if ((start == True) and (index == length - 1)): judgelist = [tempWord, "@"] tweetlist.append(judgelist) # print("#######") # print("start t end f") elif ((word == "@") and (start == True)): # start = False # end = True #judgelist = [tempWord, "@"] flagAt = True #tweetlist.append(judgelist) tempWord = word # print(word) start = True elif ((word == "@") and (start == False)): global flagAt flagAt = True tempWord = word start = True # handle # tag # elif(word[0] == "#"): # word = word[1:] # word = word elif (word == "@user"): pass # do not add to list else: if jstop(word): a = "ST" elif nonEnW(word): a = "NE" else: a = tempSen[index][1] ### # AB NN VB JJ # change to four type ### if (a == "NNS"): # NNS and NN would be NN a = "NN" if (a == "VBZ"): # VB and VBZ would be VB a = "VB" if (a == "RB"): # RB would be AB a = "AB" if (a == "VBN"): a = "VB" if (a == "NNP"): a = "NN" if (a == "VBD"): a = "VB" if (a == "VBP"): a = "VB" if (a == "CD"): a = "NN" if (a == "JJR"): a = "JJ" if (a == "VBG"): a = "VB" if (a == "JJS"): a = "JJ" judgelist = [word, a] tweetlist.append(judgelist) # b = np.append(b, judgelist) # tweetlist = tweetlist.append(judgelist) # repeat feature if flagRepeted == True: judgelist = [1, "FT"] #tweetlist.append(judgelist) else: judgelist = [0, "FT"] #tweetlist.append(judgelist) # slang feature if flagSlang == True: judgelist = [1, "FT"] #tweetlist.append(judgelist) else: judgelist = [0, "FT"] #tweetlist.append(judgelist) # pos feature if flagPos == True: judgelist = [1, "FT"] #tweetlist.append(judgelist) else: judgelist = [0, "FT"] #tweetlist.append(judgelist) # neg feature if flagPos == True: judgelist = [1, "FT"] #tweetlist.append(judgelist) else: judgelist = [0, "FT"] #tweetlist.append(judgelist) # @ feature if flagAt == True: judgelist = [1, "FT"] #tweetlist.append(judgelist) else: judgelist = [0, "FT"] #tweetlist.append(judgelist) # print(tweetlist) start = False iniFlag() twitterList.append(tweetlist) return twitterList
fileList = (list(iter_rows(worksheet))) fileList1 = (list(iter_rows(worksheet1))) geoLocation = [] tweet = [] for col in fileList: geoLocation.append(col[5]) # 1 is column index tweet.append(col[6]) # 2 is column index for col in fileList1: sentimental_dictionary[col[0]] = col[1] counter = 2 tweetgrade = {} for i in tweet: currentTweet = tknzr.tokenize(i.casefold()) currentTweet = [ word for word in currentTweet if word not in cachedStopWords ] currentTweet = [word for word in currentTweet if word in english_vocab] currentTweet = [s.strip('.') for s in currentTweet] currentTweet = [s.replace('.', '') for s in currentTweet] currentTweet = [s.strip('#') for s in currentTweet] currentTweet = [s.replace('#', '') for s in currentTweet] currentTweet = [s.strip(':') for s in currentTweet] currentTweet = [s.replace(':', '') for s in currentTweet] currentTweet = [s.strip('!') for s in currentTweet] currentTweet = [s.replace('!', '') for s in currentTweet] currentTweet = [s.strip('?') for s in currentTweet] currentTweet = [s.replace('?', '') for s in currentTweet] currentTweet = [s.strip('\\') for s in currentTweet]
def tokenize(tweets):
    tknzr = TweetTokenizer()
    tokenized_tweets = []
    for tweet in tweets:
        tokenized_tweets.append(tknzr.tokenize(tweet))
    return tokenized_tweets
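# A minimal usage sketch (not part of the original code): the sample tweets are made up,
# and this assumes `from nltk.tokenize import TweetTokenizer` has already been done
# for the tokenize() helper above.
if __name__ == "__main__":
    sample = ["@nltk_org TweetTokenizer is soooo handy!!! :-)",
              "Check it out: http://www.nltk.org #nlp"]
    for toks in tokenize(sample):
        print(toks)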
# array that contains all words all_words = [] # list of stopwords to filter out STOPWORDS = set(stopwords.words('english')) - set(BLACKLIST_STOPWORDS) longest_tweet = 0 spoiler_counter = 0 nonspoiler_counter = 0 tweet_lengths = [] with open(FILE_NAME, 'r', newline='') as file: reader = csv.reader(file) tweet_tok = TweetTokenizer() for row in reader: tweet = [] words = tweet_tok.tokenize(row[0]) for word in words: if word.lower() not in STOPWORDS: if not re.match(PUNCTUATION, word.lower()): if word.lower() in CONTRACTIONS: # print("contraction changing:", word, 'to', CONTRACTIONS[word.lower()]) for expanded_word in word_tokenize(CONTRACTIONS[word.lower()]): # add lowercase version of word to all_words array if expanded_word not in STOPWORDS: print("Success!") tweet.append(expanded_word) else: print('removing:', expanded_word) else: tweet.append(word.lower()) if len(tweet) > longest_tweet:
emoji_pattern = re.compile( "[" u"\U0001F600-\U0001F64F" # emoticons u"\U0001F300-\U0001F5FF" # symbols & pictographs u"\U0001F680-\U0001F6FF" # transport & map symbols u"\U0001F1E0-\U0001F1FF" u"\U00002600-\U000027BF" u"\U0001f300-\U0001f64F" u"\U0001f680-\U0001f6FF" u"\u2600-\u27BF" "]+", flags=re.UNICODE) wordnet_lemmatizer = WordNetLemmatizer() for t in tweets: stop = set(stopwords.words('english')) #stop words! framents = tknz.tokenize(t) clean_fragments = [] for f in framents: if f not in stop: # not included in the stop words f = emoji_pattern.sub(r'', f) f = f.lower() #lowercase fragment f = re.sub(r'[.,"!~_:|?\']+', '', f, flags=re.MULTILINE) # Special characters f = re.sub(r'\.\.\.', '', f, flags=re.MULTILINE) # 3 dots f = re.sub(url_expression, '', f, flags=re.MULTILINE) # links f = re.sub(r'@[a-z,A-Z,0-9 ]*', '', f, flags=re.MULTILINE) #clean at person references f = re.sub(r'RT @[a-z,A-Z]*: ', '', f, flags=re.MULTILINE) #Remove retweets f = wordnet_lemmatizer.lemmatize(f) if f:
from sklearn.feature_extraction.text import TfidfVectorizer

tweetFile = pd.read_csv("Tweets-Data.csv")
dataFrame = pd.DataFrame(tweetFile[['tweet_data']])
tweetData = tweetFile['tweet_data']

tknzr = TweetTokenizer()
stopWords = set(stopwords.words("english"))
# words = word_tokenize(data[0])  # For 1 line

cleanedData = []
cleaned = []
for line in tweetData:
    tweet = tknzr.tokenize(str(line))
    for word in tweet:
        if word not in string.punctuation:
            if '@' not in word:
                cleaned.append(word)
    cleanedData.append(cleaned)
    cleaned = []

sentencedData = []
for sentence in cleanedData:
    sentencedData.append(" ".join(sentence))

tweetFile.insert(4, "clean_data", "")
def preprocessing(dfs, candidate_list): ''' Tokenize tweets - transforms a sentence into separate words Loops through Dataframe Dictionary to: - Turns every word into lowercase - remove symbols such as .?:; - removes hashtags and mentions - creates an exclusive column for hashtags and mentions - removes stopwords (connectors, prepositions) ''' tt = TweetTokenizer() Stop_Words_Spacy = list(STOP_WORDS) Stop_Words_NLTK = list(stopwords.words('portuguese')) All_Stop_Words = list(set(Stop_Words_NLTK + Stop_Words_Spacy + [','])) print('Preprocessing the Data...') for key in dfs: # Tokenizing - takes a phrase and isolate each word dfs[key]['token_list'] = dfs[key].apply(lambda x: tt.tokenize(x.text), axis=1) # Dropping unnecessary labels dfs[key].drop(labels=['id', 'datetime', 'created_at'], axis=1, inplace=True) # Lowering words dfs[key]['token_list'] = [[word.lower() for word in lists] for lists in dfs[key].token_list] # Removing Stop Words - connectives, prepositions... dfs[key]['token_list'] = [[ word for word in lists if word not in All_Stop_Words ] for lists in dfs[key].token_list] # Separating Hashtags dfs[key]['Hashtag'] = [[ word[1:] for word in lists if re.match('#', word) is not None ] for lists in dfs[key].token_list] # Separating Twitter Mentions dfs[key]['Mentions'] = [[ word[1:] for word in lists if re.match('@', word) is not None ] for lists in dfs[key].token_list] # Removing Links, Hashtags and Mentions pattern_twitter = '((https)|@|#)' dfs[key]['token_list'] = [[ word for word in lists if re.match(pattern_twitter, word) is None ] for lists in dfs[key].token_list] # Removing all symbols pattern_words_numbers = '[àÀáÁéÉçôõãúÚíÍóÓ, 0-9a-zA-Z]+' dfs[key]['token_list'] = [[ word for word in lists if re.match(pattern_words_numbers, word) is not None ] for lists in dfs[key].token_list]
class Data:
    def __init__(self):
        self.matrix = []
        self.truth = []
        self.tokenizer = TweetTokenizer()

    def clear(self):
        self.matrix = []
        self.truth = []

    def _author_tokens(self, author):
        # Tokenize every tweet of an author and return one flat token list.
        tokens = []
        for tweet in author:
            tokens.extend(self.tokenizer.tokenize(tweet.text))
        return tokens

    def add_data_sample(self, sample, author):
        # Append the stylometric features to the word-vector sum,
        # then L2-normalize the combined feature vector.
        sample = numpy.append(sample, self.hapax_legomenom_author(author))
        sample = numpy.append(sample, self.average_word_per_tweet(author))
        sample = numpy.append(sample, self.number_of_words(author))
        sample = numpy.append(sample, self.longest_word(author))
        sample = numpy.append(sample, self.average_length_of_word(author))
        sample = numpy.append(sample, self.four_letter_words(author))
        sample = numpy.append(sample, self.five_letter_words(author))
        sample = numpy.append(sample, self.six_letter_words(author))
        sample = numpy.append(sample, self.seven_letter_words(author))
        sample = numpy.append(sample, self.misspelled_words(author))
        normalized_sample = preprocessing.normalize([sample])
        self.matrix.append(numpy.array(normalized_sample[0]))

    def add_gender(self, sample):
        self.truth.append(0 if sample == 'F' else 1)

    def add_age(self, sample):
        age_groups = {"18-24": 0, "25-34": 1, "35-49": 2}
        self.truth.append(age_groups.get(sample, 3))

    def add_personality(self, sample):
        self.truth.append(sample)

    def baseline(self, authors, truth, category):
        for author in authors:
            author_truth = truth[author.attrib['id']]
            # Sum the word2vec vectors of all in-vocabulary, non-stopword tokens.
            vector_sum = numpy.zeros(300)
            for word in self._author_tokens(author):
                if word in model.vocab and word not in stop_words:
                    vector_sum += model[word]
            self.add_data_sample(vector_sum, author)
            if category == "gender":
                self.add_gender(author_truth)
            elif category == "age":
                self.add_age(author_truth)
            else:
                self.add_personality(author_truth)

    # Stylometric features, computed per author
    def hapax_legomenom_author(self, author):
        # Vocabulary size: number of distinct tokens used by the author.
        return len(set(self._author_tokens(author)))

    def average_word_per_tweet(self, author):
        number_of_tweets = sum(1 for _ in author)
        return len(self._author_tokens(author)) / number_of_tweets

    def number_of_words(self, author):
        return len(self._author_tokens(author))

    def longest_word(self, author):
        return max((len(word) for word in self._author_tokens(author)), default=0)

    def average_length_of_word(self, author):
        words = self._author_tokens(author)
        return sum(len(word) for word in words) / len(words)

    def four_letter_words(self, author):
        # Share of tokens shorter than four letters.
        words = self._author_tokens(author)
        return sum(1 for word in words if len(word) < 4) / len(words)

    def five_letter_words(self, author):
        # Share of tokens longer than five letters.
        words = self._author_tokens(author)
        return sum(1 for word in words if len(word) > 5) / len(words)

    def six_letter_words(self, author):
        # Share of tokens longer than six letters.
        words = self._author_tokens(author)
        return sum(1 for word in words if len(word) > 6) / len(words)

    def seven_letter_words(self, author):
        # Share of tokens longer than seven letters.
        words = self._author_tokens(author)
        return sum(1 for word in words if len(word) > 7) / len(words)

    def misspelled_words(self, author):
        # Share of tokens that are not in the word2vec vocabulary.
        words = self._author_tokens(author)
        return sum(1 for word in words if word not in model.vocab) / len(words)
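# A minimal, hypothetical driver for the Data class above, with stand-ins for
# the module-level globals it expects (`model`, `stop_words`). The real code
# presumably loads a 300-dimensional word2vec model; the tiny fake below only
# mimics the interface used (`.vocab`, `model[word]`) for illustration, and it
# assumes the imports the class itself needs (numpy, sklearn's preprocessing,
# NLTK's TweetTokenizer) are present. The author XML and truth dict are invented.
import numpy
import xml.etree.ElementTree as ET


class _FakeWord2Vec:
    vocab = {'love', 'this'}

    def __getitem__(self, word):
        return numpy.ones(300)


model = _FakeWord2Vec()
stop_words = {'i', 'the'}

root = ET.fromstring('<author id="a1"><document>I love this :-)</document></author>')
truth = {'a1': 'F'}

data = Data()
data.baseline([root], truth, category="gender")
print(len(data.matrix), data.truth)   # -> 1 [0]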
# Tweet loading and cleaning
wrong = 0
with open('neg.txt', 'r', encoding='utf8') as f:
    negtweets = []
    for line in f.readlines():
        tweet = line.replace('\n', '')
        # Removal of URLs, hashtags and mentions, then collapse extra spaces and lowercase
        tweet_regex = regex_spaces.sub(
            ' ', regex_ht_mn.sub('', regex_url.sub('', tweet))).lower()
        # Removal of accents (the text is already lowercased above)
        tweet_raw = unidecode.unidecode(tweet_regex)
        # Tokenize, drop stopwords and punctuation-only tokens, stem, and collapse repeated characters
        tokens = [
            remove_repeated_chars(stemmer.stem(t))
            for t in tweet_tokenizer.tokenize(tweet_raw)
            if t not in stopwords and not regex_nonword.match(t)
        ]
        negtweets.append([tokens, 'neg'])

with open('pos.txt', 'r', encoding='utf8') as f:
    postweets = []
    for line in f.readlines():
        tweet = line.replace('\n', '')
        # Removal of URLs, hashtags and mentions, then collapse extra spaces and lowercase
        tweet_regex = regex_spaces.sub(
            ' ', regex_ht_mn.sub('', regex_url.sub('', tweet))).lower()
        # Removal of accents
        tweet_raw = unidecode.unidecode(tweet_regex)
        tokens = [
            remove_repeated_chars(stemmer.stem(t))
            for t in tweet_tokenizer.tokenize(tweet_raw)
            if t not in stopwords and not regex_nonword.match(t)
        ]
        postweets.append([tokens, 'pos'])
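# The cleaning block above relies on helpers defined elsewhere in the original
# script (regex_url, regex_ht_mn, regex_spaces, regex_nonword, tweet_tokenizer,
# stemmer, stopwords, remove_repeated_chars). The definitions below are one
# plausible reconstruction so the block can be read in isolation; the actual
# patterns, stemmer and language (Portuguese is assumed here because of the
# accent stripping) may differ in the source.
import re
import unidecode
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer

regex_url = re.compile(r'https?://\S+|www\.\S+')   # URLs
regex_ht_mn = re.compile(r'[@#]\w+')               # hashtags and mentions
regex_spaces = re.compile(r'\s+')                  # runs of whitespace
regex_nonword = re.compile(r'^\W+$')               # punctuation-only tokens

tweet_tokenizer = TweetTokenizer(reduce_len=True)
stemmer = SnowballStemmer('portuguese')
stopwords = set(nltk_stopwords.words('portuguese'))


def remove_repeated_chars(token):
    # Collapse runs of 3+ identical characters down to two ("loooove" -> "loove").
    return re.sub(r'(.)\1{2,}', r'\1\1', token)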
# Chapter 3: Preprocessing - tokenization with NLTK's built-in tokenizers
from nltk.tokenize import LineTokenizer, SpaceTokenizer, TweetTokenizer
from nltk import word_tokenize

# LineTokenizer: split on line breaks
lTokenizer = LineTokenizer()
print("Line tokenizer output:",
      lTokenizer.tokenize(
          "My name is Maximus Decimus Meridius, commander of the Armies of the North, "
          "General of the Felix Legions and loyal servant to the true emperor, "
          "Marcus Aurelius. \nFather to a murdered son, husband to a murdered "
          "wife. \nAnd I will have my vengeance, in this life or the next."))

# SpaceTokenizer: split on space characters
rawText = "By 11 o'clock on Sunday, the doctor shall open the dispensary."
sTokenizer = SpaceTokenizer()
print("Space tokenizer output:", sTokenizer.tokenize(rawText))

# word_tokenize: split into words and punctuation
print("Word tokenizer output:", word_tokenize(rawText))

# TweetTokenizer: handles special characters such as emoticons and hashtags
tTokenizer = TweetTokenizer()
print("Tweet tokenizer output:",
      tTokenizer.tokenize("This is a coooool #dummysmiley: :-) :-P <3"))
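# TweetTokenizer also takes a few constructor options that the demo above does
# not show. This short sketch uses only documented keyword arguments
# (preserve_case, reduce_len, strip_handles); the sample tweet is the one from
# the NLTK documentation.
from nltk.tokenize import TweetTokenizer

tweet = "@remy: This is waaaaayyyy too much for you!!!!!!"
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
print(tknzr.tokenize(tweet))
# Lowercased tokens, the @handle removed, and long character runs shortened:
# ['this', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']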
outfile.write(',')
outfile.write('\n')

with open('predict_handles.txt', 'r') as handle_file:
    # Read usernames from file
    handles = handle_file.read().split('\n')

# The last entry is empty because of the trailing newline, so skip it
for handle in handles[:-1]:
    # Call the Twitter API via a helper script
    os.system('python get_status.py ' + handle)

    # Tokenize the downloaded statuses
    text = []
    with open('statuses/statuses_' + handle + '_output.txt', 'r',
              encoding='utf8') as tweet_text:
        text = tokenizer.tokenize(tweet_text.read().replace(
            '\nTWEETLINEBREAK\n', ' '))

    # Turn the tokens into features
    X = fill_features(text)

    # Score one model per letter of "ocean" (presumably the five OCEAN personality traits)
    scores = []
    print(X.shape)
    for i in range(len("ocean")):
        # Keep the second element of the probability array: the probability of class '1'
        scores.append(str(float(mnbs[i].predict_proba(X)[0][1])))

    # Turn the scores into marketing segments
    with open('output/prediction_service_pca_segments.csv', 'a') as outfile:
        outfile.write(handle + ',')
def apostrophe_tokenize(self, word):
    """
    Tokenize text containing apostrophes; TweetTokenizer keeps
    contractions such as "don't" as single tokens.
    """
    aposToken = TweetTokenizer()
    return aposToken.tokenize(word)
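# A short comparison motivating apostrophe_tokenize(): NLTK's word_tokenize
# splits contractions into two tokens, while TweetTokenizer leaves them intact.
# The sentence is invented for illustration.
from nltk import word_tokenize
from nltk.tokenize import TweetTokenizer

s = "Don't you think it's great?"
print(word_tokenize(s))               # ['Do', "n't", 'you', 'think', 'it', "'s", 'great', '?']
print(TweetTokenizer().tokenize(s))   # ["Don't", 'you', 'think', "it's", 'great', '?']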
positive_tweets = positive_tweets_refined
negative_tweets = negative_tweets_refined

# Pair every tweet with its sentiment label and shuffle the combined set
for tweet in positive_tweets:
    sentiment_tweets.append((tweet, 'positive'))
for tweet in negative_tweets:
    sentiment_tweets.append((tweet, 'negative'))
random.shuffle(sentiment_tweets)

# Build a frequency distribution over all lowercased, non-stopword, non-URL tokens
tweets = positive_tweets + negative_tweets
for tweet in tweets:
    for word in tknz.tokenize(tweet):
        if word.lower() not in stop_words and not word.startswith('https'):
            all_words.append(word.lower())

all_words = nltk.FreqDist(all_words)

# Keep the 2000 most common words as the feature vocabulary
all_words = all_words.most_common(2000)
word_features = [x[0] for x in all_words]
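# A common next step after building word_features is to map each tweet to a
# boolean feature dict for an NLTK classifier. The helper below is a sketch of
# that pattern, not part of the original script; `tknz`, `word_features` and
# `sentiment_tweets` are the objects built above.
def find_features(tweet):
    words = {w.lower() for w in tknz.tokenize(tweet)}
    return {word: (word in words) for word in word_features}


feature_sets = [(find_features(tweet), label) for tweet, label in sentiment_tweets]
# feature_sets can then be split into train/test and passed to
# nltk.NaiveBayesClassifier.train(), for example.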
def tokenize(text): tknzr = TweetTokenizer() return tknzr.tokenize(text)
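# One hypothetical way to use a wrapper like tokenize(): hand it to a
# scikit-learn vectorizer so TF-IDF features are built from TweetTokenizer
# tokens instead of the default word pattern. The two-tweet corpus is invented.
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["Loving this! :-) #happy", "ugh... worst day ever :("]
vectorizer = TfidfVectorizer(tokenizer=tokenize, lowercase=False)
X = vectorizer.fit_transform(corpus)
print(X.shape)                          # (2, number_of_distinct_tokens)
print(sorted(vectorizer.vocabulary_))   # emoticons and the hashtag survive as tokens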
# nltk demo
# Reduce input text to content words by filtering out stop words.
#
# Mustafa Hussain
# also Daren Thomas and Stephen Falk:
# https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
#
# Copyright: CC BY-NC 4.0 (https://creativecommons.org/licenses/by-nc/4.0/)
#
# The stopword list must be downloaded first. Run in a Python shell:
#   import nltk
#   nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

# example: "hulk would like to smash the house please"
# becomes "hulk would like smash house please"
#
# example: "to be or not to be that is the question" becomes "question"

tkzr = TweetTokenizer()

while True:
    in1 = input("hulk) ")
    word_list = tkzr.tokenize(in1)
    filtered_words = [
        word for word in word_list
        if word not in stopwords.words('english')
    ]
    print(" ".join(filtered_words).upper())
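# A small variant of the loop above: stopwords.words('english') returns a list
# and is re-evaluated inside the comprehension, so each membership test is a
# linear scan. Caching the list once as a set is a common speed-up for longer
# inputs; this sketch is an alternative, not the original author's code.
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer

STOP_WORDS_EN = set(stopwords.words("english"))
tkzr = TweetTokenizer()


def remove_stop_words(text):
    return " ".join(w for w in tkzr.tokenize(text) if w not in STOP_WORDS_EN)


print(remove_stop_words("hulk would like to smash the house please").upper())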
def __init__(self, DIR, format, content, column, source='unspecified'):
    self.DIR = DIR
    if format == 'URL':
        html = urlopen(content).read()
        soup = BeautifulSoup(html, 'html.parser')
        # Remove script, style, heading, link, span, label and button elements
        for script in soup(["script", "style", "h1", "h2", "h3",
                            "h4", "h5", "a", "span", "label", "button"]):
            script.extract()   # rip it out
        # Get the visible text
        text = soup.get_text()
        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        # Break multi-headlines into a line each
        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
        # Drop blank lines
        self.text = '\n'.join(chunk for chunk in chunks if chunk)
        sentences = sent_tokenize(self.text)
    elif format == 'file':
        Array = []
        with open(content, 'r', encoding="utf-8") as f:
            reader = csv.reader(f)
            try:
                for row in reader:
                    Array.append(row)
            except Exception as e:
                print(e)
        df = pd.DataFrame(Array[1:], columns=Array[0])
        # df = pd.read_csv(content, encoding="utf-8")
        sentences = df[column].dropna().astype('str').tolist()
        self.text = '\n'.join(df[column].astype('str').tolist())

    # Save phrases: rough sentence splitting on punctuation and quote characters
    phrases = []
    regex = re.compile('[%s]' % re.escape(string.punctuation + '|"\',\t\n’”“'))
    for item in sentences:
        for i in regex.split(item):
            # Keep non-empty, non-numeric fragments longer than 20 characters
            if i != '' and not i.isdigit() and len(i) > 20:
                phrases.append(i.lower())
    phrases.insert(0, 'Phrase')   # CSV header
    fname_phrases = self.DIR + '/sentence.csv'
    with open(fname_phrases, "w", newline='') as f:
        for item in phrases:
            try:
                f.write("{}\n".format(item))
            except UnicodeEncodeError:
                pass
    print(fname_phrases)

    # Tokenize into words
    if format == 'URL':
        self.tokens = [wordpunct_tokenize(t) for t in sentences]
    elif format == 'file':
        if source == 'twitter':
            tknz = TweetTokenizer()
        elif source == 'reddit':
            tknz = tokenizer.RedditTokenizer()
        else:
            tknz = TweetTokenizer()   # fall back for unspecified sources
        self.tokens = [tknz.tokenize(t) for t in sentences]

    # NLTK's stopword list alone is too weak, so two extra lists are added
    nltk_stopwords = set(stopwords.words('english'))
    # Third-party stopwords: https://sites.google.com/site/kevinbouge/stopwords-lists
    with open(os.path.dirname(__file__) + '/stopwords_en.txt', 'r') as f:
        stopwords2 = f.read().split('\n')
    # Twitter-specific stopwords: https://sites.google.com/site/iamgongwei/home/sw
    with open(os.path.dirname(__file__) + '/twitter-customized.txt', 'r') as f:
        stopwords3 = f.read().split(',')

    self.filtered_tokens_lower = []
    self.filtered_tokens = []
    for token in self.tokens:
        self.filtered_tokens.append([
            word for word in token
            if (word.lower() not in nltk_stopwords)   # NLTK stopwords
            and (word.lower() not in stopwords2)      # third-party stopwords
            and (not word.isdigit())                  # no pure numbers
            and (word.isalnum())                      # alphanumeric tokens only
            and (word.lower() not in stopwords3)      # Twitter-specific stopwords
        ])
        self.filtered_tokens_lower.append([
            word.lower() for word in token
            if (word.lower() not in nltk_stopwords)
            and (word.lower() not in stopwords2)
            and (not word.isdigit())
            and (word.isalnum())
            and (word.lower() not in stopwords3)
        ])

    fname_filtered = self.DIR + '/tokenized.csv'
    with open(fname_filtered, "w", newline='') as f:
        writer = csv.writer(f)
        try:
            writer.writerows(self.filtered_tokens_lower)
        except UnicodeEncodeError:
            pass
    print(fname_filtered)