def preprocess_data():
    # Set humor labels to fake
    train_df.loc[train_df['label'] == 'humor', 'label'] = 'fake'

    # Keep English tweets only
    train_lang = [guess_language(x) for x in tqdm(train_df['tweetText'])]
    train_df['language'] = train_lang
    english_mask = train_df['language'] == 'en'
    train_en_df = train_df[english_mask].copy()

    # Preprocess tweets
    prep_tweets = []
    for tweet in tqdm(train_en_df['tweetText']):
        prep_tweets.append(preprocess_tweet(train_en_df, tweet))
    train_en_df['preprocessed_tweets'] = prep_tweets

    # Save the unbalanced dataset
    train_en_df.to_pickle(DATA['unbalanced_train'])

    # Balance the data: shuffle, then downsample the 'fake' class
    # to the size of the 'real' class
    balanced_en_df = train_en_df.copy()
    balanced_en_df = balanced_en_df.sample(frac=1, random_state=42)
    real_en_df = balanced_en_df.loc[balanced_en_df['label'] == 'real']
    false_en_df = balanced_en_df.loc[balanced_en_df['label'] == 'fake'].sample(
        n=len(real_en_df), random_state=42)
    normalized_en_df = pd.concat([real_en_df, false_en_df])

    # Save the balanced dataset
    normalized_en_df.to_pickle(DATA['balanced_train'])
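# Illustrative only: a minimal sketch of the class-balancing step used above,
# shown on a hypothetical toy DataFrame (toy_df is not part of the original
# pipeline). The majority 'fake' class is downsampled to the size of 'real'.
import pandas as pd

toy_df = pd.DataFrame({'label': ['real'] * 3 + ['fake'] * 10})
real = toy_df[toy_df['label'] == 'real']
fake = toy_df[toy_df['label'] == 'fake'].sample(n=len(real), random_state=42)
balanced = pd.concat([real, fake])
print(balanced['label'].value_counts())  # real: 3, fake: 3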
def check_existence_of_words(tweet, wordlist):
    """
    Feature based on the presence of slang words, curse words or acronyms.
    :param tweet: semi-processed tweet (hashtags and mentions removed)
    :param wordlist: list of words to look for
    :return: one-element binary vector: [1] if any word of the list occurs in the tweet, else [0]
    """
    tweet = preprocess_tweet(tweet)
    found_word = 0
    for word in wordlist:
        if tweet.find(word) != -1:
            found_word = 1
            break
    return [found_word]
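# Illustrative only: example call to check_existence_of_words, assuming
# preprocess_tweet is in scope; the word list below is hypothetical.
slang_words = ['wtf', 'omg', 'lol']
print(check_existence_of_words("OMG is this real??", slang_words))
# -> [1] if any listed word survives preprocessing (e.g. lowercasing), else [0]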
def predict(self, tweet, seq_length):
    # Preprocess and tokenize the tweet, then pad to a fixed sequence length
    tweet = preprocess_tweet(tweet, punctuation=True)
    tweet = preprocess_text(tweet)
    tokens = [tokenize_custom(tweet, self.vocab_to_int)]
    features = pad_features(tokens, seq_length=seq_length)

    self.cuda()
    with torch.no_grad():
        h = self.init_hidden(1)
        output, h = self(
            torch.from_numpy(features).type(torch.cuda.LongTensor), h)
    softmax = nn.Softmax(dim=1)
    return softmax(output).cpu().numpy()
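# Illustrative only: calling predict on a trained model instance; 'model' and
# the sequence length of 30 are assumptions, not values from the original code.
# model = ...  # trained network exposing vocab_to_int and init_hidden
# probs = model.predict("Is this photo real or photoshopped?", seq_length=30)
# print(probs)  # per-class probabilities, shape (1, n_classes)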
def regex_vector(tweet):
    """
    Return the binary regex vector of the tweet.
    :param tweet: raw tweet
    :return: vector in which each bit represents whether the corresponding regex matches
    """
    tweet = preprocess_tweet(tweet)
    patterns = [
        "is (this|that|it) true",
        "wh[a]*t[?!][?1]*",
        "(real?|really?|unconfirmed)",
        "(rumour|debunk)",
        "(that|this|it) is not true"
    ]
    patterns_vector = [0] * len(patterns)
    # map() returns an iterator in Python 3, so materialise it before indexing
    pattern_compiled = list(map(re.compile, patterns))
    for i in range(len(pattern_compiled)):
        if pattern_compiled[i].findall(tweet):
            patterns_vector[i] = 1
    return patterns_vector
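# Illustrative only: example of regex_vector on a raw tweet; the output shown
# assumes preprocess_tweet lowercases the text so the patterns can match.
print(regex_vector("Is this true? Looks like a rumour to me..."))
# -> e.g. [1, 0, 0, 1, 0]: hits for "is (this|that|it) true" and "(rumour|debunk)"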
def preprocess_data():
    # Set humor labels to fake
    test_df.loc[test_df['label'] == 'humor', 'label'] = 'fake'

    # Keep English tweets only
    test_lang = [guess_language(x) for x in tqdm(test_df['tweetText'])]
    test_df['language'] = test_lang
    english_mask = test_df['language'] == 'en'
    test_en_df = test_df[english_mask].copy()

    # Preprocess tweets
    prep_tweets = []
    for tweet in tqdm(test_en_df['tweetText']):
        prep_tweets.append(preprocess_tweet(test_en_df, tweet))
    test_en_df['preprocessed_tweets'] = prep_tweets

    # Save the preprocessed test dataset
    test_en_df.to_pickle(DATA['test'])
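# Illustrative only: reloading the pickled splits produced above to check the
# label distributions, assuming the same DATA path mapping is in scope.
import pandas as pd

test_set = pd.read_pickle(DATA['test'])
balanced_train = pd.read_pickle(DATA['balanced_train'])
print(test_set['label'].value_counts())
print(balanced_train['label'].value_counts())  # 'real' and 'fake' counts should match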
def get_ngram_postag_vector(tweet, n):
    """
    Return the n-gram POS-tagging vector of the tweet.
    :param tweet: a non-preprocessed tweet
    :param n: the n-gram order, in range [1, 4]
    :return: binary vector over the universal-tagset n-gram combinations
    """
    # Select the tagset matching the requested n-gram order
    if n == 1:
        ngram_tag = monogram_tagset
    elif n == 2:
        ngram_tag = bigram_tagset
    elif n == 3:
        ngram_tag = trigram_tagset
    elif n == 4:
        ngram_tag = fourgram_tagset
    else:
        raise ValueError("n must be in range [1, 4]")

    # Preprocess tweet: remove emoticons, hashtags, mentions
    tweet = preprocess_tweet(tweet)

    # Tokenize and POS-tag using the universal tagset
    tokens = nltk.word_tokenize(tweet)
    tagged_tokens = nltk.pos_tag(tokens, tagset="universal")

    # Binary vector with one entry per tag n-gram
    pos_vector = [0] * len(ngram_tag)

    # Mark every tag n-gram that occurs in the tweet
    for i in range(0, len(tagged_tokens) - n + 1):
        str_list = []
        for j in range(0, n):
            str_list.append("'" + tagged_tokens[i + j][1] + "'")
        tag_key = "(" + ", ".join(str_list) + ")"
        pos_vector[ngram_tag.index(tag_key)] = 1
    return pos_vector
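# Illustrative only: get_ngram_postag_vector relies on NLTK's tokenizer and
# universal-tagset tagger; the downloads below are a one-time setup, and the
# tagset lists (monogram_tagset, ..., fourgram_tagset) are assumed to be
# defined at module level alongside preprocess_tweet.
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

# vec = get_ngram_postag_vector("Is this picture real?", n=2)
# print(sum(vec), "bigram tag patterns found")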
def sentence_prediction(sentence):
    tokenizer1 = config.Plrty_Tokenizer
    tokenizer2 = config.TOKENIZER
    max_len = config.MAX_LEN
    tweet = str(sentence)

    # Polarity prediction: positive / negative sentiment
    tweet_enc = utils.preprocess_tweet(tweet, tokenizer1, max_len, device)
    out = Polarity_Model(tweet_enc['ids'].view(1, -1),
                         tweet_enc['mask'].view(1, -1),
                         tweet_enc['type_ids'].view(1, -1))
    _, pred = torch.max(torch.softmax(out, dim=1), dim=1)
    sentiment = 'negative'
    if pred == 1:
        sentiment = 'positive'
    print('Predicted Sentiment: ', sentiment)

    # Build the extraction-model input: [CLS] sentiment_token [SEP] tweet [SEP]
    sentiment_id = {'positive': 3893, 'negative': 4997}
    tok_tweet = tokenizer2.encode(tweet)
    tweet_ids = tok_tweet.ids[1:-1]
    tok_tweet_offsets = tok_tweet.offsets[1:-1]
    tok_tweet_ids = [101] + [sentiment_id[sentiment]] + [102] + tweet_ids + [102]
    tok_type_ids = [0, 0, 0] + [1] * len(tweet_ids) + [1]
    mask = [1] * len(tok_type_ids)
    tok_tweet_offsets = [(0, 0)] * 3 + tok_tweet_offsets + [(0, 0)]

    # Pad or truncate to MAX_LEN
    padding_len = config.MAX_LEN - len(tok_tweet_ids)
    if padding_len > 0:
        ids = tok_tweet_ids + ([0] * padding_len)
        mask = mask + ([0] * padding_len)
        tok_type_ids = tok_type_ids + ([0] * padding_len)
        offsets = tok_tweet_offsets + ([(0, 0)] * padding_len)
    else:
        ids = tok_tweet_ids[:config.MAX_LEN]
        mask = mask[:config.MAX_LEN]
        tok_type_ids = tok_type_ids[:config.MAX_LEN]
        offsets = tok_tweet_offsets[:config.MAX_LEN]

    # Convert to tensors with a batch dimension and move to the device
    ids = torch.tensor(ids, dtype=torch.long).unsqueeze(0)
    mask = torch.tensor(mask, dtype=torch.long).unsqueeze(0)
    token_type_ids = torch.tensor(tok_type_ids, dtype=torch.long).unsqueeze(0)
    offsets = torch.tensor(offsets, dtype=torch.long)
    ids = ids.to(device, dtype=torch.long)
    token_type_ids = token_type_ids.to(device, dtype=torch.long)
    mask = mask.to(device, dtype=torch.long)

    # Predict start and end positions of the selected span
    out_start, out_end = bert_model(ids, mask, token_type_ids)
    out_start = torch.softmax(out_start, dim=1).cpu().detach().numpy()
    out_end = torch.softmax(out_end, dim=1).cpu().detach().numpy()
    idx_start = np.argmax(out_start)
    idx_end = np.argmax(out_end)

    selected_text = "random"
    print(idx_start, idx_end)
    final_text, _ = utils.calculate_jaccard(tweet, offsets, selected_text,
                                            idx_start, idx_end)
    return sentiment, final_text
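# Illustrative only: example call, assuming config, utils, Polarity_Model,
# bert_model and device have already been initialised as in the surrounding module.
# sentiment, span = sentence_prediction("The service at this place was terrible")
# print(sentiment, '->', span)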