Exemple #1
0
def preprocess_data():
    """Build and pickle the unbalanced and balanced English training sets.

    Operates on the module-level ``train_df``, which is modified in place:
    'humor' labels are rewritten to 'fake' and a 'language' column is added.
    Rows detected as English are preprocessed and written to
    ``DATA['unbalanced_train']``; a class-balanced subset is then written to
    ``DATA['balanced_train']``.

    :return: None
    """
    # Set humor labels to fake
    train_df.loc[train_df['label'] == 'humor', 'label'] = 'fake'

    # Keep only the tweets detected as English
    train_df['language'] = [
        guess_language(text) for text in tqdm(train_df['tweetText'])
    ]
    train_en_df = train_df[train_df['language'] == 'en'].copy()

    # Preprocess tweets
    train_en_df['preprocessed_tweets'] = [
        preprocess_tweet(train_en_df, tweet)
        for tweet in tqdm(train_en_df['tweetText'])
    ]

    # Save the unbalanced dataset
    train_en_df.to_pickle(DATA['unbalanced_train'])

    # Shuffle, then split by class (sample() already returns a new frame)
    shuffled_df = train_en_df.sample(frac=1, random_state=42)
    real_en_df = shuffled_df.loc[shuffled_df['label'] == 'real']
    false_en_df = shuffled_df.loc[shuffled_df['label'] == 'fake']

    # Balance on the smaller class: sampling more rows than exist (without
    # replacement) raises ValueError, which happened whenever there were
    # fewer fake than real tweets.
    n_per_class = min(len(real_en_df), len(false_en_df))
    false_en_df = false_en_df.sample(n=n_per_class, random_state=42)
    if len(real_en_df) > n_per_class:
        real_en_df = real_en_df.sample(n=n_per_class, random_state=42)

    normalized_en_df = pd.concat([real_en_df, false_en_df])

    # Save the balanced dataset
    normalized_en_df.to_pickle(DATA['balanced_train'])
def check_existence_of_words(tweet, wordlist):
    """
    Binary feature telling whether the tweet contains any listed word
    (used for the slang / curse words and acronyms features)
    :param tweet: tweet text; it is passed through preprocess_tweet first
    :param wordlist: iterable of words (substrings) to look for
    :return: [1] if at least one word occurs in the preprocessed tweet,
             otherwise [0]
    """
    cleaned = preprocess_tweet(tweet)
    # Substring search; any() stops at the first hit
    hit = any(cleaned.find(candidate) != -1 for candidate in wordlist)
    return [1 if hit else 0]
    def predict(self, tweet, seq_length):
        """Run the model on a single tweet and return softmax scores.

        The tweet is cleaned with ``preprocess_tweet`` (punctuation=True)
        and ``preprocess_text``, tokenized with ``self.vocab_to_int`` and
        padded to ``seq_length``. The model is moved to CUDA before the
        forward pass, so a CUDA device is required.

        :param tweet: raw tweet text
        :param seq_length: length passed to ``pad_features``
        :return: NumPy array holding the softmax (over dim 1) of the
                 model output
        """
        cleaned = preprocess_text(preprocess_tweet(tweet, punctuation=True))
        padded = pad_features(
            [tokenize_custom(cleaned, self.vocab_to_int)],
            seq_length=seq_length)

        self.cuda()
        with torch.no_grad():
            # Hidden state for a batch of one
            hidden = self.init_hidden(1)
            batch = torch.from_numpy(padded).type(torch.cuda.LongTensor)
            logits, hidden = self(batch, hidden)

        to_probabilities = nn.Softmax(dim=1)
        return to_probabilities(logits).cpu().numpy()
def regex_vector(tweet):
    """
    Return the binary regex vector of the tweet
    :param tweet: raw tweet; it is passed through preprocess_tweet first
    :return: list with one entry per pattern, 1 if the pattern matches
             somewhere in the preprocessed tweet and 0 otherwise
    """
    tweet = preprocess_tweet(tweet)
    patterns = [
        r"is (this|that|it) true", r"wh[a]*t[?!][?1]*",
        r"(real?|really?|unconfirmed)", r"(rumour|debunk)",
        r"(that|this|it) is not true"
    ]
    # On Python 3, map() returns an iterator with no len() or indexing, so
    # the previous index loop raised TypeError. re.search is truthy exactly
    # when findall would return a non-empty list, and re caches the
    # compiled patterns internally.
    return [1 if re.search(pattern, tweet) else 0 for pattern in patterns]
Exemple #5
0
def preprocess_data():
    """Build and pickle the English-only, preprocessed test set.

    Operates on the module-level ``test_df``, which is modified in place:
    'humor' labels are rewritten to 'fake' and a 'language' column is added.
    Rows detected as English are preprocessed and written to
    ``DATA['test']``.

    :return: None
    """
    # Humor counts as fake
    is_humor = test_df['label'] == 'humor'
    test_df.loc[is_humor, 'label'] = 'fake'

    # Detect the language of every tweet and keep the English ones
    test_df['language'] = [
        guess_language(text) for text in tqdm(test_df['tweetText'])
    ]
    test_en_df = test_df[test_df['language'] == 'en'].copy()

    # Preprocess the English tweets
    test_en_df['preprocessed_tweets'] = [
        preprocess_tweet(test_en_df, text)
        for text in tqdm(test_en_df['tweetText'])
    ]

    # Save the dataset
    test_en_df.to_pickle(DATA['test'])
def get_ngram_postag_vector(tweet, n):
    """
    Return the ngram POStagging vector of the tweet
    :param tweet: A nonpreprocessed tweet
    :param n: the number of gram in range [1,4]
    :return: binary list as long as the tagset selected for n; an entry is
             1 when the corresponding tag ngram occurs in the tweet
             (Universal tagging)
    :raises ValueError: if n is outside [1,4], or if a tag ngram built from
             the tweet is not present in the selected tagset
    """
    # Pick the tagset; reject an unsupported n up front instead of failing
    # later with an unbound local variable
    if n == 1:
        ngram_tag = monogram_tagset
    elif n == 2:
        ngram_tag = bigram_tagset
    elif n == 3:
        ngram_tag = trigram_tagset
    elif n == 4:
        ngram_tag = fourgram_tagset
    else:
        raise ValueError("n must be in the range [1, 4], got %r" % (n,))

    # Preprocess the tweet before tagging
    tweet = preprocess_tweet(tweet)

    # Tokenize and tag with the universal tagset
    tokens = nltk.word_tokenize(tweet)
    tagged_tokens = nltk.pos_tag(tokens, tagset="universal")

    # One slot per entry of the selected tagset
    pos_vector = [0] * len(ngram_tag)

    # Slide a window of n tags over the tweet; tagset entries are looked up
    # by a key of the form "('TAG1', 'TAG2')"
    for start in range(len(tagged_tokens) - n + 1):
        quoted_tags = [
            "'" + tag + "'" for _, tag in tagged_tokens[start:start + n]
        ]
        ngram_key = "(" + ", ".join(quoted_tags) + ")"
        pos_vector[ngram_tag.index(ngram_key)] = 1

    return pos_vector
def sentence_prediction(sentence):
    """Predict the sentiment of a sentence and select its supporting text.

    Two stages: ``Polarity_Model`` classifies the sentence (class 1 is
    reported as 'positive', anything else as 'negative'); the sentence is
    then re-encoded with a sentiment token in front and fed to
    ``bert_model``, whose start/end scores are handed to
    ``utils.calculate_jaccard`` to obtain the selected text. The predicted
    sentiment and the arg-max indices are printed along the way.

    :param sentence: input text (converted with ``str``)
    :return: tuple ``(sentiment, final_text)``
    """
    polarity_tokenizer = config.Plrty_Tokenizer
    span_tokenizer = config.TOKENIZER
    max_len = config.MAX_LEN

    tweet = str(sentence)

    # --- Stage 1: polarity classification ---
    encoded = utils.preprocess_tweet(tweet, polarity_tokenizer, max_len,
                                     device)
    logits = Polarity_Model(encoded['ids'].view(1, -1),
                            encoded['mask'].view(1, -1),
                            encoded['type_ids'].view(1, -1))
    pred = torch.max(torch.softmax(logits, dim=1), dim=1)[1]
    sentiment = 'positive' if pred == 1 else 'negative'

    print('Predicted Sentiment: ', sentiment)

    # --- Stage 2: build the span-model input ---
    # NOTE(review): 101/102 and 3893/4997 look like BERT vocabulary ids
    # ([CLS]/[SEP] and the sentiment words) - confirm against the tokenizer.
    sentiment_token = {'positive': 3893, 'negative': 4997}[sentiment]

    encoding = span_tokenizer.encode(tweet)
    # Drop the first and last entries produced by the tokenizer
    body_ids = encoding.ids[1:-1]
    body_offsets = encoding.offsets[1:-1]

    input_ids = [101, sentiment_token, 102] + body_ids + [102]
    segment_ids = [0, 0, 0] + [1] * len(body_ids) + [1]
    attention = [1] * len(segment_ids)
    span_offsets = [(0, 0)] * 3 + body_offsets + [(0, 0)]

    pad = max_len - len(input_ids)

    def fit(seq, filler):
        # Right-pad when the input is short, otherwise cut to max_len
        if pad > 0:
            return seq + [filler] * pad
        return seq[:max_len]

    def as_batch(seq):
        # Long tensor with a leading batch dimension, placed on `device`
        batch = torch.tensor(seq, dtype=torch.long).unsqueeze(0)
        return batch.to(device, dtype=torch.long)

    ids = as_batch(fit(input_ids, 0))
    mask = as_batch(fit(attention, 0))
    token_type_ids = as_batch(fit(segment_ids, 0))
    # Offsets are not moved to `device` and get no batch dimension
    offsets = torch.tensor(fit(span_offsets, (0, 0)), dtype=torch.long)

    # --- Stage 3: span prediction ---
    out_start, out_end = bert_model(ids, mask, token_type_ids)

    start_scores = torch.softmax(out_start, dim=1).cpu().detach().numpy()
    end_scores = torch.softmax(out_end, dim=1).cpu().detach().numpy()

    idx_start = np.argmax(start_scores)
    idx_end = np.argmax(end_scores)

    print(idx_start, idx_end)

    # No reference selection exists at inference time; a placeholder string
    # is passed in its place
    selected_text = "random"
    final_text, _ = utils.calculate_jaccard(tweet, offsets, selected_text,
                                            idx_start, idx_end)

    return sentiment, final_text