Example #1
def gen_sequence(filename):
    y_map = {
            'joy': 0,
            'anger': 1,
            'surprise': 2,
            'disgust':3,
            'fear':4,
            'sad':5
            }

    X, y = [], []
    if filename == 'tokenized_tweets_train.txt':
        for tweet in train_tweets:
            text = glove_tokenize(tweet['text'].lower())
            seq = []
            for word in text:
                seq.append(vocab.get(word, vocab['UNK']))
            X.append(seq)
            y.append(y_map[tweet['label']])
        return X, y
    else:
        for tweet in test_tweets:
            text = glove_tokenize(tweet['text'].lower())
            seq = []
            for word in text:
                seq.append(vocab.get(word, vocab['UNK']))
            X.append(seq)
            y.append(y_map[tweet['label']])
        return X, y
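The sequences returned by gen_sequence are variable-length lists of vocabulary indices, so they normally have to be padded to a common length before training. A minimal usage sketch, assuming Keras is available downstream; MAX_SEQUENCE_LENGTH is a hypothetical cap chosen by the caller and is not part of the original code:

from keras.preprocessing.sequence import pad_sequences
import numpy as np

X_train, y_train = gen_sequence('tokenized_tweets_train.txt')
MAX_SEQUENCE_LENGTH = 30  # hypothetical maximum tweet length in tokens
data = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)  # pad/truncate each index sequence
labels = np.array(y_train)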
Example #2
def gen_vocab():
    # Processing
    vocab_index = 1
    for tweet in train_tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        # words = [word for word in words if word not in STOPWORDS]

        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word       # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1

    for tweet in test_tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()
        # words = [word for word in words if word not in STOPWORDS]

        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word       # generate reverse vocab as well
                vocab_index += 1
            freq[word] += 1
    
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'
def gen_sequence():
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}

    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        seq = []
        for word in text:
            seq.append(vocab.get(word, vocab['UNK']))
        X.append(seq)
        y.append(y_map[tweet['label']])
    return X, y
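gen_vocab assigns word indices starting at 1, so index 0 stays free for padding. A hedged sketch of building an embedding matrix aligned with those indices (gen_embedding_matrix and embedding_dim are illustrative names, not part of the original code):

import numpy as np

def gen_embedding_matrix(vocab, word2vec_model, embedding_dim):
    # row 0 stays all-zero and can serve as the padding vector
    embedding_matrix = np.zeros((len(vocab) + 1, embedding_dim))
    for word, index in vocab.items():
        if word in word2vec_model:
            embedding_matrix[index] = word2vec_model[word]
    return embedding_matrix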
Example #4
def gen_data():
    # read_csv returns a DataFrame, so unpack its columns explicitly
    # (text in the first column, label in the second -- an assumption, the file layout is not shown)
    df = pd.read_csv('data/SD_dataset_FINAL.csv')
    X, y = df.iloc[:, 0], df.iloc[:, 1]
    X_e = []
    for s in X:
        words = glove_tokenize(s)
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except:
                pass
        emb /= len(words)
        X_e.append(emb)
    return X_e, y
Example #5
def select_tweets_whose_embedding_exists(tweets, word2vec_model):
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'])
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    print('Tweets selected:', len(tweet_return))
    return tweet_return
Example #6
def select_tweets(filename):
    # selects the tweets as in mean_glove_embedding method
    # Processing
    if filename == 'tokenized_tweets_train.txt':
        train_tweets = get_data('tokenized_tweets_train.txt')
    elif filename == 'tokenized_tweets_test.txt':
        test_tweets = get_data('tokenized_tweets_test.txt')
    tweet_return = []
    if filename == 'tokenized_tweets_train.txt':
        c = 1
        for tweet in train_tweets:
            _emb = 0
            words = glove_tokenize(tweet['text'].lower())
            for w in words:
                if w in word2vec_model:  # check if the embedding exists in the GloVe model
                    _emb+=1
            c = c+1
            # if _emb:   # Not a blank tweet
            tweet_return.append(tweet)
        print('Tweets selected:', len(tweet_return))
        #pdb.set_trace()
        return tweet_return
    else:
        c = 1
        for tweet in test_tweets:
            _emb = 0
            words = glove_tokenize(tweet['text'].lower())
            for w in words:
                if w in word2vec_model:  # check if the embedding exists in the GloVe model
                    _emb+=1
            c = c+1
            # if _emb:   # Not a blank tweet
            tweet_return.append(tweet)
        print('Tweets selected:', len(tweet_return))
        #pdb.set_trace()
        return tweet_return
def getAbusiveFeatures():
    f = open('abusive_dict.txt', 'r')
    m = {}
    for line in f:
        line = line.strip()
        m[line] = True
    tweets = get_data()
    X = []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        c = 0
        for word in text:
            if word in m:
                c = c + 1
        X.append(c)
    return np.array(X)
def gen_vocab():
    # Processing
    vocab_index = 1
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        words = text.split()

        for word in words:
            if word not in vocab:
                vocab[word] = vocab_index
                reverse_vocab[vocab_index] = word
                vocab_index += 1
            freq[word] += 1
    vocab['UNK'] = len(vocab) + 1
    reverse_vocab[len(vocab)] = 'UNK'  
Example #9
def gen_data():
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}

    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'].lower())
        emb = np.zeros(EMBEDDING_DIM)
        for word in words:
            try:
                emb += word2vec_model[word]
            except:
                pass
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    return X, y
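A usage sketch for the averaged-embedding features above; the scikit-learn classifier and the cross-validation setup are illustrative assumptions, not part of the original example:

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

X, y = gen_data()
X, y = np.array(X), np.array(y)
clf = LogisticRegression(max_iter=1000)
scores = cross_val_score(clf, X, y, cv=5, scoring='f1_weighted')
print('Mean weighted F1:', scores.mean())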
Example #10
def select_tweets(dataset, strategy):
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets, users_none = get_data_waseem4(dataset, strategy)
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'].lower())
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    #pdb.set_trace()
    return tweet_return, users_none
def select_tweets_whose_embedding_exists():
    # selects the tweets as in mean_glove_embedding method
    # Processing
    tweets = get_data()
    X, Y = [], []
    tweet_return = []
    for tweet in tweets:
        _emb = 0
        words = glove_tokenize(tweet['text'])
        for w in words:
            if w in word2vec_model:  # check if the embedding exists in the GloVe model
                _emb += 1
        if _emb:  # Not a blank tweet
            tweet_return.append(tweet)
    print('Tweets selected:', len(tweet_return))
    #pdb.set_trace()
    return tweet_return
def gen_data(tweets):
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}

    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'])
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except:
                pass
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    X = np.array(X)
    y = np.array(y)
    return X, y
def get_tfidf_features():
    tweets = get_data()
    y_map = {'none': 0, 'racism': 1, 'sexism': 2}
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        text = ' '.join([c for c in text if c not in punctuation])
        if y_map[tweet['label']] == 2:
            X.append(text)
            y.append(y_map[tweet['label']])
    tfidf_transformer = TfidfVectorizer(ngram_range=(1, 2),
                                        analyzer='word',
                                        stop_words='english',
                                        max_features=2000)
    X_tfidf = tfidf_transformer.fit_transform(X)
    print(X_tfidf.shape)

    get_top_features(tfidf_transformer)

    return X_tfidf, np.array(y)
def get_tfidf_features():
    tweets = get_data() # getting list of tweets (each tweet in a map format with keys text, label and user)
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }

    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower()) # tokenize (e.g. '#' becomes '<hashtag>')
        text = ' '.join([c for c in text if c not in punctuation]) # removing punctuation
        X.append(text)
        y.append(y_map[tweet['label']])
    tfidf_transformer = TfidfVectorizer(ngram_range=(1,2), analyzer='word',stop_words='english',max_features=5000)
    X_tfidf = tfidf_transformer.fit_transform(X)
    print(X_tfidf.shape)

    return X_tfidf, np.array(y)
def getAbusiveFeatures():
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }
    f = open('abusive_dict.txt','r')
    m = {}
    for line in f:
        line = line.strip()
        m[line]=True
    tweets = get_data()
    X, y = [], []
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower()) # does it correct spelling as well?
        c = 0
        for word in text:
            if word in m:
                c = c+1
        X.append([c])
        y.append(y_map[tweet['label']])
    return np.array(X),np.array(y)
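The abusive-word counts form a single dense column, so they can be stacked next to the sparse tf-idf matrix from the example above. A hedged sketch; the column stack is an illustration and assumes both helpers iterate the same tweet list in the same order:

from scipy.sparse import hstack, csr_matrix

X_tfidf, y = get_tfidf_features()
X_abusive, _ = getAbusiveFeatures()
X_combined = hstack([X_tfidf, csr_matrix(X_abusive)])  # shape: (n_tweets, n_tfidf_features + 1)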
Example #16
def gen_data():
    # In this function, for all accepted tweets, we turn them into an
    # embedding of EMBEDDING_DIM. We then sum the embeddings of all
    # words within the tweet that have an embedding and divide
    # by the number of words. Hence, the final embedding of the tweet
    # will be the average of the embeddings of its words.

    X_file = "BoWV_X.pickle"
    y_file = "BoWV_y.pickle"

    # Load if pickled files are available
    try:
        X = pickle.load(open(X_file, "rb"))
        y = pickle.load(open(y_file, "rb"))
        print "Features and labels loaded from pickled files."

    # Create and save otherwise
    except (OSError, IOError) as e:
        print "Creating features and labels..."

        y_map = {'none': 0, 'racism': 1, 'sexism': 2}

        X, y = [], []
        for tweet in tweets:
            words = glove_tokenize(tweet['text'].lower())
            emb = np.zeros(EMBEDDING_DIM)
            for word in words:
                try:
                    emb += word2vec_model[word]
                except:
                    pass
            emb /= len(words)
            X.append(emb)
            y.append(y_map[tweet['label']])

        pickle.dump(X, open(X_file, "wb"))
        pickle.dump(y, open(y_file, "wb"))

    return X, y
Example #17
def gen_data(tweets_list, word2vec_model, flag):
    if flag == 'binary':
        y_map = {'none': 0, 'racism': 1, 'sexism': 1, 'hate': 1}
    else:
        y_map = {'none': 0, 'racism': 1, 'sexism': 2, 'hate': 1}

    X, y = [], []
    word_embed_size = 200
    for tweet in tweets_list:
        words = glove_tokenize(tweet['text'])
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except:
                pass
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])
    X = np.array(X)
    y = np.array(y)
    return X, y
def gen_data():
    # Generate features and labels
    # Features will be given by the average
    # embedding for all words in the sentence.

    y_map = {'none': 0, 'racism': 1, 'sexism': 2}

    X, y = [], []
    for tweet in tweets:
        words = glove_tokenize(tweet['text'])  # .lower()
        emb = np.zeros(word_embed_size)
        for word in words:
            try:
                emb += word2vec_model[word]
            except:
                pass
        emb /= len(words)
        X.append(emb)
        y.append(y_map[tweet['label']])

    X = np.array(X)
    y = np.array(y)

    return X, y
def get_liwc_features_from_text():
    filenames = glob.glob("./LIWC_features/*.csv")
    print(filenames)
    y_map = {
            'none': 0,
            'racism': 1,
            'sexism': 2
            }
    tweets = get_data()
    X, y = [], []
    # create a dict of lists of words in all liwc files
    features_dict = {}
    for file in filenames:
        f = open(file,'r')
        m = {}
        for line in f:
            line = line.strip()
            m[line]=True
        features_dict[file] = m
    
    for tweet in tweets:
        text = glove_tokenize(tweet['text'].lower())
        features = []
        for file in filenames:
            c = 1
            for word in text:
                if any([word.startswith(s) for s in features_dict[file]]):
                    c = c+1
            features.append(c)
        X.append(features)
        y.append(y_map[tweet['label']])

    # normalised results
    X = np.array(X)
    X = (X - X.mean(axis=0)) / X.std(axis=0)
    return X, np.array(y)
import time
import gensim

from data_handler import get_data
from my_tokenizer import glove_tokenize

start = time.time()
# the GloVe file is assumed to be in plain-text word2vec format, so binary must be False
model = gensim.models.KeyedVectors.load_word2vec_format("glove_embeddings/glove.twitter.27B.200d.txt", binary=False)
print('Finished loading original model %.2f min' % ((time.time()-start)/60))
print('word2vec: %d' % len(model.vocab))

indices_to_delete = []
j = 0
st = set()
tweets = get_data()
for tweet in tweets:
    words = glove_tokenize(tweet['text'].lower())
    for w in words:
        st.add(w)  # collect every whole token that appears in the tweets

for i,w in enumerate(model.index2word):
    l = w.strip().lower()
    found = False
    if l in st:
        found = True
    if found:
        model.vocab[w].index = j
        j += 1
    else:
        del model.vocab[w]
        indices_to_delete.append(i)
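The loop above only re-indexes the surviving vocabulary entries and records indices_to_delete; a hedged continuation sketch of how the vector matrix would typically be pruned and saved (assumes gensim 3.x, where KeyedVectors exposes vectors and index2word; the output path is hypothetical):

import numpy as np

model.vectors = np.delete(model.vectors, indices_to_delete, axis=0)
model.index2word = [w for w in model.index2word if w in model.vocab]
print('Reduced vocabulary: %d words' % len(model.vocab))
model.save_word2vec_format('glove_embeddings/glove.twitter.27B.200d.reduced.txt', binary=False)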