Example #1
def main():
    locations = lrf_config.get_locations()
    INTERMED_DATA_PATH = locations['INTERMED_DATA_PATH']

    intermedJsonPath = os.path.join(INTERMED_DATA_PATH, 'intermed_dict.json')
    tweets_classified_path = os.path.join(INTERMED_DATA_PATH, 'tweets_classified.txt')
    refRiskCatPath = os.path.join(INTERMED_DATA_PATH, 'risk_category_file.json')

    with open(intermedJsonPath, 'r') as f:
        intermed_data = json.load(f)

    with open(refRiskCatPath, 'r') as f:
        risk_data = json.load(f)

    ## reading data into the dictionaries again
    tweet_dict = dict(intermed_data['tweet_dict'])
    tweet_cmplt = dict(intermed_data['tweet_cmplt'])

    processTweets(tweet_dict, risk_data, tweet_cmplt, tweets_classified_path,
                  vector_type='word_embeddings')

    print('DONE')
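A minimal sketch of the shape intermed_dict.json is assumed to have, inferred only from the reads above (two mappings keyed by tweet id); the ids, tokens, and output path are invented for illustration.

# Sketch under assumptions: intermed_dict.json holds 'tweet_dict' and
# 'tweet_cmplt' mappings that json.load()/dict() above would accept.
# All values below are made up.
import json
import os

intermed_data = {
    'tweet_dict': {'1001': ['storm', 'warning', 'coast'],
                   '1002': ['flood', 'road', 'closed']},
    'tweet_cmplt': {'1001': 'Storm warning issued for the coast',
                    '1002': 'Flood: the road is closed'},
}

out_path = os.path.join('.', 'intermed_dict.json')  # hypothetical location
with open(out_path, 'w') as f:
    json.dump(intermed_data, f)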
Example #2
def get_sentiment_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    sentiment_file_path = os.path.join(ref_data_path,
                                       'sentiment_data/sentiment_lexicon.txt')
    sentiment_dict = defaultdict(dict)

    ## Reading and storing Emoticon words
    with open(sentiment_file_path, 'r') as f:
        emoticon_data = f.readlines()

    ## Creating Sentiment Dictionary
    for line in emoticon_data:
        line_split = line.split('\t')
        emoticon = line_split[0]
        mean_variance = [float(line_split[1]), float(line_split[2])]
        sentiment_dict[emoticon] = mean_variance

    return sentiment_dict
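A self-contained sketch of the tab-separated format the loop above assumes (one "token, mean, variance" entry per line); the tokens and scores here are invented for illustration.

# Sketch under assumptions: each lexicon line is "<token>\t<mean>\t<variance>".
import io

sample_lexicon = ":)\t0.72\t0.05\n:(\t-0.64\t0.08\n"

sentiment_dict = {}
for line in io.StringIO(sample_lexicon):
    token, mean, variance = line.rstrip('\n').split('\t')
    sentiment_dict[token] = [float(mean), float(variance)]

print(sentiment_dict[':)'])   # [0.72, 0.05]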
Example #3
def get_tweet_data(file_type, file_name):
    locations = lc.get_locations()

    if file_type == 'json':
        TWEETS_DATA_PATH = os.path.join(locations['INTERMED_DATA_PATH'],
                                        file_name)
        tweet_data = pd.read_json(TWEETS_DATA_PATH,
                                  orient='records',
                                  convert_axes=False)
        tweet_data['tweet_cmplt'] = prepare_data(tweet_data['tweet_cmplt'],
                                                 'x_data')
        return tweet_data

    elif file_type == 'txt':
        TWEETS_DATA_PATH = os.path.join(locations['INTERMED_DATA_PATH'],
                                        file_name)

        if file_name == 'tweets_classified.txt':
            tweet_data = pd.read_csv(TWEETS_DATA_PATH,
                                     sep='|',
                                     names=[
                                         'tweet_id', 'class_pos', 'class_both',
                                         'tweet_word_list', 'tweet_cmplt'
                                     ]).drop_duplicates().set_index('tweet_id')
        elif file_name == 'tweet_truth.txt':
            tweet_data = pd.read_csv(
                TWEETS_DATA_PATH, sep='|').drop_duplicates(
                    subset='tweet_id').set_index('tweet_id')
            tweet_data['class_annotated'] = prepare_data(
                tweet_data['class_annotated'], 'y_data', list_type=str)

        tweet_data['tweet_cmplt'] = prepare_data(tweet_data['tweet_cmplt'],
                                                 'x_data')
        tweet_data['tweet_word_list'] = prepare_data(
            tweet_data['tweet_word_list'], 'word_bag')
        tweet_data['class_pos'] = prepare_data(tweet_data['class_pos'],
                                               'y_data')
        tweet_data['class_both'] = prepare_data(tweet_data['class_both'],
                                                'y_data')

        tweet_data.to_dict('index')

        return tweet_data
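For the 'tweets_classified.txt' branch above, a self-contained sketch of the same pipe-separated loading pattern (read_csv with sep='|', explicit column names, drop_duplicates, tweet_id index); the two rows are fabricated.

# Sketch: parsing '|'-separated classified-tweet rows as in the 'txt' branch.
import io
import pandas as pd

raw = ("1001|2|3|['storm', 'coast']|Storm warning issued for the coast\n"
       "1002|1|1|['flood', 'road']|Flood: the road is closed\n")

tweet_data = pd.read_csv(io.StringIO(raw),
                         sep='|',
                         names=['tweet_id', 'class_pos', 'class_both',
                                'tweet_word_list', 'tweet_cmplt']
                         ).drop_duplicates().set_index('tweet_id')

print(tweet_data.loc[1001, 'tweet_cmplt'])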
Example #4
def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH'] + 'sentiment_data'
    intermed_data_dir = locations['INTERMED_DATA_PATH']
    x_filename = 'tweets.txt'

    ## load and process samples
    print('start loading and processing samples...')

    tweets = []
    more_features = []

    with open(os.path.join(ref_data_dir, x_filename), encoding='utf-8') as f:
        for i, line in enumerate(f):
            tweet_meta_features = {}
            tweet_obj = json.loads(line.strip())

            # Tweet text contents
            content = tweet_obj['text'].replace("\n", " ")

            postprocessed_tweet, microblogging_features, lexicon_features = pre_process(
                content)

            tweets.append(postprocessed_tweet)
            tweet_meta_features['microblogging_features'] = microblogging_features
            tweet_meta_features['lexicon_features'] = lexicon_features
            more_features.append(tweet_meta_features)

    # Write processed tweet text to file
    with open(os.path.join(ref_data_dir, 'tweets_processed.txt'), 'w') as f:
        for tweet in tweets:
            f.write('%s\n' % tweet)

    # Write additional tweet features to file
    with open(os.path.join(ref_data_dir, 'more_tweet_features.txt'),
              'w',
              encoding='utf-8') as f:
        f.write(json.dumps(more_features, ensure_ascii=False))

    print("Preprocessing is completed")
Example #5
def get_twitter_abbreviations_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    abbr_file_path = os.path.join(ref_data_path,
                                  'twitter_slang/twitter_slang_data.txt')

    data = pd.read_csv(abbr_file_path, sep='|', names=['abbr', 'meaning'])
    data_len = len(data['abbr'])

    abbr_list = data['abbr'].values
    meaning_list = data['meaning'].values
    slang_dict = {}

    for ind in range(data_len):
        slang_dict[abbr_list[ind]] = meaning_list[ind]

    return slang_dict
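A sketch of how the returned slang dictionary could be used to expand abbreviations in a tokenized tweet; the entries below are illustrative and not taken from twitter_slang_data.txt.

# Sketch: expanding abbreviations with a slang dictionary of the same shape
# as the one returned above. The entries here are made up.
slang_dict = {'idk': "i don't know", 'btw': 'by the way'}

def expand_slang(tokens, slang):
    # Replace each known abbreviation with its expansion, token by token.
    return [slang.get(tok.lower(), tok) for tok in tokens]

print(expand_slang(['btw', 'the', 'road', 'is', 'closed'], slang_dict))
# ['by the way', 'the', 'road', 'is', 'closed']

The index loop in the snippet could equivalently be written as dict(zip(data['abbr'], data['meaning'])), which avoids positional indexing into the two columns.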
Example #6
def get_news_data(folder_name, file_name):
    locations = lc.get_locations()
    NEWS_DATA_PATH = os.path.join(locations['REF_DATA_PATH'],
                                  folder_name + '/' + file_name)

    news_data = pd.read_csv(NEWS_DATA_PATH)
    news_data = news_data.drop(["Unnamed: 0"],
                               axis=1).set_index('Unnamed: 0.1')

    news_data['category'] = prepare_data(news_data['category'],
                                         'y_data',
                                         list_type=int)
    news_data['text'] = prepare_data(news_data['text'], 'x_data')
    news_data = news_data[news_data.text != 'None']

    news_data.to_dict('index')

    return news_data
Example #7
def get_mpqa_data():
    locations = lc.get_locations()
    ref_data_path = locations['REF_DATA_PATH']
    mpqa_file_path = os.path.join(
        ref_data_path,
        'subjectivity_clues_hltemnlp05/subjclueslen1-HLTEMNLP05.tff')

    sentiment_map = lc.get_sentiment_map()

    with open(mpqa_file_path, 'r') as f:
        data = f.readlines()

    mpqa_dict = {}

    for line in data:
        elem_bag = line.strip('\n').split(' ')
        for elem in elem_bag:
            item = elem.split('=')
            if item[0] == 'word1':
                word = item[1]
            elif item[0] == 'priorpolarity':
                binary_output = [0] * len(sentiment_map)
                if sentiment_map.get(item[1]) is not None:
                    binary_output[sentiment_map.get(item[1])] = 1
                    mpqa_dict[word] = binary_output

    return mpqa_dict
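A self-contained sketch of the same key=value parsing over a single MPQA-style entry; the sentiment_map here is a stand-in, since the snippet obtains the real one from lc.get_sentiment_map().

# Sketch: parsing one subjclueslen1-style entry into a one-hot polarity vector.
# sentiment_map and the example entry are illustrative only.
sentiment_map = {'negative': 0, 'neutral': 1, 'positive': 2}

line = "type=weaksubj len=1 word1=abandon pos1=verb stemmed1=y priorpolarity=negative"

mpqa_dict = {}
word = None
for elem in line.strip().split(' '):
    key, _, value = elem.partition('=')
    if key == 'word1':
        word = value
    elif key == 'priorpolarity' and value in sentiment_map:
        one_hot = [0] * len(sentiment_map)
        one_hot[sentiment_map[value]] = 1
        mpqa_dict[word] = one_hot

print(mpqa_dict)   # {'abandon': [1, 0, 0]}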
Example #8
def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH']

    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    ## load and process samples
    print('start loading and processing samples...')

    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []

    with open(os.path.join(ref_data_dir, x_filename), encoding='utf-8') as f:
        for i, line in enumerate(f):
            tweet_obj = json.loads(line.strip())

            # Tweet text contents
            content = tweet_obj['text'].replace("\n", " ")

            tweets_lst.append(pre_process_lst(content))

            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(
                content)

            tweets.append(postprocessed_tweet)
            microblog_features.append(microblogging_features)
            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst,
                                                      ngram_range=2)
    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)
    #
    # tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets, ngram_range=2)
    #
    # transformed_data_mine = tf_idf_vectorizer.fit_transform(tweets)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = f.readlines()

    y_data = [y.strip('\n') for y in y_data]
    y_data = np.asarray(y_data)
    num_of_features = 50
    accuracy_in_each_turn = []
    while num_of_features <= 3000:
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(
            transformed_data_rahul, y_data)

        extended_features_1 = np.append(X_new.toarray(),
                                        lexicon_features,
                                        axis=1)
        extended_features_2 = np.append(extended_features_1,
                                        microblog_features,
                                        axis=1)

        sentiment_map = lrf_config.get_sentiment_map()
        inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

        X_data = X_new.toarray()

        kf = KFold(n_splits=5)
        kf.get_n_splits(X_data)
        train_list = []
        test_list = []

        for train_index, test_index in kf.split(X_data):
            X_train = X_data[train_index]
            Y_train = y_data[train_index]
            X_test = X_data[test_index]
            Y_test = y_data[test_index]

            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train,
                Y_train=Y_train,
                X_test=X_test,
                Y_test=Y_test,
                class_map=inv_sentiment_map,
                is_X_text=False)

            train_list.append(train_acc)
            test_list.append(test_acc)

        # Average the per-fold accuracies for this feature count
        accuracy_in_each_turn.append([np.mean(train_list), np.mean(test_list)])

        # Move on to the next feature count
        num_of_features += 50

    for elem in accuracy_in_each_turn:
        print(elem)
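The sweep above depends on project-specific helpers (utility, classifier); a minimal, self-contained sketch of the same pattern, chi2-based SelectKBest over a range of k with cross-validated SVC accuracy, using only scikit-learn and a toy dataset.

# Minimal sketch of the k-sweep pattern above on the digits toy dataset.
import numpy as np
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

X, y = load_digits(return_X_y=True)   # non-negative features, as chi2 requires

accuracy_per_k = []
for k in range(10, 61, 10):
    X_k = SelectKBest(chi2, k=k).fit_transform(X, y)
    scores = cross_val_score(SVC(), X_k, y, cv=5)
    accuracy_per_k.append((k, np.mean(scores)))

for k, acc in accuracy_per_k:
    print(k, round(acc, 3))

Note that, as in the snippet, feature selection here is fit on the full dataset before cross-validation, which can leak information across folds; fitting SelectKBest inside each fold (for example via a Pipeline) avoids that.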
Example #9
from sklearn import preprocessing
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from time import time
import json
import matplotlib.pyplot as plt
from tqdm import tqdm
from lrf.configs import lrf_config
import os
from scipy.sparse import coo_matrix, hstack

locations = lrf_config.get_locations()
# intermed_data_dir = locations['INTERMED_DATA_PATH']
ref_data_dir = os.path.join(locations['REF_DATA_PATH'], 'sentiment_data')


########################################################################################################################
def select_top_k_features(data, labels, n_components=1700):
    data = SelectKBest(chi2, k=n_components).fit_transform(data, labels)
    return data


########################################################################################################################
def load_and_process(data_file, label_file):
    print("Loading data...")
    with open(os.path.join(ref_data_dir, data_file), 'r') as f:
        x = f.readlines()
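select_top_k_features above is a thin wrapper around SelectKBest(chi2, ...); a self-contained usage sketch on a tiny made-up count matrix, with n_components shrunk from the default 1700 to fit the toy data.

# Sketch: using the select_top_k_features wrapper on fabricated count data.
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

def select_top_k_features(data, labels, n_components=1700):
    data = SelectKBest(chi2, k=n_components).fit_transform(data, labels)
    return data

X = np.array([[3, 0, 1, 0],
              [0, 2, 0, 1],
              [4, 0, 2, 0],
              [0, 3, 0, 2]])
y = np.array([0, 1, 0, 1])

X_top = select_top_k_features(X, y, n_components=2)
print(X_top.shape)   # (4, 2)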
Example #10
def main():
    locations = lrf_config.get_locations()
    glove_data_dict = get_glove_dict(locations['INTERMED_DATA_PATH'] +
                                     'glove_key_subset.json')


def glove_classification(data_dict, keywords, keyword_type, glove_data_file, glove_key_file):
    locations = lrf_config.get_locations()

    glove_data_dict = utility.get_glove_dict(locations['INTERMED_DATA_PATH'] + glove_data_file)
    glove_key_dict = utility.get_glove_dict(locations['INTERMED_DATA_PATH'] + glove_key_file)

    glove_crux_pos = utility.getWordToCategMap(keywords, glove_key_dict, 'pos')
    pos_key_glove_arr = glove_crux_pos['key_glove_arr']
    inv_pos_key_index = glove_crux_pos['inv_key_index']
    pos_risk_dict = glove_crux_pos['risk_dict']

    if keyword_type == 'both':
        glove_crux_neg = utility.getWordToCategMap(keywords, glove_key_dict, 'neg')
        neg_key_glove_arr = glove_crux_neg['key_glove_arr']
        inv_neg_key_index = glove_crux_neg['inv_key_index']
        neg_risk_dict = glove_crux_neg['risk_dict']

    pos_predictions = []
    both_predictions = []

    for tweet in data_dict:
        data_lst = []
        for word in tweet:
            if word in glove_data_dict:
                data_lst.append(glove_data_dict[word][0])

        ## Preparing Tweet Array
        data_arr = np.asarray(data_lst)

        if len(data_arr) != 0:
            ## Calculating cosine similarity
            pos_cos_similarity = cosine_similarity(data_arr, pos_key_glove_arr)
            pos_nearest_neighbors = np.argsort(pos_cos_similarity, axis=1)[:, -10:]
            pos_tweet_neighbors = [item for sublist in pos_nearest_neighbors for item in sublist]

            membership_count = {}
            membership_count_pos = utility.getMembershipCount(pos_tweet_neighbors, inv_pos_key_index,
                                                              pos_risk_dict,
                                                              membership_count)
            v_pos = list(membership_count_pos.values())
            k_pos = list(membership_count_pos.keys())
            output_pos = k_pos[v_pos.index(max(v_pos))]

            if keyword_type == 'both':
                neg_cos_similarity = cosine_similarity(data_arr, neg_key_glove_arr)
                neg_nearest_neighbors = np.argsort(neg_cos_similarity, axis=1)[:, :10]
                neg_tweet_neighbors = [item for sublist in neg_nearest_neighbors for item in sublist]

                membership_count_both = utility.getMembershipCount(neg_tweet_neighbors, inv_neg_key_index,
                                                                   neg_risk_dict,
                                                                   membership_count_pos.copy())
                v_both = list(membership_count_both.values())
                k_both = list(membership_count_both.keys())
                output_both = k_both[v_both.index(max(v_both))]

            pos_predictions.append(output_pos)
            both_predictions.append(output_both if keyword_type == 'both' else None)

    return pos_predictions, both_predictions
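glove_classification relies on helpers that are not shown (utility.getWordToCategMap, utility.getMembershipCount); a minimal, self-contained sketch of the underlying idea it implements, embed a tweet's words, compare them to category keyword vectors by cosine similarity, and vote among the nearest keywords, using tiny invented vectors and categories.

# Minimal sketch of the nearest-keyword voting idea above, with made-up
# 3-d "embeddings" standing in for GloVe vectors.
from collections import Counter

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Hypothetical keyword embeddings and their categories.
keyword_vectors = np.array([[1.0, 0.1, 0.0],    # 'flood'
                            [0.9, 0.2, 0.1],    # 'storm'
                            [0.0, 1.0, 0.2]])   # 'theft'
keyword_category = ['weather', 'weather', 'crime']

# Hypothetical embedded words of one tweet.
tweet_vectors = np.array([[0.95, 0.15, 0.05],
                          [0.80, 0.30, 0.00]])

# For each tweet word, take the 2 most similar keywords and vote on categories.
similarity = cosine_similarity(tweet_vectors, keyword_vectors)
nearest = np.argsort(similarity, axis=1)[:, -2:]
votes = Counter(keyword_category[idx] for row in nearest for idx in row)
print(votes.most_common(1)[0][0])   # 'weather'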