Example #1
import os

import numpy as np
from sklearn.preprocessing import scale


def load_and_process(data_file, label_file):
    # data_dir is assumed to be a module-level path to the data directory.
    with open(os.path.join(data_dir, data_file), 'r') as f:
        x = f.readlines()
    with open(os.path.join(data_dir, label_file), 'r') as f:
        y = np.array(f.readlines())

    # Fit a bigram TF-IDF vectorizer (project-local helper) and vectorize.
    tfidf_vectorizer = utility.get_tf_idf_vectorizer(data=x, ngram_range=2)
    x_feats = tfidf_vectorizer.transform(x)

    # Scale without centering so the sparse matrix stays sparse.
    x_feats = scale(x_feats, with_mean=False)

    return x_feats, y
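A minimal usage sketch for the helper above; the file names are placeholders, and utility.get_tf_idf_vectorizer is assumed to return an already-fitted vectorizer exposing scikit-learn's transform interface:

# Hypothetical call; 'samples.txt' and 'labels.txt' stand in for files with
# one raw-text sample (or label) per line under data_dir.
x_feats, y = load_and_process('samples.txt', 'labels.txt')
print(x_feats.shape, len(y))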
Example #2
def main():
    fs = FeatureSelection()

    # Project-local loader returning a DataFrame with 'text' and 'category'.
    data = datasets.get_news_data('keyword_data',
                                  'annotator_data_dump_with_text')
    X_data, Y_data = data['text'].values, data['category'].values

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_data)
    X_data_tf = tf_idf_vectorizer.transform(X_data)

    Y_binary = utility.binarize_data(data=Y_data)

    # Chi-square feature selection: keep the 50 highest-scoring features.
    res = fs.ChiSquare.chiSquare(X_data_tf, Y_binary, 50)

    # Print the top 20 entries of the selection result.
    for i in range(20):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        print(res[i])
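fs.ChiSquare.chiSquare is a project-local helper; for readers without it, a rough scikit-learn equivalent of selecting the top 50 TF-IDF features by chi-square score (an assumption about what the helper computes) looks like this:

from sklearn.feature_selection import SelectKBest, chi2

# Score each TF-IDF column against the class labels and keep the top 50.
selector = SelectKBest(chi2, k=50)
X_top = selector.fit_transform(X_data_tf, Y_data)
top_feature_indices = selector.get_support(indices=True)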
Example #3
import os
import json

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold


def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH']

    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    # Load and preprocess the samples.
    print('start loading and processing samples...')

    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []

    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            # One JSON-encoded tweet per line.
            tweet_obj = json.loads(line.strip())

            # Tweet text content, flattened to a single line.
            content = tweet_obj['text'].replace("\n", " ")

            tweets_lst.append(pre_process_lst(content))

            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(
                content)

            tweets.append(postprocessed_tweet)
            microblog_features.append(microblogging_features)
            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst,
                                                      ngram_range=2)

    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = f.readlines()

    y_data = np.asarray([y.strip('\n') for y in y_data])

    # Sweep the number of chi2-selected features from 50 to 3000.
    num_of_features = 50
    accuracy_in_each_turn = []
    while num_of_features <= 3000:
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(
            transformed_data_rahul, y_data)

        # Hand-crafted lexicon/microblog features appended to the selected
        # TF-IDF block (note: the folds below use only X_new as written).
        extended_features_1 = np.append(X_new.toarray(),
                                        lexicon_features,
                                        axis=1)
        extended_features_2 = np.append(extended_features_1,
                                        microblog_features,
                                        axis=1)

        sentiment_map = lrf_config.get_sentiment_map()
        inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

        X_data = X_new.toarray()

        # 5-fold cross-validation over the selected features.
        kf = KFold(n_splits=5)
        train_list = []
        test_list = []

        for train_index, test_index in kf.split(X_data):
            X_train, Y_train = X_data[train_index], y_data[train_index]
            X_test, Y_test = X_data[test_index], y_data[test_index]

            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train,
                Y_train=Y_train,
                X_test=X_test,
                Y_test=Y_test,
                class_map=inv_sentiment_map,
                is_X_text=False)

            train_list.append(train_acc)
            test_list.append(test_acc)

        # Average accuracy across the folds for this feature count.
        accuracy_in_each_turn.append([np.mean(train_list),
                                      np.mean(test_list)])

        # Advance the sweep (a step of 50 is assumed; without an increment
        # the loop never terminates).
        num_of_features += 50

    for elem in accuracy_in_each_turn:
        print(elem)
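One caveat in the sweep above: np.append on X_new.toarray() densifies the TF-IDF block, which gets expensive at higher feature counts. A sketch of the same concatenation kept sparse with scipy, assuming lexicon_features and microblog_features are 2-D arrays with one row per tweet:

from scipy.sparse import csr_matrix, hstack

# Stack the chi2-selected TF-IDF block with the hand-crafted feature
# columns without ever materializing a dense TF-IDF array.
extended = hstack([X_new,
                   csr_matrix(lexicon_features),
                   csr_matrix(microblog_features)]).tocsr()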
Example #4
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


def tf_idf_classification(self, data_dict, keywords, keyword_type='both'):
    data_arr = []
    data_index = {}

    # Flatten each record to a single string and remember its original key.
    for i, ind in enumerate(data_dict):
        record = data_dict[ind]
        data_index[i] = ind
        if isinstance(record, list):
            data_arr.append(' '.join(record))
        elif isinstance(record, str):
            data_arr.append(record)

    pos_risk = []
    neg_risk = []
    category_index = {}

    # Build one keyword "document" per category, indexed by position.
    for ind, category in enumerate(keywords):
        pos_risk.append(' '.join(keywords[category]['pos'].keys()))
        category_index[ind] = category
        if keyword_type == 'both':
            neg_risk.append(' '.join(keywords[category]['neg'].keys()))

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(data_arr)
    data_tfidf = tf_idf_vectorizer.transform(data_arr)
    pos_category_tfidf = tf_idf_vectorizer.transform(pos_risk)

    # Assign each record to its most similar positive-keyword category.
    cos_sim_pos = cosine_similarity(data_tfidf, pos_category_tfidf)
    pos_res = np.argmax(cos_sim_pos, axis=1)

    if keyword_type == 'both':
        neg_category_tfidf = tf_idf_vectorizer.transform(neg_risk)
        cos_sim_neg = cosine_similarity(data_tfidf, neg_category_tfidf)
        # Net similarity: reward positive-keyword matches, penalize negative.
        cos_sim_both = cos_sim_pos - cos_sim_neg
        both_res = np.argmax(cos_sim_both, axis=1)

    pos_result = {}
    both_result = {}
    for i in data_index:
        pos_result[data_index[i]] = category_index[pos_res[i]]
        if keyword_type == 'both':
            both_result[data_index[i]] = category_index[both_res[i]]

    return pos_result, both_result
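A hypothetical call to the method above (clf stands in for an instance of the enclosing class); the record and keyword structures are illustrative, assuming keywords maps each category to 'pos'/'neg' keyword dictionaries:

data_dict = {'doc1': 'credit default swap exposure',
             'doc2': ['steady', 'revenue', 'growth']}
keywords = {'market_risk': {'pos': {'default': 1, 'exposure': 1},
                            'neg': {'growth': 1}}}

pos_result, both_result = clf.tf_idf_classification(data_dict, keywords)
# Results are keyed by the original record ids, e.g. {'doc1': 'market_risk', ...}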
Example #5
import sklearn.model_selection


def classify(classifier_type,
             X_train,
             Y_train=None,
             X_test=None,
             Y_test=None,
             keywords=None,
             class_map=None,
             is_X_text=True):

    if classifier_type in ('svc', 'lr', 'ada_boost'):

        if Y_train is None:
            raise ValueError(classifier_type +
                             ' is a supervised algorithm; pass training labels.')

        elif X_test is None and Y_test is None:
            # No test data supplied: hold out part of the training data.
            print('Since no TEST data was provided, splitting the given data '
                  'into train and test sets')
            X_train, X_test, Y_train, Y_test = \
                sklearn.model_selection.train_test_split(X_train, Y_train)

        X_train = utility.get_str_from_list(X_train)
        X_test = utility.get_str_from_list(X_test)

        # Binarize the training labels; the fitted binarizer lets callers
        # map binary predictions back to label names.
        if class_map is not None:
            fitted_binarizer, Y_train_binary = utility.binarize_data(
                Y_train, class_mapping=class_map)
        else:
            fitted_binarizer, Y_train_binary = utility.binarize_data(Y_train)

        if Y_test is not None:
            f, Y_test_binary = utility.binarize_data(Y_test,
                                                     class_mapping=class_map)

        if is_X_text:
            tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
            X_train_tf_idf = tf_idf_vectorizer.transform(X_train)
            X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
        else:
            X_train_tf_idf = X_train
            X_test_tf_idf = X_test

        if classifier_type == 'svc':
            svc_class = supervised.SupervisedClassifier.SvcClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = svc_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = svc_class.classify(X_train_tf_idf,
                                                       Y_train_binary,
                                                       X_test_tf_idf)
                return Y_pred, train_acc

        elif classifier_type == 'lr':
            lr_class = supervised.SupervisedClassifier.LogisticRClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = lr_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = lr_class.classify(X_train_tf_idf,
                                                      Y_train_binary,
                                                      X_test_tf_idf)
                return Y_pred, train_acc

        elif classifier_type == 'ada_boost':
            ada_boost_class = supervised.SupervisedClassifier.AdaBoostClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf)
                return Y_pred, train_acc

    elif classifier_type == 'cosineSim':
        # Unsupervised path: match documents against keyword lists.
        cosine_sim_class = unsupervised.UnsupervisedClassifiers.CosineSimilarity()

        Y_pred_pos, Y_pred_both = cosine_sim_class.classify(
            X_train, keywords, vector_type='word_embeddings')
        return Y_pred_pos, Y_pred_both
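A minimal supervised invocation of classify; the texts and labels below are placeholders, and the project-local utility and supervised modules are assumed to be importable:

texts = ['loved the product', 'terrible support', 'works as expected',
         'would buy again']
labels = ['pos', 'neg', 'neu', 'pos']

# Train on the first three samples, evaluate on the last one.
Y_pred, train_acc, test_acc = classify('svc',
                                       X_train=texts[:3],
                                       Y_train=labels[:3],
                                       X_test=texts[3:],
                                       Y_test=labels[3:])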