Example #1
import os

import numpy as np
from sklearn.preprocessing import scale

# `utility` here (and `lrf_config`, `datasets`, `classifier`, `supervised`,
# `unsupervised` in later examples) are project-local modules from the
# original repository and are not shown on this page.

def load_and_process(data_file, label_file):
    # `data_dir` is assumed to be a module-level constant in the original file.
    with open(os.path.join(data_dir, data_file), 'r') as f:
        x = f.readlines()
    with open(os.path.join(data_dir, label_file), 'r') as f:
        y = np.array(f.readlines())

    # Build TF-IDF features over the raw lines (ngram_range is forwarded to
    # the project helper; see the sketch below).
    tfidf_vectorizer = utility.get_tf_idf_vectorizer(data=x, ngram_range=2)
    x_feats = tfidf_vectorizer.transform(x)

    # Sparse matrices cannot be mean-centred in place, hence with_mean=False.
    x_feats = scale(x_feats, with_mean=False)

    return x_feats, y
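
Every example on this page leans on `utility.get_tf_idf_vectorizer`, whose source is not included. A minimal sketch of what it plausibly does, assuming it wraps scikit-learn's TfidfVectorizer and that an integer ngram_range=n means "unigrams up to n-grams":

from sklearn.feature_extraction.text import TfidfVectorizer

def get_tf_idf_vectorizer(data, ngram_range=1):
    # Fit on the corpus and return the fitted vectorizer so that callers can
    # transform training and test text with the same vocabulary.
    vectorizer = TfidfVectorizer(ngram_range=(1, ngram_range))
    vectorizer.fit(data)
    return vectorizer

Returning the fitted object rather than a transformed matrix matches how every snippet immediately calls .transform() on the result.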
Example #2
def keyword_driver(classifier_type, X_train, Y_train, num_of_keywords=50):
    # Flatten tokenised documents into plain strings if needed.
    if all(isinstance(n, list) for n in X_train):
        X_train = utility.get_str_from_list(X_train)

    binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())

    # binarize_data returns (fitted_binarizer, binary_labels); the [1] below
    # picks out the binary label matrix.
    Y_train_binary = utility.binarize_data(Y_train)

    tfidf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
    X_tfidf = tfidf_vectorizer.transform(X_train)

    model = classifier.get_classification_model(classifier_type, X_tfidf,
                                                Y_train_binary[1])

    keywords = get_keywords(X_train, model, binarizer,
                            num_of_keywords=num_of_keywords)
    return keywords
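
A plausible invocation, reusing the data loading seen in Examples #4 and #5 (the column names 'text' and 'category' are taken from those snippets):

news_data = datasets.get_news_data('keyword_data',
                                   'annotator_data_dump_with_text')
keywords = keyword_driver('svc',
                          news_data['text'].values,
                          news_data['category'].values,
                          num_of_keywords=50)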
Example #3
def get_keywords(train_data, classifier, binarizer,
                 class_mapping=lrf_config.get_class_map(), num_of_keywords=20):
    # Note: the default class_mapping is evaluated once, at definition time.
    tfidf_vocab = utility.get_tf_idf_vectorizer(train_data).vocabulary_

    # Invert term -> column index into column index -> term, so classifier
    # coefficients can be mapped back to words.
    tfidf_reversed_vocab = {i: word for word, i in tfidf_vocab.items()}

    top_keywords = {}
    for key in class_mapping.keys():
        if key != 'skip':
            keywords = get_keywords_for_tag(binarizer, classifier, key,
                                            tfidf_reversed_vocab,
                                            num_of_keywords)
            top_keywords[key] = keywords

    return top_keywords
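
The vocabulary inversion is the pivotal step: scikit-learn's vocabulary_ maps term -> column index, and the comprehension flips it so that coefficient indices can be mapped back to words. A tiny self-contained illustration (the vocabulary values are made up):

tfidf_vocab = {'credit': 0, 'risk': 1, 'market': 2}            # term -> column
tfidf_reversed_vocab = {i: w for w, i in tfidf_vocab.items()}  # column -> term
assert tfidf_reversed_vocab[1] == 'risk'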
Example #4
def main():
    fs = FeatureSelection()

    data = datasets.get_news_data('keyword_data',
                                  'annotator_data_dump_with_text')
    X_data, Y_data = data['text'].values, data['category'].values

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_data)
    X_data_tf = tf_idf_vectorizer.transform(X_data)

    Y_binary = utility.binarize_data(data=Y_data)

    # Keep the 50 features with the highest chi-square scores, then print
    # the first 20 results. (The original printed res[0] twenty times, which
    # looks like an index slip; res[i] is assumed to be the intent.)
    res = fs.ChiSquare.chiSquare(X_data_tf, Y_binary, 50)
    for i in range(20):
        print('%%%%%%%%%%%%%%%%%%%%%%%%%%%')
        print(res[i])
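
FeatureSelection and its ChiSquare helper are not shown on this page; below is a plausible stand-in built from the same scikit-learn primitives that Example #8 calls directly (SelectKBest with chi2), not the project's actual implementation:

from sklearn.feature_selection import SelectKBest, chi2

def chi_square_top_k(X, y, k=50):
    # Fit the selector, then return the column indices of the k best features.
    selector = SelectKBest(chi2, k=k).fit(X, y)
    return selector.get_support(indices=True)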
Example #5
def main():
    print("main_code goes here")

    classifier_type = 'svc'

    news_data = datasets.get_news_data('keyword_data',
                                       'annotator_data_dump_with_text')
    train_data, test_data = utility.split_data(news_data)

    X_train = train_data['text']
    Y_train = train_data['category']

    binarizer = utility.get_multilabel_binarizer(lrf_config.get_class_map())
    # Note: Example #2 indexes this result with [1], and Example #7 unpacks
    # it as (fitted_binarizer, binary_labels); passing it whole, as below,
    # may be a slip in the original.
    Y_train_binary = utility.binarize_data(Y_train)

    tfidf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
    X_tfidf = tfidf_vectorizer.transform(X_train)

    model = classifier.get_classification_model(classifier_type, X_tfidf,
                                                Y_train_binary)

    h = get_keywords(X_train, model, binarizer)
    print(h)

# The snippet is truncated here: the lines that follow are the tail of a
# get_keywords_for_tag definition. Below is a hedged reconstruction of the
# missing header and positive-keyword loop (the coef_ lookup is an assumption
# about how `cof` was obtained).
import numpy as np

def get_keywords_for_tag(binarizer, classifier, tag, tfidf_reversed_vocab,
                         number_of_words=20):
    # Assumed: select this tag's coefficient row from a fitted linear model.
    cof = [classifier.coef_[list(binarizer.classes_).index(tag)]]

    top_positive_words = {}
    top_negative_words = {}

    # Assumed mirror of the surviving loop: the largest coefficients are the
    # strongest positive keywords.
    for k in np.argsort(cof[0])[-number_of_words:]:
        top_positive_words[tfidf_reversed_vocab[k]] = cof[0][k]

    # Surviving original lines: the smallest coefficients are the strongest
    # negative keywords.
    for k in np.argsort(cof[0])[:number_of_words]:
        top_negative_words[tfidf_reversed_vocab[k]] = cof[0][k]

    keywords = {'pos': top_positive_words, 'neg': top_negative_words}

    return keywords

######################## BLOCK 24
def get_keywords(train_data, classifier, binarizer,
                 class_mapping=lrf_config.get_class_map(), num_of_keywords=20):

    tfidf_vocab = utility.get_tf_idf_vectorizer(train_data).vocabulary_
    tfidf_reversed_vocab = {i: word for word, i in tfidf_vocab.items()}

    top_keywords = {}
    for key in class_mapping.keys():
        keywords = get_keywords_for_tag(binarizer, classifier, key,
                                        tfidf_reversed_vocab, num_of_keywords)
        top_keywords[key] = keywords

    return top_keywords

Example #7
import pandas as pd
import sklearn.model_selection


def classify_bkp(classifier_type,
                 X_train,
                 Y_train=None,
                 X_test=None,
                 Y_test=None,
                 keywords=None,
                 class_map=None,
                 is_X_text=True):

    if classifier_type in ['svc', 'lr', 'ada_boost']:

        if Y_train is None:
            raise ValueError(classifier_type +
                             ' is a supervised algorithm; pass training labels')

        elif X_test is None and Y_test is None:
            print('Since no test data was provided, splitting the given data '
                  'into train and test')

            # Zip features and labels into a two-column DataFrame
            # (column 0 = X, column 1 = Y) and split it.
            train_data = zip(X_train, Y_train)
            train_data, test_data = sklearn.model_selection.train_test_split(
                pd.DataFrame.from_records(train_data))

            X_train, Y_train = train_data[0], train_data[1]
            X_test, Y_test = test_data[0], test_data[1]

        X_train = utility.get_str_from_list(X_train)
        X_test = utility.get_str_from_list(X_test)

        if class_map is not None:
            fitted_binarizer, Y_train_binary = utility.binarize_data(
                Y_train, class_mapping=class_map)
        else:
            fitted_binarizer, Y_train_binary = utility.binarize_data(Y_train)

        if Y_test is not None:
            f, Y_test_binary = utility.binarize_data(Y_test,
                                                     class_mapping=class_map)

        if is_X_text:
            # Fit TF-IDF on the training text only, then transform both sets
            # with the same vocabulary.
            tf_idf_vectorizer = utility.get_tf_idf_vectorizer(X_train)
            X_train_tf_idf = tf_idf_vectorizer.transform(X_train)
            X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
        else:
            # The caller already supplied numeric feature matrices.
            X_train_tf_idf = X_train
            X_test_tf_idf = X_test

        if classifier_type == 'svc':
            svc_class = supervised.SupervisedClassifier.SvcClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = svc_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = svc_class.classify(X_train_tf_idf,
                                                       Y_train_binary,
                                                       X_test_tf_idf)
                return Y_pred, train_acc

        elif classifier_type == 'lr':
            lr_class = supervised.SupervisedClassifier.LogisticRClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = lr_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = lr_class.classify(X_train_tf_idf,
                                                      Y_train_binary,
                                                      X_test_tf_idf)
                return Y_pred, train_acc

        elif classifier_type == 'ada_boost':
            ada_boost_class = supervised.SupervisedClassifier.AdaBoostClassifier()

            if Y_test is not None:
                Y_pred, train_acc, test_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf,
                    Y_test_binary)
                return Y_pred, train_acc, test_acc
            else:
                Y_pred, train_acc = ada_boost_class.classify(
                    X_train_tf_idf, Y_train_binary, X_test_tf_idf)
                return Y_pred, train_acc

    elif classifier_type == 'cosineSim':
        cosine_sim_class = unsupervised.UnsupervisedClassifiers.CosineSimilarity()

        Y_pred_pos, Y_pred_both = cosine_sim_class.classify(
            X_train, keywords, vector_type='word_embeddings')
        return Y_pred_pos, Y_pred_both
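
A plausible invocation, mirroring the call to classifier.classify in Example #8 (which passes the same argument names):

Y_pred, train_acc, test_acc = classify_bkp('svc',
                                           X_train=X_train,
                                           Y_train=Y_train,
                                           X_test=X_test,
                                           Y_test=Y_test,
                                           class_map=inv_sentiment_map,
                                           is_X_text=False)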
Example #8
import json
import os

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import KFold


def main():
    locations = lrf_config.get_locations()
    ref_data_dir = locations['REF_DATA_PATH']

    x_filename = 'sentiment_data/tweets.txt'
    y_filename = 'sentiment_data/labels.txt'

    # Load and preprocess the samples. pre_process_lst and pre_process are
    # project-local helpers.
    print('start loading and processing samples...')

    tweets = []
    microblog_features = []
    lexicon_features = []
    tweets_lst = []

    with open(os.path.join(ref_data_dir, x_filename)) as f:
        for i, line in enumerate(f):
            # One JSON-encoded tweet per line. (The original passed
            # encoding='utf-8' to json.loads; that keyword was removed in
            # Python 3.9 and is unnecessary here.)
            tweet_obj = json.loads(line.strip())

            # Tweet text contents.
            content = tweet_obj['text'].replace("\n", " ")

            tweets_lst.append(pre_process_lst(content))

            postprocessed_tweet, microblogging_features, mpqa_sentiment_score = pre_process(
                content)

            tweets.append(postprocessed_tweet)
            microblog_features.append(microblogging_features)
            lexicon_features.append(mpqa_sentiment_score)

    lexicon_features = np.asarray(lexicon_features)
    microblog_features = np.asarray(microblog_features)

    tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets_lst,
                                                      ngram_range=2)
    transformed_data_rahul = tf_idf_vectorizer.fit_transform(tweets_lst)

    # tf_idf_vectorizer = utility.get_tf_idf_vectorizer(tweets, ngram_range=2)
    # transformed_data_mine = tf_idf_vectorizer.fit_transform(tweets)

    with open(os.path.join(ref_data_dir, y_filename)) as f:
        y_data = f.readlines()

    y_data = np.asarray([y.strip('\n') for y in y_data])

    num_of_features = 50
    accuracy_in_each_turn = []
    while num_of_features <= 3000:
        # Keep the k best TF-IDF columns by chi-square score ...
        X_new = SelectKBest(chi2, k=num_of_features).fit_transform(
            transformed_data_rahul, y_data)

        # ... and append the hand-crafted lexicon and microblog features.
        # (Note: extended_features_2 is built but the folds below use X_new
        # only.)
        extended_features_1 = np.append(X_new.toarray(),
                                        lexicon_features,
                                        axis=1)
        extended_features_2 = np.append(extended_features_1,
                                        microblog_features,
                                        axis=1)

        sentiment_map = lrf_config.get_sentiment_map()
        inv_sentiment_map = {str(v): k for k, v in sentiment_map.items()}

        X_data = X_new.toarray()

        # 5-fold cross-validation over the selected features.
        kf = KFold(n_splits=5)
        kf.get_n_splits(X_data)
        train_list = []
        test_list = []

        for train_index, test_index in kf.split(X_data):
            X_train = X_data[train_index]
            Y_train = y_data[train_index]
            X_test = X_data[test_index]
            Y_test = y_data[test_index]

            Y_pred, train_acc, test_acc = classifier.classify(
                'svc',
                X_train=X_train,
                Y_train=Y_train,
                X_test=X_test,
                Y_test=Y_test,
                class_map=inv_sentiment_map,
                is_X_text=False)

            train_list.append(train_acc)
            test_list.append(test_acc)

        # The original averaged the fold-local scalars train_acc/test_acc;
        # averaging the collected per-fold lists is evidently what was meant.
        accuracy_in_each_turn.append([np.mean(train_list),
                                      np.mean(test_list)])

        # The original never advanced num_of_features, so the loop could not
        # terminate; the step size of 50 is an assumption.
        num_of_features += 50

    for elem in accuracy_in_each_turn:
        print(elem)
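
The two np.append(..., axis=1) calls above simply concatenate feature blocks column-wise. A tiny shape check (the block widths are illustrative, not taken from the data):

import numpy as np

tfidf_block = np.zeros((4, 50))     # chi2-selected TF-IDF columns
lexicon_block = np.zeros((4, 1))    # assumed: one MPQA score per tweet
microblog_block = np.zeros((4, 3))  # assumed width

combined = np.append(np.append(tfidf_block, lexicon_block, axis=1),
                     microblog_block, axis=1)
print(combined.shape)               # (4, 54)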
Example #9
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# The enclosing class is not shown in the snippet; the name below is borrowed
# from Example #7's unsupervised.UnsupervisedClassifiers.CosineSimilarity.
class CosineSimilarity:

    def tf_idf_classification(self,
                              data_dict,
                              keywords,
                              keyword_type='both'):
        data_arr = []
        data_index = {}

        # Flatten each document into a string, remembering which key sits at
        # which row of the matrix.
        for i, ind in enumerate(data_dict):
            record = data_dict[ind]
            data_index[i] = ind

            if isinstance(record, list):
                data_arr.append(' '.join(record))
            elif isinstance(record, str):
                data_arr.append(record)

        pos_risk = []
        neg_risk = []
        category_index = {}

        # Build one keyword "document" per category. The original initialised
        # ind = 0 and never advanced it, so every category overwrote index 0;
        # enumerate supplies the intended running index.
        for ind, category in enumerate(keywords):
            pos_rec = ' '.join(keywords[category]['pos'].keys())
            pos_risk.append(pos_rec)
            category_index[ind] = category

            if keyword_type == 'both':
                neg_rec = ' '.join(keywords[category]['neg'].keys())
                neg_risk.append(neg_rec)

        tf_idf_vectorizer = utility.get_tf_idf_vectorizer(data_arr)
        data_tfidf = tf_idf_vectorizer.transform(data_arr)
        pos_category_tfidf = tf_idf_vectorizer.transform(pos_risk)

        # Assign each document to the category whose positive keywords it is
        # most similar to.
        cos_sim_pos = cosine_similarity(data_tfidf, pos_category_tfidf)
        pos_res = np.argmax(cos_sim_pos, axis=1)

        if keyword_type == 'both':
            # Penalise similarity to a category's negative keywords.
            neg_category_tfidf = tf_idf_vectorizer.transform(neg_risk)
            cos_sim_neg = cosine_similarity(data_tfidf, neg_category_tfidf)
            cos_sim_both = cos_sim_pos - cos_sim_neg
            both_res = np.argmax(cos_sim_both, axis=1)

        pos_result = {}
        both_result = {}

        for i in data_index:
            pos_result[data_index[i]] = category_index[pos_res[i]]
            if keyword_type == 'both':
                both_result[data_index[i]] = category_index[both_res[i]]

        return pos_result, both_result
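
The keywords argument has the shape produced by get_keywords above: category -> {'pos': {word: weight}, 'neg': {word: weight}}. A minimal invocation sketch with made-up data (the class name comes from the snippet above; the commented result is only what one would expect, not an actual run):

keywords = {'credit': {'pos': {'loan': 1.2, 'default': 0.9},
                       'neg': {'weather': -0.7}},
            'market': {'pos': {'stock': 1.1, 'index': 0.8},
                       'neg': {'loan': -0.4}}}
data_dict = {'doc1': 'loan default rates rose sharply',
             'doc2': ['stock', 'index', 'futures']}

clf = CosineSimilarity()
pos_result, both_result = clf.tf_idf_classification(data_dict, keywords)
# expected: pos_result ~ {'doc1': 'credit', 'doc2': 'market'}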