Example no. 1
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()

    # gamma : float, optional (default='auto') 2**-15 ~ 2**3
    #     Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
    #     If gamma is 'auto' then 1/n_features will be used instead.
    # C : float, optional (default=1.0) 2**-5 ~ 2**15
    #     Penalty parameter C of the error term.
    # kernel : string, optional (default='rbf')
    #     Specifies the kernel type to be used in the algorithm.
    #     It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
    #     a callable.
    #     If none is given, 'rbf' will be used. If a callable is given it is
    #     used to pre-compute the kernel matrix from data matrices; that matrix
    #     should be an array of shape ``(n_samples, n_samples)``.
    # degree : int, optional (default=3)
    #     Degree of the polynomial kernel function ('poly').
    #     Ignored by all other kernels.
    # coef0 : float, optional (default=0.0)
    #    Independent term in kernel function.
    #    It is only significant in 'poly' and 'sigmoid'.
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'rbf',
        'degree': 3,
        'coef0': 0.0
    }
    do_log = False
    do_norm = None
    do_bern = True
    ##..................................#
    # get vocabulary
    vocabulary = get_vocabulary(strategy_instance)
    # get y_train
    y_train = get_y_train(strategy_instance)
    # compare different parameters
    remained_iteration = 399
    for i in range(-5, 16):
        c = 2**i
        parameters['C'] = c
        for j in range(-15, 4):
            parameters['gamma'] = 2**j
            calculate_parameter(strategy_instance, vocabulary, parameters,\
                y_train, do_bern, do_log, do_norm, remained_iteration)
            remained_iteration -= 1
    # debug('y_train:', type(y_train), y_train.shape)
    # clf = strategy_instance.train_svm(parameters, x_train, y_train)
    # log_result(clf, vocabulary, do_bern, do_log, do_norm)
    ##..................................#
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    # assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
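The nested loop above sweeps C over 2**-5 ~ 2**15 and gamma over 2**-15 ~ 2**3 by hand (21 * 19 = 399 combinations, matching remained_iteration). A minimal sketch of the same sweep with scikit-learn's GridSearchCV, run on stand-in data since the feature helpers used above are not shown here:

from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

# Stand-in data; in fool_classifier the features come from the unshown helpers.
X_demo, y_demo = make_classification(n_samples=60, n_features=20, random_state=0)
param_grid = {
    'C': [2**i for i in range(-5, 16)],      # 2**-5 ~ 2**15
    'gamma': [2**j for j in range(-15, 4)],  # 2**-15 ~ 2**3
    'kernel': ['rbf'],
}
grid = GridSearchCV(svm.SVC(), param_grid, cv=3)
grid.fit(X_demo, y_demo)
print(grid.best_params_)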
Example no. 2
def train_target_and_validate_changed_txt(
        test_file,
        modified):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    import sys
    np.set_printoptions(threshold=sys.maxsize)  # print arrays in full; np.nan is not a valid threshold

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()

    with open(test_file, 'r') as test_text:
        test_samples = [line.strip().split(' ') for line in test_text]

    parameters = {
        'C': 1,
        'kernel': 'linear',
        'degree': 3,
        'gamma': 'auto',
        'coef0': 1
    }

    investigate_train_results(
        train(strategy_instance, parameters, test_samples), modified)
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    return strategy_instance  ## NOTE: You are required to return the instance of this class.
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    import sys
    np.set_printoptions(threshold=sys.maxsize)  # print arrays in full; np.nan is not a valid threshold

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()

    with open(test_data, 'r') as test_text:
        test_samples = [line.strip().split(' ') for line in test_text]

    class_0 = strategy_instance.class0
    class_1 = strategy_instance.class1

    # print(full_dict)
    # print(len(full_dict))

    parameters = {
        'C': 0.4,
        'kernel': 'linear',
        'degree': 3,
        'gamma': 'auto',
        'coef0': 0.0
    }

    investigate_train_results(
        train(strategy_instance, parameters, 220, test_samples))
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)

    return strategy_instance  ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    vectorizer = CountVectorizer()
    strategy_instance = helper.strategy()
    x_train = []
    list_x_train = strategy_instance.class0 + strategy_instance.class1
    for i in list_x_train:
        # join each tokenised sample into one document string:
        # CountVectorizer expects an iterable of strings
        x_train.append(' '.join(i))

    print('x_train: \n')
    print(x_train)

    X = vectorizer.fit_transform(x_train)
    print(X.toarray())
    print(vectorizer.get_feature_names())


    y = np.zeros((540, 1), dtype=int)
    y[360:] = 1
    y = y.ravel()  # Convert a multidimensional array to a one-dimensional array
    #print(y)
    print('end of printing y')
Example no. 5
def fool_classifier(test_data):
    with open(test_data, 'r') as file:
        data = [line.strip().split(' ') for line in file]

    strategy_instance = helper.strategy()
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }

    x_train, y_train, word_list = train_generator(strategy_instance.class0,
                                                  strategy_instance.class1)
    clf = strategy_instance.train_svm(parameters, x_train, y_train)

    data_modified(word_list, clf.coef_[0], data)

    with open('modified_data.txt', 'w') as file:
        file.write('\n'.join([' '.join(a) for a in data]))

    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance
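train_generator and data_modified are external helpers that are not shown; the key assumption in Example 5 is that a linear SVM's coef_ row lines up with the vectorizer's feature order, so each weight scores one word. A toy sketch of that mapping (corpus and names are illustrative only):

import numpy as np
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer

docs = ['spam spam offer', 'hello friend', 'offer now buy', 'see you friend']
y = [1, 0, 1, 0]
cv = CountVectorizer()
X = cv.fit_transform(docs)
clf = svm.SVC(kernel='linear').fit(X, y)
weights = clf.coef_.toarray()[0]   # one weight per vocabulary entry
order = np.argsort(weights)[::-1]  # most class-1-indicative words first
print([cv.get_feature_names()[i] for i in order[:3]])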
Example no. 6
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    test_dt = None
    with open(test_data, 'r') as infile:
        test_dt = [line.strip().split(' ') for line in infile]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {
        'C': 1,
        'gamma': 'auto',
        'kernel': 'linear',
        'coef0': 0.0,
        'degree': 3
    }
    lines = [' '.join(line) for line in strategy_instance.class0] \
            + [' '.join(line) for line in strategy_instance.class1]

    # prepare X(samples), y(targets)
    # bag of words vectorizer
    cv = CountVectorizer()
    cv.fit(lines)
    X_train = cv.transform(lines)
    model = strategy_instance.train_svm(parameters, X_train,
                                        np.array([0] * 360 + [1] * 180))

    # sort the coefficients from positive to negative
    top_coef_sorted = np.argsort(model.coef_.toarray()[0])[::-1]
    top_features = np.array(cv.get_feature_names())
    ##..................................#
    modified_list = []

    # go through each test record, delete features in the records
    # which correspond to positive coefficient in the trained model
    for record in test_dt:
        record_new = record
        for coef_index in top_coef_sorted:
            feature = top_features[coef_index]
            record_new = [word for word in record_new if word != feature]

            if len((set(record) - set(record_new)) | \
                   (set(record_new) - set(record))) == 20: # no more modifications
                break

        modified_list.append(record_new)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    with open("modified_data.txt", "w") as new_file:
        for i in modified_list:
            new_file.write(' '.join(i))
            new_file.write('\n')

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
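The break condition above counts distinct tokens added or removed via a set symmetric difference, which is presumably the per-line distance that check_data enforces. The same check as a standalone helper:

def n_changes(original, modified):
    # distinct tokens added or removed between two token lists:
    # the set symmetric difference used in the break condition above
    a, b = set(original), set(modified)
    return len((a - b) | (b - a))

print(n_changes('a b c d'.split(), 'a b x y'.split()))  # -> 4 (c, d out; x, y in)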
Example no. 7
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}

    ##..................................#
    #
    #
    #
    ## Your implementation goes here....#
    #
    #
    #
    ##..................................#

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}

    pre, list_name, test_data1, class_1, class_0 = extract(test_data)
    fool(pre, list_name, test_data1, class_1, class_0)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}
    # initial parameters
    parameters['gamma'] = 'auto'
    parameters['C'] = 100
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['coef0'] = 0.0
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    for i in range(150, 450, 5):
        vectorizer = CountVectorizer(stop_words='english',
                                     min_df=10,
                                     preprocessor=None,
                                     max_features=i)
        try:
            vectorizer.fit(merge(['class-0.txt', 'class-1.txt']))
        except Exception:
            continue
        X_train = vectorizer.transform(merge(['class-0.txt', 'class-1.txt']))
        X_test = vectorizer.transform(open(test_data, 'r'))  # read the file passed in as test_data
        Y_train = ['class-0'] * 360 + ['class-1'] * 180
        Y_test = ['class-1'] * 200
        # print(X_train.shape)
        # print(vectorizer.get_feature_names())
        clf = strategy_instance.train_svm(parameters, X_train, Y_train)
        # print(clf.predict(X_test))
        print('c = ', 100, 'mindf = ', 10, 'i = ', i, end='')
        print(correctrate(clf.predict(X_test), Y_test),
              correctrate(clf.predict(X_train), Y_train))
        # print(X_train)

    ##..................................#
    #
    #
    #
    ## Your implementation goes here....#
    #
    #
    #
    ##..................................#

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    # modified_data='./modified_data.txt'
    # assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
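merge and correctrate are external helpers that are not shown. A minimal correctrate consistent with how it is called above (an assumption, not the original code):

def correctrate(predicted, expected):
    # fraction of predictions that match the expected labels
    hits = sum(1 for p, e in zip(predicted, expected) if p == e)
    return hits / len(expected)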
Example no. 10
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    test_dt = None
    with open(test_data, 'r') as infile:  # read the file passed in as test_data
        test_dt = [line.strip().split(' ') for line in infile]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {
        'C': 2**-5,
        'gamma': 'auto',
        'kernel': 'linear',
        'coef0': 0.0,
        'degree': 3
    }
    lines = [' '.join(line) for line in strategy_instance.class0] \
            + [' '.join(line) for line in strategy_instance.class1]

    cv = CountVectorizer()
    cv.fit(lines)
    X_train = cv.transform(lines)
    model = strategy_instance.train_svm(parameters, X_train,
                                        np.array([0] * 360 + [1] * 180))
    # argsort the dense coefficient row: positions within a sparse matrix's
    # .data array do not correspond to feature/column indices
    top_positive_coef = np.argsort(model.coef_.toarray()[0])[::-1]
    top_features = np.array(cv.get_feature_names())
    ##..................................#
    modified_list = []

    for record in test_dt:
        del_count = 0
        record_new = record
        for feature in top_features[top_positive_coef]:
            if feature in record_new:
                if del_count == 20:
                    break
                record_new = [word for word in record_new if word != feature]
                del_count += 1
        modified_list.append(record_new)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    with open("modified_data.txt", "w") as new_file:
        for i in modified_list:
            new_file.write(' '.join(i))
            new_file.write('\n')

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}
    x_train = strategy_instance.class0 + strategy_instance.class1
    train = []
    for i in range(len(x_train)):
        train.append(' '.join(x_train[i]))

    with open(test_data, 'r') as test1:
        test1 = [line.strip().split(' ') for line in test1]

    test = []
    for i in range(len(test1)):
        test.append(' '.join(test1[i]))

    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    tfv = CountVectorizer()
    tfv.fit(list(train))
    word = tfv.get_feature_names()

    xtrain_tfv = tfv.transform(train)
    xvalid_tfv = tfv.transform(test)
    print('len of word', len(word))

    import numpy as np
    y = np.zeros((540, 1), dtype=int)
    y[360:] = 1
    y = y.ravel()

    from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
    parameters['C'] = 1.0
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 10
    parameters['coef0'] = 1
    clf = strategy_instance.train_svm(parameters, xtrain_tfv, y)

    predict_test = clf.predict(xvalid_tfv)
    count_1 = 0
    count_0 = 0
    for x in predict_test:
        if x == 1:
            count_1 += 1
        if x == 0:
            count_0 += 1
    print('count_1: ', count_1)
    print('count_0: ', count_0)
    #   done

    w = clf.coef_.toarray()
    index = np.where(w[0] < 0)[0]

    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
    dic_w = sorted(dic_w.items(), key=lambda d: d[1])[0:100]

    index = [dic_w[i][0] for i in range(len(dic_w))]

    add_word = []
    for i in index:
        add_word.append(tfv.get_feature_names()[i])

    n = 0
    for i in range(len(test)):
        n = 0
        for w in add_word:
            if n == 20:
                break

            if w not in test1[i]:
                test[i] = test[i] + " " + w

            else:
                continue
            n = n + 1

    with open('./modified_data.txt', 'w') as file:
        for i in range(len(test)):
            file.write(test[i])
            file.write("\n")

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
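The count_0/count_1 tally above can be written more compactly with NumPy; a small sketch over a stand-in prediction vector:

import numpy as np

predict_test = np.array([1, 0, 1, 1, 0])  # stand-in for clf.predict(xvalid_tfv)
print('count_1: ', int((predict_test == 1).sum()))
print('count_0: ', int((predict_test == 0).sum()))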
Example no. 12
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    strategy_instance = helper.strategy()
    parameters = {}
    list_class0 = strategy_instance.class0
    list_class1 = strategy_instance.class1
    vertical_dim_of_trainx = len(list_class0) + len(list_class1)
    paragraphs = [' '.join(para) for para in list_class0 + list_class1]
    y_train = [0] * len(list_class0) + [1] * len(list_class1)
    x_vector = TfidfVectorizer(token_pattern=r'[^\s]+')  # raw string avoids an invalid-escape warning
    x_train = x_vector.fit_transform(paragraphs)
    words_bag = x_vector.vocabulary_
    # Looking for the best 'C'
    C_parameter = np.arange(0.01, 1.2, 0.01)
    parameters_for_grid = {'kernel': ['linear'], 'C': C_parameter}
    clf_for_grid = GridSearchCV(svm.SVC(), parameters_for_grid)
    clf_for_grid.fit(x_train, y_train)
    c_best = clf_for_grid.best_params_
    word_list = x_vector.get_feature_names()
    parameters = {
        'kernel': 'linear',
        'C': c_best['C'],
        'degree': 1,
        'coef0': 0,
        'gamma': 'auto'
    }
    clf = strategy_instance.train_svm(parameters, x_train, y_train)
    weight_list = clf.coef_.toarray().tolist()[0]
    for i in range(len(weight_list)):
        if weight_list[i] > 0:
            weight_list[i] = weight_list[i] * 2
    x_data, data_list = transform_data(test_data, words_bag)
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    dict_for_word = {}
    for idx in range(len(word_list)):
        dict_for_word[word_list[idx]] = weight_list[idx]
    sorted_dict_for_word = sorted(dict_for_word.items(), key=lambda x: x[1])
    reversed_dict_for_word = sorted(dict_for_word.items(),
                                    key=lambda x: x[1],
                                    reverse=True)
    list_for_test_dict = []
    sorted_test_dict = []
    rsorted_test_dict = []
    for idx in range(len(data_list)):
        paragraph_list = data_list[idx]
        test_data_dict = {}
        for word in paragraph_list:
            if word in dict_for_word:
                test_data_dict[word] = dict_for_word[word]
        list_for_test_dict.append(test_data_dict)
        sorted_test_dict.append(
            sorted(test_data_dict.items(), key=lambda x: x[1]))
        rsorted_test_dict.append(
            sorted(test_data_dict.items(), key=lambda x: x[1], reverse=True))

    for i in range(len(list_for_test_dict)):
        time = 20
        s_j = 0
        add_index = 0
        while time > 0:
            if sorted_dict_for_word[add_index][1] + rsorted_test_dict[i][s_j][1] > 0:
                rm_all(data_list[i], (rsorted_test_dict[i][s_j][0]))
                time -= 1
                s_j += 1
            else:
                if sorted_dict_for_word[add_index][0] not in data_list[i]:
                    data_list[i].append(sorted_dict_for_word[add_index][0])
                    time -= 1
                add_index += 1
    with open('modified_data.txt', 'w') as f:
        for i in data_list:
            f.write(' '.join(i) + '\n')
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    modify_x, m_list = transform_data('modified_data.txt', words_bag)
    return strategy_instance
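transform_data and rm_all are defined elsewhere; a minimal rm_all consistent with how the loop above calls it, removing every occurrence of a word from a token list in place (an assumed helper, not the original):

def rm_all(tokens, word):
    # remove every occurrence of `word` from the token list, in place
    tokens[:] = [t for t in tokens if t != word]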
Example no. 13
def fool_classifier(test_data): ## Please do not change the function definition...

    strategy_instance = helper.strategy()

    parameters = {
        'kernel': 'linear',
        'C': 1.0,
        'gamma': 'auto',
        'degree': 3,
        'coef0': 0.0
    }

    # generate training dictionary
    training_set = set()
    for line in strategy_instance.class0:
        for token in line:
            training_set.add(token)
    for line in strategy_instance.class1:
        for token in line:
            training_set.add(token)
    dictionary = {}
    for token in training_set:
        dictionary[token] = 0
    dictionary_index = list(dictionary.keys())

    # generate x_train and y_train
    x = []
    y = []


    for line in strategy_instance.class0:
        tmp_dic = copy.deepcopy(dictionary)
        for token in line:
            tmp_dic[token] += 1
        x.append(list(tmp_dic.values()))
        y.append(0)
    for line in strategy_instance.class1:
        tmp_dic = copy.deepcopy(dictionary)
        for token in line:
            tmp_dic[token] += 1
        x.append(list(tmp_dic.values()))
        y.append(1)
    x = np.array(x)
    #  vector of n_sample that contains the token
    ti = []
    for col in range(len(x[0])):
        count = 0
        for row in range(len(x)):
            if x[row][col] >= 1:
                count += 1
        ti.append(count)

    # compute tf-idf to generate x_train
    tf_vector = []
    idf_vector = []
    for i in range(len(x)):
        tf = []
        idf = []
        row_sum = sum(x[i])
        for j in range(len(x[0])):
            tf.append(x[i][j] / row_sum)
            idf.append(math.log(x.shape[0] / (ti[j] + 1), 2))
        tf_vector.append(tf)
        idf_vector.append(idf)

    x_train = np.array(tf_vector) * np.array(idf_vector)
    y_train = np.array(y)

    # training
    clf = strategy_instance.train_svm(parameters, x_train, y_train)

    # get the weights of features and sort them
    support_vector = clf.coef_[0]
    weighted_dictionary = copy.deepcopy(dictionary)
    for i in range(len(dictionary_index)):
        weighted_dictionary[dictionary_index[i]] = support_vector[i]
    sorted_weight = sorted(weighted_dictionary.items(), key=lambda x: x[1], reverse=False)
    
    ## modify test.txt based on sorted_weight
    with open(test_data, 'r') as f:
        data = [line.strip().split(' ') for line in f]
    for n_line in range(len(data)):
        count = 0
        while count < 20:
            for i in range(len(sorted_weight)):
                if sorted_weight[-i-1][0] in data[n_line]:
                    tmp = [word for word in data[n_line] if word != sorted_weight[-i-1][0]]
                    data[n_line] = tmp
                    count += 1
                    break
            else:
                # no weighted word left to delete: stop, so the while-loop
                # terminates on lines with fewer than 20 vocabulary words
                break
    # for n_line in range(len(data)):
    #     count = 0
    #     while count < 10:
    #         for i in range(len(sorted_weight)):
    #             if sorted_weight[i][0] not in data[n_line]:
    #                 data[n_line] = data[n_line] + [sorted_weight[i][0]]
    #                 count += 1
    #                 break
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory
    modified_data = './modified_data.txt'
    with open(modified_data, "w") as f:
        for line in data:
            f.write(" ".join(sorted(line)) + "\n")

    # #### my test script########################
    # with open(modified_data,'r') as f:
    #     data=[line.strip().split(' ') for line in f]
    # x = []
    # for line in data:
    #     tmp_dic = copy.deepcopy(dictionary)
    #     for token in line:
    #         if token in tmp_dic:
    #             tmp_dic[token] += 1
    #     x.append(list(tmp_dic.values()))
    # x_test = np.array(x)
    # y_test = np.array([0 for _ in range(len(data))])
    # print(clf.predict(x_test))
    # print(clf.predict(x_test).shape)
    # print(clf.score(x_test,y_test))
    # ########################################




    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance
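Example 13 hand-rolls TF-IDF with tf = count / row_sum and idf = log2(n_docs / (df + 1)). Note that scikit-learn's TfidfVectorizer uses a different, smoothed formula by default (idf = ln((1 + n_docs) / (1 + df)) + 1, followed by l2 row normalization), so the two pipelines produce different matrices. A quick way to inspect sklearn's version:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['aa bb aa', 'bb cc', 'aa cc cc']
tfidf = TfidfVectorizer(smooth_idf=True, norm='l2')
X = tfidf.fit_transform(docs)
print(tfidf.idf_)                # ln((1 + n) / (1 + df)) + 1 per feature
print(np.round(X.toarray(), 3))  # l2-normalized rows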
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {
        'gamma': 'auto',
        'C': 0.02,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }

    # Clear modified_data.txt in case it already contains content
    modified_data = './modified_data.txt'
    f = open(modified_data, "w")
    f.close()
    ##..................................#
    # first step open strategy().class0 and strategy().class1
    class0_data = strategy_instance.class0  # type is list, 2 dimensions
    class1_data = strategy_instance.class1
    # second step open test_data file

    with open(test_data, 'r') as test_d:
        test = test_d.read()
    # transform class0_data (a list) into dictionaries and label them
    features_and_labels = []
    store_test_features = []
    for line in class0_data:
        #         if(line != ''):
        class0_dict = get_freq_of_tokens_from_list(line)
        features_and_labels.append((class0_dict, 0))
    for line in class1_data:
        #         if(line != ''):
        class1_dict = get_freq_of_tokens_from_list(line)
        features_and_labels.append((class1_dict, 1))
        #     print('len(features_and_labels)', len(features_and_labels))

    # transform test (the txt file) into dictionaries and label them
    for line in test.split('\n'):
        if (line != ''):
            test_dict = get_freq_of_tokens_from_file(line)
            store_test_features.append((test_dict, 1))
    # transform features_and_labels into x_train (matrix) and y_train (array)
    encoder = LabelEncoder()
    vectorizer = DictVectorizer(dtype=int, sparse=True)

    x_train, y_train = list(zip(*features_and_labels))
    x_train = vectorizer.fit_transform(x_train)
    y_train = encoder.fit_transform(y_train)

    x_test, y_test = list(zip(*store_test_features))
    x_test = vectorizer.transform(x_test)
    #     print(x_train.shape)

    # Train the SVM (support vector machine) on the training data (540, 5178)
    training = strategy_instance.train_svm(parameters, x_train, y_train)
    #     print(training)
    # result = training.predict(x_test)  # this is the test data's predict result
    # this function maps each word to its frequency of appearance
    store_each_test_sample_frequency_dict_list = test_samples_word_to_frequency(
        x_test, vectorizer)

    # This function uses the SVM training result to find each feature's weight,
    # mapping each word to its weight
    sorted_training_data_word_weight_dict = training_data_coeffecient(
        training, vectorizer)

    # This function maps each test sample's words to their weights
    sorted_whole_test_sample_word_weight_dict_list = dict_between_test_words_and_weight(
        store_each_test_sample_frequency_dict_list,
        sorted_training_data_word_weight_dict, vectorizer)

    ## Your implementation goes here....#
    # Training-data word weights as a 2-dimensional list, used to compare
    # against each test sample's word weights
    sorted_training_data_word_weight_list = dict_to_list(
        sorted_training_data_word_weight_dict)

    # Test-data word weights as a 3-dimensional list, used to compare
    # against the training samples' word weights
    sorted_whole_test_sample_word_weight_3_dimensions_list = []
    for each_sample in sorted_whole_test_sample_word_weight_dict_list:
        sorted_whole_test_sample_word_weight_3_dimensions_list.append(
            dict_to_list(each_sample))

    final_result = final_add_or_delete(
        sorted_whole_test_sample_word_weight_3_dimensions_list,
        sorted_training_data_word_weight_list, test)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    with open(modified_data, 'w') as modified:
        modified.write(final_result)

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
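get_freq_of_tokens_from_list, get_freq_of_tokens_from_file and the later dict-juggling helpers live outside this snippet; minimal versions consistent with how they are called here (assumptions, not the originals):

from collections import Counter

def get_freq_of_tokens_from_list(tokens):
    # token list -> {token: count}, the dict shape DictVectorizer consumes above
    return dict(Counter(tokens))

def get_freq_of_tokens_from_file(line):
    # one raw text line -> {token: count}
    return dict(Counter(line.strip().split(' ')))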
def fool(pre, list_name, test_data, class_1, class_0):
    strategy = helper.strategy()

    weight_ = list(pre.coef_)[0]
    #print(pre.coef_)
    word_weight = dict.fromkeys(list_name, 0)
    n_w = 0
    for word in list_name:
        word_weight[word] = weight_[n_w]
        n_w += 1
    weight_list = []
    for word, val in word_weight.items():
        weight_list.append([val, word])
    weight_class1 = sorted(weight_list, reverse=True)
    weight_class0 = sorted(weight_list, reverse=False)
    print(weight_class1[:10])
    print(weight_class0[:10])
    len_class_1 = len(class_1)
    len_class_0 = len(class_0)
    ratio = 0
    de_r = 0
    # split the 20-edit budget between deletions and additions
    # according to the class-size ratio
    if len_class_1 < len_class_0:
        r = float(len_class_0 / len_class_1)
        if r < 1.5:
            ratio = 0.7
        elif r < 2:
            ratio = 0.8
        elif r == 2:
            ratio = 0.85
        elif r < 3:
            ratio = 0.9
        else:
            ratio = 0.95
        de_r = int(20 * ratio)
    elif len_class_1 > len_class_0:
        r = float(len_class_1 / len_class_0)
        if r < 1.5:
            ratio = 0.3
        elif r < 2:
            ratio = 0.2
        elif r == 2:
            ratio = 0.15
        elif r < 3:
            ratio = 0.1
        else:
            ratio = 0.05
        de_r = int(20 * ratio)

    with open('log.txt', 'w') as l_f:
        with open('modified_data.txt', 'w') as t_f:
            for t_line in test_data:
                count = de_r
                add_change = []
                delete_change = []

                add_index = 0
                t_delete = []
                t_add = []
                delete = []
                add = []
                #########################################################################
                #---------------delete

                for index in range(len(weight_class1)):
                    if count > 0:
                        if weight_class1[index][1] in t_line:
                            count -= 1
                            delete_change.append(weight_class1[index][1])

                for t_data in t_line:
                    if t_data in delete_change:
                        delete.append(t_data)
                        continue

                    else:
                        t_delete.append(t_data)
                        t_f.write(t_data + ' ')

                #####################################################################################
                # ----------add
                count = 20 - de_r

                for index in range(len(weight_class0)):
                    if count > 0:
                        if weight_class0[index][1] not in t_line:
                            count -= 1
                            add_change.append(weight_class0[index][1])
                            add_index = index
                for a_data in add_change:
                    t_add.append(a_data)
                    add.append(a_data)
                    t_f.write(a_data + ' ')
                total = t_add + t_delete

                ####################################################################################################
                #######----------check add

                T = set(total)
                L = set(t_line)
                if len((T - L) | (L - T)) < 20:
                    for index in range(add_index, len(weight_class0)):
                        T = set(total)
                        if len((T - L) | (L - T)) > 20:
                            break
                        else:
                            total.append(weight_class0[index][1])
                            T = set(total)
                            if len((T - L) | (L - T)) <= 20:
                                add.append(weight_class0[index][1])
                                t_f.write(weight_class0[index][1] + ' ')

                # print('total', total_change)
                # print('c0', class_0_list)
                # print('c1', class_1_list)
                # print(len(set(T - L)))

                t_f.write('\n')
                l_f.write('-----------' + 'delete:' + '\n' + str(set(delete)) +
                          '\n')
                l_f.write('-----------' + 'add:' + '\n' + str(set(add)) + '\n')
                l_f.write('\n')
Example no. 16
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()

    ########################### define parameter ###########################
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }

    ########################### feature extraction #############################
    # debug_matrix('class0', strategy_instance.class0)
    # debug_matrix('class1', strategy_instance.class1)
    y_train = get_y_train(strategy_instance)
    # debug('y_train =\n', y_train)

    x_train, vectorizer = get_x_train(strategy_instance)
    # debug('x_train =\n', x_train)

    ############################## train model #################################
    clf = strategy_instance.train_svm(parameters, x_train, y_train)
    # debug(clf)

    # grid search
    # param_range = [2**i for i in range(-5, 16)]
    # param_grid = [{'C': param_range, 'kernel': ['linear']}]
    # grid = GridSearchCV(clf_start, param_grid)
    # grid.fit(x_train,y_train)
    # clf = grid.best_estimator_

    vocabulary = vectorizer.get_feature_names()
    # debug('vocabulary =\n', vocabulary)

    weight_list = clf.coef_.tolist()[0]
    # debug('weight_list =\n', weight_list)

    # ############################# modify file ##############################
    modified_data = './modified_data.txt'
    #read file
    test_matrix = read_test_matrix(test_data)

    # get modified matrix
    modified_matrix = []
    for test_vector in test_matrix:
        modified_vector = get_modified_vector(\
                test_vector, vocabulary, weight_list, vectorizer)
        modified_matrix.append(modified_vector)

    # write file
    write_modified_matrix(modified_matrix, modified_data)

    ################################## test  ###################################
    # show_test_result(clf, vectorizer)
    assert strategy_instance.check_data(test_data, modified_data)

    return strategy_instance
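get_y_train and get_x_train are project helpers not included in this listing; minimal versions consistent with how Example 16 uses their return values (sketches under that assumption, not the original helpers):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

def get_y_train(strategy_instance):
    # label 0 for every class0 sample, 1 for every class1 sample
    return np.array([0] * len(strategy_instance.class0) +
                    [1] * len(strategy_instance.class1))

def get_x_train(strategy_instance):
    # bag-of-words matrix over both classes, plus the fitted vectorizer
    lines = [' '.join(l) for l in
             strategy_instance.class0 + strategy_instance.class1]
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(lines), vectorizer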
def extract(test_data1):
    strategy = helper.strategy()
    #class0 = strategy.class0
    # class1 = strategy.class1
    class_0 = strategy.class0
    class_1 = strategy.class1
    #---------------------------------------------------------------
    # class_0_a = []
    # class_1_a = []
    # count_0 = 360
    # count_1 = 180
    # while count_0:
    #     count_line = randint(4,8)
    #     t = []
    #     while count_line:
    #         line_index = randint(0,359)
    #         index_data = randint(0, len(class0[line_index]) - 20)
    #         for index in range(index_data, index_data + 20):
    #             t.append(class0[line_index][index])
    #         count_line -= 1
    #
    #     class_0_a.append(t)
    #     count_0 -= 1
    # while count_1:
    #     count_line = randint(4,8)
    #     t = []
    #     while count_line:
    #         line_index = randint(0,179)
    #         index_data = randint(0, len(class1[line_index]) - 20)
    #         for index in range(index_data,index_data + 20):
    #             t.append(class1[line_index][index])
    #         count_line -=1
    #
    #     class_1_a.append(t)
    #     count_1 -= 1
    #
    # class_0 = class0 + class_0_a
    # class_1 = class1 + class_1_a
    # print(len(class_0))
    # print(len(class_1))
    #--------------------------------------------------------------------------

    with open(test_data1, 'r') as test_file:
        test_data = [line.strip().split(' ') for line in test_file]

    vectorizer = TfidfVectorizer(max_features=5720,
                                 use_idf=True,
                                 norm='l2',
                                 analyzer='word',
                                 token_pattern=r'[^\s]+')
    #class_0 = class_0[int(len(class_0)/2):]
    # data = vectorizer.fit_transform([' '.join(line) for line in class_0 + class_1 + test_data])
    data = vectorizer.fit_transform(
        [' '.join(line) for line in class_0 + class_1])
    #   data1 = vectorizer.fit_transform([' '.join(line) for line in class_1])
    print(data.toarray())
    # transform, not fit_transform: refitting would rebuild the vocabulary
    # and break column alignment with the trained weights
    t_data = vectorizer.transform([' '.join(line) for line in test_data])

    parameters = {
        'gamma': 'auto',
        'C': 1,
        'kernel': 'linear',
        'degree': 0,
        'coef0': 0
    }

    x_train = data.toarray()[:len(class_0 + class_1)]
    y_train = [0] * len(class_0) + [1] * len(class_1)
    pre = strategy.train_svm(parameters, x_train, y_train)

    #######################################################
    #print(pre.predict(data.toarray()[len(class_0+class_1):]))
    # total = pre.predict(data.toarray()[len(class_0+class_1):])
    # one = [_ for _ in total if _==1]
    # print(len(one)/len(total))
    ###############################################################
    #final_predict = pre.predict(data.toarray()[len(class_0 + class_1):])
    # final_predict = pre.predict(t_data.toarray()[:])
    list_name = vectorizer.get_feature_names()
    return pre, list_name, test_data, class_1, class_0
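The transform fix above matters: calling fit_transform again on the test lines rebuilds the vocabulary from the test data, so columns no longer line up with the weights learned on the training matrix. A tiny demonstration:

from sklearn.feature_extraction.text import TfidfVectorizer

v = TfidfVectorizer(token_pattern=r'[^\s]+')
v.fit(['alpha beta', 'beta gamma'])
print(sorted(v.vocabulary_))              # ['alpha', 'beta', 'gamma']
print(v.transform(['beta delta']).shape)  # (1, 3): same columns, 'delta' ignored
v.fit(['beta delta'])                     # refitting replaces the vocabulary
print(sorted(v.vocabulary_))              # ['beta', 'delta']: columns shifted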
Example no. 18
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()

    ############################# define parameter #############################
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }
    # gamma : float, optional (default='auto') 2^-15 ~ 2^3
    #     Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
    #     If gamma is 'auto' then 1/n_features will be used instead.
    # C : float, optional (default=1.0) 2^-5 ~ 2^15
    #     Penalty parameter C of the error term.
    # kernel : string, optional (default='rbf')
    #     Specifies the kernel type to be used in the algorithm.
    #     It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
    #     a callable.
    #     If none is given, 'rbf' will be used. If a callable is given it is
    #     used to pre-compute the kernel matrix from data matrices; that matrix
    #     should be an array of shape ``(n_samples, n_samples)``.
    # degree : int, optional (default=3)
    #     Degree of the polynomial kernel function ('poly').
    #     Ignored by all other kernels.
    # coef0 : float, optional (default=0.0)
    #    Independent term in kernel function.
    #    It is only significant in 'poly' and 'sigmoid'.

    ############################### train data #################################
    debug_matrix('class0', strategy_instance.class0)
    debug_matrix('class1', strategy_instance.class1)
    # get vocabulary
    vocabulary = get_vocabulary(strategy_instance)
    debug('vocabulary =\n', vocabulary)
    # get y_train
    y_train = get_y_train(strategy_instance)
    debug('y_train =\n', y_train)
    # get x_train
    x_train = get_x_train(strategy_instance, vocabulary)
    debug('x_train =\n', x_train)
    # training
    clf = strategy_instance.train_svm(parameters, x_train, y_train)

    ############################### modify file ################################
    # generate weight_table corresponded by vocabulary
    # weight_table = clf.coef_.tolist()[0]
    # class0_vocabulary = []
    # for i in range(2):
    #     class0_word = vocabulary[weight_table.index(sorted(weight_table)[i])]
    #     class0_vocabulary.append(class0_word)
    # class1_vocabulary = []
    # for i in range(2):
    #     class1_word = vocabulary[weight_table.index(sorted(weight_table)[-i - 1])]
    #     class1_vocabulary.append(class1_word)
    # # debug(vocabulary)
    # # debug(weight_table)
    # debug(class0_vocabulary)
    # debug(class1_vocabulary)
    # # read file
    # with open('./test_data.reduced','r') as test_data_file:
    #     test_data_matrix=[line.strip().split(' ') for line in test_data_file]
    # # debug(test_data_matrix)
    # # generate weight by index
    # # exchange_data_matrix = [
    # #       sample_exchange_data = [
    # #           word_exchange_data = [
    # #               [weight, index, word_test_data]]]]
    # # Note: word_exchange_data is sorted by weight
    # exchange_data_matrix = []
    # for sample_test_data in test_data_matrix:
    #     sample_exchange_data = []
    #     for index in range(len(sample_test_data)):
    #         word_test_data = sample_test_data[index]
    #         try:
    #             weight = weight_table[vocabulary.index(word_test_data)]
    #         except ValueError:
    #             weight = 0
    #         word_exchange_data = [weight, index, word_test_data]
    #         sample_exchange_data.append(word_exchange_data)
    #     sample_exchange_data = sorted(sample_exchange_data,key=lambda l:l[0])
    #     exchange_data_matrix.append(sample_exchange_data)
    # # debug(exchange_data_matrix)
    # # generate the index of class1 feature word
    # class1_feature_word_matrix = []
    # for vector_exchange in exchange_data_matrix:
    #     vector_feature = []
    #     for word_exchange in vector_exchange[0:2]:
    #         # vector_feature.append(word_exchange[1])
    #         pass
    #     class1_feature_word_matrix.append(vector_feature)
    # # debug(class1_feature_word_matrix)
    # # write modified data
    # with open('./modified_data.txt', 'w') as modiefied_data_file:
    #     for i in range(len(test_data_matrix)):
    #         modiefied_data_list = test_data_matrix[i][:]
    #         for j in range(len(class1_feature_word_matrix[i])):
    #             index = class1_feature_word_matrix[i][j]
    #             modiefied_data_list[j] = class0_vocabulary[i]
    #         modiefied_data_str = ' '.join(modiefied_data_list)
    #         # debug(modiefied_data_str)
    #         modiefied_data_file.write(modiefied_data_str + '\n')

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    # assert strategy_instance.check_data(test_data, modified_data)
    print_test(clf, vocabulary)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
Example no. 19
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    x_train = strategy_instance.class0 + strategy_instance.class1

    import numpy as np
    y = np.zeros((540, 1), dtype=int)
    y[360:] = 1
    y = y.ravel()  # Convert a multidimensional array to a one-dimensional array

    #print(y)
    #print('end of printing y')

    def createVocabList(dataSet):
        vocabSet = set()
        for document in dataSet:
            vocabSet = vocabSet | set(document)
        return list(vocabSet)

    def setOfWords2Vec(vocabSet, inputSet):
        returnVec = [0] * len(vocabSet)
        for word in inputSet:
            if word in vocabSet:
                returnVec[vocabSet.index(word)] = 1
        return returnVec

    data = createVocabList(x_train)

    trainAll = []
    for postinDoc in x_train:
        trainAll.append(setOfWords2Vec(data, postinDoc))

    trainAll = np.array(trainAll)
    with open(test_data, 'r') as test1:  # read the file passed in as test_data
        test1 = [line.strip().split(' ') for line in test1]

    testAll = []
    for postinDoc in test1:
        testAll.append(setOfWords2Vec(data, postinDoc))

    parameters = {}
    parameters['C'] = 0.05
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 1
    parameters['coef0'] = 1
    clf = strategy_instance.train_svm(parameters, trainAll, y)

    # -----------------
    #print('end of setting parameters')
    w = clf.coef_
    index = np.where(w[0] < 0)[0]

    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
    dic_w = sorted(dic_w.items(), key=lambda d: d[1])[0:200]

    index = [dic_w[i][0] for i in range(len(dic_w))]

    add_word = []
    for i in index:
        add_word.append(data[i])

    n = 0
    for i in range(len(test1)):
        n = 0
        for w in add_word:
            if n == 20:
                break
            if w not in test1[i]:
                test1[i].append(w)
            else:
                continue
            n = n + 1

    #print('before open')
    file = open('./modified_data.txt', 'w')
    for i in range(len(test1)):
        file.write(" ".join(test1[i]))
        file.write("\n")
    file.close()

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    #print('before assert')
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
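createVocabList/setOfWords2Vec above build 0/1 presence vectors by hand; the same encoding in scikit-learn is CountVectorizer(binary=True) over space-joined samples (a compact equivalent, modulo feature order):

from sklearn.feature_extraction.text import CountVectorizer

docs = ['aa bb bb cc', 'cc dd']
cv = CountVectorizer(binary=True, token_pattern=r'[^\s]+')
print(cv.fit_transform(docs).toarray())  # entries are 0/1 presence, not counts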
Example no. 20
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    test_path = test_data  # keep the original path; test_data is rebound to the parsed lines below
    with open(test_data, 'r') as file:
        test_data = [line.strip().split(' ') for line in file]
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}
    # use Naive Bayes classification to calculate the probabilities P(word|class0) and P(word|class1).
    class0 = strategy_instance.class0
    class1 = strategy_instance.class1
    p_class0 = len(class0) / (len(class0) + len(class1))
    p_class1 = len(class1) / (len(class0) + len(class1))
    class0_dic = {}
    class1_dic = {}
    for i in range(len(class0)):
        for j in range(len(class0[i])):
            if class0[i][j] not in class0_dic:
                class0_dic[class0[i][j]] = 1
            else:
                class0_dic[class0[i][j]] += 1

    for m in range(len(class1)):
        for n in range(len(class1[m])):
            if class1[m][n] not in class1_dic:
                class1_dic[class1[m][n]] = 1
            else:
                class1_dic[class1[m][n]] += 1

    class0_key_list = class0_dic.keys()
    class1_key_list = class1_dic.keys()
    class0_value_list = class0_dic.values()
    class1_value_list = class1_dic.values()

    j_number = len(class1_key_list)
    for k in class0_key_list:
        if k not in class1_key_list:
            j_number += 1

    class0_probability = {}
    class1_probability = {}
    smooth = 1
    for g in class0_key_list:
        class0_probability[g] = (class0_dic[g] +
                                 smooth) / (sum(class0_value_list) + j_number)
    for h in class1_key_list:
        class1_probability[h] = (class1_dic[h] +
                                 smooth) / (sum(class1_value_list) + j_number)
    # Discount words that have a high probability in both class1 and class0
    # by subtracting the other class's probability.
    characteristic_class0 = {}
    characteristic_class1 = {}
    for value in class0_key_list:
        if value in class1_key_list:
            characteristic_class0[
                value] = class0_probability[value] - class1_probability[value]
        else:
            characteristic_class0[value] = class0_probability[value]
    char_class0 = sorted(characteristic_class0.items(),
                         key=lambda d: d[1],
                         reverse=True)
    char_class0 = returntodic(char_class0)
    insert_list = list(char_class0.keys())
    for value in class1_key_list:
        if value in class0_key_list:
            characteristic_class1[
                value] = class1_probability[value] - class0_probability[value]
        else:
            characteristic_class1[value] = class1_probability[value]
    char_class1 = sorted(characteristic_class1.items(),
                         key=lambda d: d[1],
                         reverse=True)
    char_class1 = returntodic(char_class1)
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    for x in range(len(test_data)):
        indx = 0
        fix_list = []
        final_insert_list = []
        for value in insert_list:
            if value not in test_data[x]:
                final_insert_list.append(value)
        for values in test_data[x]:
            if values in char_class1 and values not in final_insert_list:
                fix_list.append(values)
        a = set(fix_list)
        fix_list = [b for b in a]
        if len(fix_list) < 10:
            fix_dic = {}
            for value in fix_list:
                fix_dic[value] = char_class1[value]
            fix_dic = sorted(fix_dic.items(), key=lambda d: d[1], reverse=True)
            fix_dic = returntodic(fix_dic)
            final_list = list(fix_dic.keys())
            count = 0
            for y in range(len(final_list)):
                for z in range(len(test_data[x])):
                    if test_data[x][z] == final_list[y]:
                        index = test_data[x].index(final_list[y])
                        test_data[x][index] = final_insert_list[indx]
                indx += 1
                count += 1
            rest = 20 - count * 2
            for g in range(rest):
                test_data[x].append(final_insert_list[indx])  # index into the list
                indx += 1

        if len(fix_list) >= 10:
            fix_dic = {}
            for value in fix_list:
                fix_dic[value] = char_class1[value]
            fix_dic = sorted(fix_dic.items(), key=lambda d: d[1], reverse=True)
            fix_dic = returntodic(fix_dic)
            final_list = list(fix_dic.keys())
            count = 0
            #            print("The line is: " + str(x) )
            #            print(final_list)
            #            print(test_data[x])
            for y in range(len(final_list)):
                if count != 10:
                    for z in range(len(test_data[x])):
                        if test_data[x][z] == final_list[y]:
                            index = test_data[x].index(final_list[y])
                            test_data[x][index] = final_insert_list[indx]
                    indx += 1
                    count += 1
                else:
                    break

    with open("modified_data.txt", "w") as file:
        for i in range(len(test_data)):
            for j in range(len(test_data[i])):
                file.write(str(test_data[i][j]) + " ")
            file.write("\n")

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_path, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
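The smoothed estimates above implement Laplace-smoothed per-class word likelihoods, P(w|c) = (count(w, c) + 1) / (total_tokens(c) + |V|), where |V| is the union vocabulary size (j_number). As a standalone helper:

def word_likelihood(count_in_class, total_tokens_in_class, vocab_size, smooth=1):
    # Laplace-smoothed P(word | class), as in class0_probability above
    return (count_in_class + smooth) / (total_tokens_in_class + vocab_size)

print(word_likelihood(3, 100, 50))  # (3 + 1) / (100 + 50) ~= 0.0267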
Example no. 21
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}
    #mycodes
    #training model using class_0 and class_1
    class_0 = [' '.join(i) for i in strategy_instance.class0]
    class_1 = [' '.join(i) for i in strategy_instance.class1]
    class_all = class_0 + class_1
    vectorizer = CountVectorizer()
    count = vectorizer.fit_transform(class_all)
    l = [0] * 360 + [1] * 180
    X = count.toarray()
    y = np.array(l)
    #find the best parameter
    param_grid = [{
        'kernel': ['rbf'],
        'gamma': [0.01],
        'C': [1, 10]
    }, {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000]
    }]
    #find the best parameters
    #grid_search = GridSearchCV(svm.SVC(),param_grid,cv=5)
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
    #grid_search.fit(X_train,y_train)
    parameters['C'] = 10
    parameters['kernel'] = 'rbf'
    parameters['gamma'] = 0.01
    parameters['degree'] = 3
    parameters['coef0'] = 1
    #train the model
    clf = strategy_instance.train_svm(parameters, X, y)
    '''
    #predict the rate for original test data
    with open(test_data, 'r') as test:
        t = [line.strip().split(' ') for line in test]
        
    features = vectorizer.get_feature_names()
    d = dict(zip(features, [0] * len(features)))
    predict = []
    for sample in t:
        a = []
        for i in sample:
            if i in d:
                d[i] += 1
        for key in d.keys():
            a.append(d[key])
        predict.append(a)
        for key in d.keys():
            d[key] = 0
    '''

    #change the dataset
    with open(test_data, 'r') as test:
        t = [line.strip().split(' ') for line in test]
    t1 = []
    x = 0
    for line in t:
        original = {}
        for i in line:
            if i not in original:
                original[i] = 1
            else:
                original[i] += 1
        original = sorted(original.items(), key=lambda x: x[1], reverse=True)
        c = 0
        flag = 0
        for i in [
                'david', 'yeltsin', 'replacement', 'mubarak', 'phone',
                'powell', 'wimbledon', 'brazilian', 'title', 'm.', 'bulgaria',
                'confront', 'hosni', 'inspire', 'lap', 'denial', 'benedict',
                'maryland', 'giant', 'midfielder', 'uribe', 'maliki', 'insult',
                'cypriot', 'gyanendra', 'bhutto', 'autonomy', 'apparently',
                'wild', 'jan', 'tunisia', 'social', 'magazine', 'xvi', 'sick',
                'pence', 'migratory', 'multiparty', 'unite', 'postal', 'upcom',
                'universal', 'actress', 'highly', 'colin', 'penalty', 'aol',
                'huber', 'dissolve', 'augusto', 'lauck', 'internal', '22nd',
                'christopher', 'visegrad', 'takeda', 'bratislava', 'fallujah',
                'rush', 'whom', 'hat', 'topple', 'online', 'suspicious',
                'irreparable', 'funeral', 'mbeki', 'karimov', 'heat', 'bar',
                'walid', 'chris', 'these', 'curb', 'vs.', 'institute',
                'unnamed', 'forum', 'reluctant', 'strength', 'tribute',
                'recovery', 'puk', 'zvornik', 'dutroux', 'forma', 'koizumi',
                'guy', 'view', 'plo', 'eighth', 'indict', 'merkel', 'nervous',
                'kamal', 'zionism', 'maleeva', '350', 'sponsor', 'missionary',
                'industrialize', 'reconstruction', 'cheney', 'lynch',
                'grimsby', 'morales', 'constitute', 'congratulate', 'initiate',
                'hansa', 'cancel', 'awareness', 'performance', 'sing', 'savic',
                'cape', 'veto', 'choose', 'abdullahi', 'nathalie', 'jose',
                'contribution', 'wildlife', 'stake', 'francisco', 'gunfight',
                'tibetan', 'levee', 'personal', 'transfer', 'kevin', 'sean',
                'belgian', 'goss', 'wiesner', 'slovenia', 'hardline',
                'indigenous', 'compound', 'contractor', 'trick', 'medal',
                'stick', 'championship', 'voice', 'abdullah', 'rostock',
                'suleiman', '1975', 'really', 'angela', 'nouri', 'j.',
                'society', 'offset', 'reunification', 'levy', 'nationwide',
                'exceed', 'emirate', 'tribe', 'conte', 'ill', 'chancellor',
                'bin', 'bhutan', 'm', 'worship', 'marseille', 'pauli',
                'pinochet', 'flush', 'aggressive', 'atoll', 'delegation',
                'judith', 'thani', 'quickly', 'blessing', 'meadows',
                'kyrgyzstan', 'enrich', 'avenge', 'inc.', 'croatia',
                'moratorium', 'inability', 'surayud', 'plea', 'roe', 'armenia',
                'philippine', 'wang', 'vogts', 'lebed', 'outgoing', 'hiddink',
                'abdel', 'hampshire', 'demolition', 'sundown', 'gnassingbe',
                'google', 'moya', 'ball', 'emanuele', 'sprint', '1971',
                'clayton', '275', 'leash', 'aoki', 'chirac', 'intervene',
                'togolese', 'olsza', 'lower', 'carolina', 'coulthard',
                'hearing', 'optimistic', 'chaudhry', 'andre', 'safin', 'jerry',
                'wheat', 'rallying', 'akram', 'matt', 'solve', '3.8',
                'subsidize', 'dash', 'nationalize', 'motive', 'ira', '280',
                'willingness', 'alex', 'estrada', 'humenne', 'left', 'depress',
                'hymn', 'mob', 'candle', '7/8', 'metal', 'plummet', 'wedding',
                'rebate', 'okuda', 'guitar', 'stab', 'bristol', 'ruutel', 'w.',
                'europeans', 'dealing', 'abnormal', 'golf', 'iftikhar',
                'effigy', 'abstention', 'prevention', 'compromise', 'sphere',
                'calderon', 'fitzgerald', 'ting', 'barroso', 'handful',
                'arlington', 'restoration', 'prove', 'prachanda', 'departure',
                'oz', 'montpelli', 'relegate', 'confinement', 'magdalena',
                'conscience', 'wilma', 'zapatero', 'hosseini', 'clarence',
                'penrose', 'rumor', 'quarterfinal', 'revolt', 'engineering',
                'felipe', 'ransom', 'seventy', '260', 'hyderabad', 'chang',
                'erdogan', 'guus', 'principe', 'gear', 'milf', 'khaled',
                'erosion', 'partition', 'eta', 'atp', 'bossa', 'at&t',
                'anabel', 'pipe', 'advertising', 'ski', 'mediterranean',
                'ernesto', 'botham', 'hitter', 'hole', 'sampras', 'roger',
                'vacancy', 'manhattan', 'salih', 'rid', 'pure', 'ieng',
                '10-year', 'kohl', 'chesnot', 'shetty', 'fifty', 'porter',
                '36th', 'studio', 'patrick', 'buffett', 'justin', 'bellerive',
                'cemetery', 'slovan', 'r.', 'revitalize', 'joseph',
                'significant', 'ideology', 'caldwell', 'acquire', 'lodge',
                'coetzer', 'rba', 'nursultan', 'lg', 'remarkable', 'gonzales',
                'jockey', 'pretoria', 'barry', 'dissent', 'modest', 'ioc',
                'arc', 'scholar', "'d", 'blind', 'lahoud', 'sieg', 'canonica',
                'archipelago', 'kabir', 'hakimi', 'wwf', 'segment', 'falter',
                'prodi', 'ceo', 'haarlem', 'panic', 'rafik', 'cola', 'rauf',
                'amir', 'continuous', 'fortuna', 'staffer', 'scandinavian',
                'kubis', 'retaliation', 'bankruptcy', 'consistent', 'jesus',
                'mashhadani', 'federer', 'proper', 'paedophile', 'hyun',
                'shed', 'repeatedly', 'osama', 'anguilla', 'religion',
                'kilometre', 'debut', 'harsh', '756', 'guiana', 'abubakar',
                'maynard', 'evacuation', 'tactic', 'falkland', 'ad', 'lethal',
                '430', 'barter', 'gratitude', 'centre', 'mercosur', 'hispanic',
                'sympathizer', 'prolific', 'perth', 'bandar', 'retail', 'axum',
                'injection', 'h5n2', 'kuchma', 'annan', 'boris',
                'deterioration', 'myth', '83', 'cannes', 'f.w.', 'salesman',
                'da', 'sanader', 'volkova', 'radulescu', 'stir', '15-member',
                'benzene', 'talha', 'arnold', 'medalist', 'attorney', 'danny',
                '...', 'cotton', 'riel', '2020', 'stein', 'mccormack',
                'incomplete', 'secondhand', 'tool', 'turki', 'youth',
                'reassure', 'lien', 'calm', 'reformist', 'impeachment',
                'tablet', 'elizabeth', 'hamburg', 'korda', 'improvised',
                'addis', 'manuel', 'zelaya', 'postponement', 'jorge', '165',
                'popov', 'citgo', 'powder', 'feyenoord', 'lage', 'web',
                'effect', 'ravine', 'beit', 'justify', 'apologise', 'cocu',
                'broad', 'sole', 'professor', 'nicholas', 'georges', 'nagin',
                'mccain', '1956', 'haga', 'faith', '06-jan', 'clan',
                'rebellion', 'cameraman', 'jazz', 'lukoil', 'junior',
                'motivated', 'erik', 'punishment', 'taylor', 'pearson',
                'nepali', 'forgery', 'ackerman', 'jamie', 'bernd', 'argentina',
                '5,000', 'icy', 'male', 'terrorize', 'rough', '600,000',
                'frederic', '1978', 'aleksandra', 'punjab', 'zardari',
                'offence', 'mini', 'twitter', 'labour', 'mottaki', '286',
                'nwr', 'yoshikawa', 'defender', 'edwin', 'wolfowitz',
                'haringa', 'nazarbayev', 'mainstream', 'richard', 'medina',
                'retreat', 'tower', 'lament', 'baeron', 'lovato', 'hitch',
                'miller', 'milan', 'hans', 'gm', 'grave', 'portsmouth', 'sven',
                'stagger', 'supplier', 'shilpa', 'sarkozy', 'multiple',
                'gunship', 'reshuffle', 'kucera', '110', 'protectorate',
                'forward', 'rescuer', 'stormy', 'aug', 'icac', 'bargain',
                'slalom', 'regulation', 'baghdadi', 'expos', 'fade',
                'advisory', 'ridge', 'henri', '18-man', '6.4', 'koninklijke',
                'agassi', 'garrigue', 'romano', 'astros', 'karami', '199',
                'hossein', 'sussex', 'indebted', 'actor', 'clearly',
                'livestock', 'joyful', 'ass', 'insert', 'apology', 'ghad',
                'jennifer', 'henry', 'veiled', 'transcript', 'manila',
                'tranmere', 'renegotiate', 'assumption', 'java', 'shallah',
                'milton', 'semifinalist', 'faisal', 'lucrative', 'restriction',
                'iva', 'mclaren', 'upswing', 'leopold', 'unocal', 'ion',
                'riddled', 'willem', 'confidence', 'accidentally', 'bakool',
                'gough', 'rabinovich', 'prosecute', 'sisco', 'scoreboard',
                'mary', 'narrowly', 'exhume', 'confess', 'concede', 'burst',
                'chisinau', '195', 'laugh', 'toyota', 'metz', 'karsten',
                'native', 'manufacture', 'reverse', 'eager', 'dining', 'juba',
                'credible', 'backer', 'soft', 'veteran', 'directly', 'siphon',
                'eliminate', 'donation', 'burns', 'greeting', 'leslie',
                'italians', 'amy', 'askar', 'oliver', 'confer', 'bluefin',
                'saeed', 'constructorul', 'interpreter', 'jim', 'sofia',
                'infant', 'basuki', 'bernard', 'favorable', 'exact', 'gwyneth',
                'trainer', 'difficult', '54', 'sultan', 'caraballo', 'hefty',
                'steve', 'phnom', 'kdp', 'verizon', 'facilitate', 'adams',
                'telecom', 'boardman', 'cologne', 'luiz', 'breda', 'martic',
                'application', 'rose', 'puerto', 'subdue', 'disappointed',
                'competent', 'klerk', 'bode', 'garang', 'jewelry', 'lakes',
                'dundee', 'laden', 'character', 'genoa', 'relic', 'sorry',
                've', 'vanuatu', 'sack', 'batticaloa', 'empty', 'gray',
                'wielgus', 'oncin', 'ignacio', 'johnstone', 'gilford',
                'commentary', 'lak', 'martin', 'opec', 'karlsruhe',
                'clearance', 'joaquin', '2.6', 'kai', '86', 'ramadan',
                'deepen', 'majoli', 'holocaust', 'ceremonial', 'u.n', 'film',
                'lumpur', 'kuala', 'beg', 'snap', 'harass', 'celica', 'spla',
                'saran', 'architect', 'composer', 'huot', 'boss', '7.4',
                'partisan', 'countryside', 'solution', 'emile', 'tauziat',
                'compensation', 'define', 'brand', 'representation',
                'barricade', 'groenefeld', 'cattle', 'secede', 'chair',
                'frenchman', 'melinda', 'fragmented', 'sittard', 'antibiotic',
                'exhibition', 'nv', 'balkans', 'artillery', 'alvaro', 'skier',
                'takemura', 'pork', 'monaco', 'graham', 'chemlon', 'briefly',
                'breakaway', 'nour', 'lady', 'batchelor', 'mock', 'dazzle',
                'perform', 'principle', 'baltimore', 'partially', 'automatic',
                'disburse', 'kocinski', 'patriot', 'perez', 'occasion',
                'remnant', 'conspire', 'counsel', 'medvedev', '6-', 'skid',
                'sergeant', 'roque', 'caution', 'jumblatt', 'almere', 'scope',
                'lula', 'stride', 'ringleader', 'cosmonaut', 'stumble', 'nist',
                'invade', 'assignment', 'kumaratunga', 'smoke', 'exploration',
                'apostolic', 'gloucestershire', 'ivo', 'cement', 'razuri',
                'jabaliya', 'teen', 'meshaal', 'guide', 'framework', 'jans',
                'afterward', 'nec', 'minnesota', 'marcus', 'assistant', 'mind',
                'hemisphere', 'seiki', 'kingdom', 'stretch', 'super', 'shadab',
                'uncensored', 'sorensen', 'importance', 'lancashire', 'arroyo',
                'rodriguez', 'taking', 'wood', 'roh', 'delp', 'djindjic',
                'preval', 'vacuum', 'wrongdoing', 'benazir', 'mofaz',
                'vierklau', 'error', 'buyer', 'grower', 'gary', 'petr',
                'gronholm', 'estonian', 'else', 'ghanaian', 'axe', 'arson',
                'victor', 'precaution', 'rene', 'corporate', 'jericho', 'eld',
                'basque', 'crush', 'tsang', 'hillary', 'hoax', '84', 'chela',
                'witschge', 'coffin', 'tim', 'yesterday', 'd.', 'blogger',
                'wipe', 'dechy', 'panis', 'thermal', 'checkpoint', 'nijmegen',
                'stephane', 'bail', 'bride', 'decree', 'tome', 'prix',
                'hudson', 'momentum', 'forget', 'idema', 'northeastern',
                'costello', 'quite', 'exclusive', 'teutenberg', 'submission',
                'constantinople', 'hipc', 'silva', 'aggravate', 'nonetheless',
                'sino', 'ethnicity', 'thabo', 'jalalabad', 'smuggler',
                'marketing', 'memorandum', 'ariane', 'deplete', 'floor',
                'rivalry', 'claire', 'eduard', 'mortar', 'khan', 'eide',
                'robot', 'caller', 'label', 'solitary', 'implementation',
                'barno', 'impeach', 'solo', 'kaluwitharana', 'wrack', 'soul',
                'sharapova', 'crippled', 'evelyn', 'interfax', 'bakara',
                'angry', 'smith', 'cheat', 'caucasus', 'yusuf',
                'congressional', 'egeland', 'blair', '*', 'disabled', 'brad',
                'vitesse', 'debbie', 'marriage', 'vincent', 'courthouse',
                'kph', 'sport', 'airway', 'operational', 'barton', 'outsider',
                'jeff', 'hero', 'royalty', 'barbara', 'ruegen', 'harald',
                'zapatista', 'n.', 'stanford', 'portugal', 'nac', 'litigation',
                'makinen', 'shake', 'tennessee', 'villepin', 'durham',
                '26-year', 'mercantile', 'bennett', 'luis', 'hog',
                'extraordinary', 'sight', 'attach', 'ligi', 'azahari',
                'abortion', 'clock', 'actual', 'fein', 'respective', 'adjust',
                'turin', 'bleed', 'bob', 'greg', 'finland', 'grab',
                'injunction', 'micheletti', 'karol', 'bashir', 'commend',
                'copy', '21st', 'boraine', 'restrictive', 'except', 'cyclone',
                'automobile', 'tire', 'porto', 'culminate', 'hanoun',
                'leonard', 'ingrid', 'richards', 'organisation', 'erase',
                'trans', 'ghazni', 'mukasey', 'element', 'wasim', 'vancouver',
                '19-year', 'earmark', 'oval', '176', 'anderson', 'amend',
                'malbrunot', 'treasure', 'luge', 'shaul', 'kiss', 'empower',
                'ravi', 'harare', 'slovakia', 'vouch', 'behavior', 'ugandan',
                'tilburg', '8th', 'frazier', 'archbishop', 'dmitry', 'samper',
                'mistake', 'itamar', 'junichiro', 'precedent', '83rd', '17th',
                'optimism', 'dell', 'bot', 'vision', 'olivier', 'rag', 'radar',
                'striker', 'h.i.v.', 'wikileaks', 'franz', 'greet', 'baluch',
                'ratner', 'smile', 'ababa', '210', 'container', 'veldman',
                'harvesting', 'tendulkar', 'skating', 'flore', 'hometown',
                'serie', 'penh', 'dick', 'dye', 'incumbent', 'consultation',
                'science', 'pilot', 'oppressed', 'kanyarukiga', 'object',
                'plantation', 'liaon', 'eyewitness', 'karim', 'anwar',
                'palmans', 'yankees', 'baker', 'annex', '111', 'fate',
                'endanger', 'suitable', 'mehrtens', 'famous', 'dominguez',
                'spin', 'jabalya', 'carol', 'howes', 'zimbabwean', 'drc',
                'moo', 'oic', 'impasse', "o'brien", 'moussaoui', 'rodrigo',
                'bedside', '5.1', 'jemaah', 'ashore', 'accuracy', 'gloria',
                'guerrero', 'preparation', 'belarus', 'publishing', '48',
                'gujral', 'accra', 'marie', 'breed', 'najaf', 'craig',
                'courage', 'brazilians', 'hunter', '1972', 'netanyahu',
                'ortiz', 'pickens', 'cage', 'enqvist', 'larijani', 'zacarias',
                'poisoning', 'russell', 'firewood', 'credibility', 'midway',
                'provider', '87', 'dark', 'apartheid', 'lawlessness',
                'armenian', 'bourlet', 'adamkus', 'mutola', 'tiananmen',
                'incur', 'inter', 'initial', 'publicize', '103', 'fellow',
                'achievement', 'tarango', 'cable', 'koert', 'starve', 'zambia',
                'administrator', 'erkinbayev', 'lithuanian', '941', 'mike',
                'reaffirm', 'sinn', 'motorcycle', 'aig', 'northeast',
                'spiritual', 'cheer', 'nova', 'casey', 'embark', 'kharrazi',
                'akerson', 'fiery', 'nangarhar', 'irregularity', 'silver',
                'inacio', 'pire'
    ]
    for line in t:
        # count word frequencies in the line, most frequent first
        original = {}
        for i in line:
            if i not in original:
                original[i] = 1
            else:
                original[i] += 1
        original = sorted(original.items(), key=lambda x: x[1], reverse=True)
        c = 0
        flag = 0
        # first pass: delete up to 20 of the class-1-indicative words
        for i in target_words:
            if i in line:
                line = [j for j in line if j != i]
                c += 1
            if c == 20:
                t1.append(line)
                flag = 1
                break
        if flag:
            continue

        # second pass: if fewer than 20 words were deleted, remove the most
        # frequent remaining words until 20 modifications have been made
        for i, _ in original:
            if i not in target_words:
                line = [j for j in line if j != i]
                c += 1
                if c == 20:
                    t1.append(line)
                    break
        else:
            # fewer than 20 modifications were possible; keep the line anyway
            # so the output stays line-aligned with the input
            t1.append(line)

    #write the data to a modified file
    with open('modified_data.txt', 'w') as file:
        for line in t1:
            file.write(' '.join(line))
            file.write('\n')

    #predict the data
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
Esempio n. 22
0
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    import copy
    import numpy as np
    from sklearn.preprocessing import LabelEncoder
    from sklearn.feature_extraction import DictVectorizer

    # get_freq_of_tokens() is assumed to be defined elsewhere in the original
    # file (it returns a token -> frequency dict for one line)
    strategy_instance = helper.strategy()
    features_and_labels = []
    for i in (strategy_instance.class0):
        tokens = get_freq_of_tokens(i)
        features_and_labels.append((tokens, 0))
    for i in (strategy_instance.class1):
        tokens = get_freq_of_tokens(i)
        features_and_labels.append((tokens, 1))
    encoder = LabelEncoder()
    vectorizer = DictVectorizer(dtype=int, sparse=True)
    x, y = list(zip(*features_and_labels))
    x = vectorizer.fit_transform(x)
    y = encoder.fit_transform(y)
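    # x: sparse token-frequency matrix from DictVectorizer; y: 0/1 labels
    # (class0 samples first, then class1)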

    parameters = {
        'C': 10.0,
        'coef0': 0.0,
        'degree': 3,
        'gamma': 'auto',
        'kernel': 'linear'
    }

    #x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, random_state=1, train_size=0.8)

    clf = strategy_instance.train_svm(parameters, x, y)

    coef = clf.coef_
    # print(coef)
    data = coef.data.tolist()
    idx = coef.indices.tolist()


    words = []
    for i in range(len(data)):
        words.append((idx[i], data[i]))
    words = sorted(words, key=lambda x: x[1])
    class0_dict = {}
    class1_dict = {}
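    # for this linear SVM, negative weights pull predictions towards class 0
    # and positive weights towards class 1, so split the features by sign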
    for i in range(len(words)):
        if words[i][1] < 0:
            class0_dict[words[i][0]] = abs(words[i][1])
        else:
            class1_dict[words[i][0]] = abs(words[i][1])

    word_0 = np.array(
        sorted(class0_dict.items(), key=lambda x: x[1], reverse=1)[:100])
    word_1 = np.array(
        sorted(class1_dict.items(), key=lambda x: x[1], reverse=1)[:100])
    word_0_index = word_0[:, 0].tolist()
    word_1_index = word_1[:, 0].tolist()
    # for i in word_0_index:
    #     print(vectorizer.feature_names_[int(i)])
    # print(word_0)

    class0_weight = []
    class1_weight = []
    # for_now0 = []
    # for i in word_0_index:
    #     for_now0.append(vectorizer.feature_names_[int(i)])
    # print(for_now0)
    # for_now1 = []
    # for i in word_1_index:
    #     for_now1.append(vectorizer.feature_names_[int(i)])
    # print(for_now1)

    with open('modified_data.txt', "w") as modified_data:
        with open(test_data, "r") as test:
            for line in test:
                # reset the per-line weight lists (accumulating entries across
                # lines would skew the sorting below)
                class0_weight = []
                class1_weight = []
                l = line.strip().split(' ')
                global l_mod_top10
                l = list(set(l))
                l_new = copy.deepcopy(l)
                for j in range(len(l)):
                    if l[j] not in vectorizer.feature_names_:
                        continue
                    else:
                        index = vectorizer.feature_names_.index(l[j])
                        if index in class0_dict.keys():
                            class0_weight.append((index, class0_dict[index]))
                        if index in class1_dict.keys():
                            class1_weight.append((index, class1_dict[index]))
                class0_weight = sorted(class0_weight,
                                       key=lambda x: x[1],
                                       reverse=True)
                class1_weight = sorted(class1_weight,
                                       key=lambda x: x[1],
                                       reverse=True)
                # print(class0_weight)
                # print(class1_weight)
                # print(l_new)
                l_mod = []
                for m in range(len(class1_weight)):
                    if class1_weight[m][
                            0] not in word_0_index:  #and class1_weight[m][0] in word_1_index:
                        l_mod.append(vectorizer.feature_names_[int(
                            class1_weight[m][0])])
                if len(l_mod) >= 10:
                    l_mod_top10 = l_mod[:10]
                else:
                    l_mod_top10 = copy.deepcopy(l_mod)

                count = 0
                original_len = len(l_new)
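                # 'in_it' is defined elsewhere in the original file; from its
                # use here it presumably keeps only words NOT in the global
                # l_mod_top10, e.g. (sketch): def in_it(w): return w not in l_mod_top10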
                l_new = list(filter(in_it, l_new))
                diff = original_len - len(l_new)
                # print(diff)
                count_1 = diff
                for n in range(len(l_mod)):
                    if count_1 < 20:
                        for x in range(count, len(word_0_index)):
                            aa = vectorizer.feature_names_[int(
                                word_0_index[x])]
                            if aa not in l_new:
                                l_new.append(aa)
                                count += 1
                                break
                            else:
                                count += 1
                        count_1 += 1
                # print(count_1)
                while count_1 < 20:
                    print(count_1)
                    for x in range(count, len(word_0_index)):
                        if vectorizer.feature_names_[int(
                                word_0_index[x])] not in l:
                            l_new.append(vectorizer.feature_names_[int(
                                word_0_index[x])])
                        count += 1
                    count_1 += 1

                p = " ".join(str(i) for i in l_new)
                modified_data.write(p + '\n')
                # break

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.

    # result = clf.predict(modified_data).tolist()
    # print(result.count(1) / len(result))
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
Esempio n. 23
0
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    
    
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    strategy_instance=helper.strategy() 
    import numpy as np
    y = np.zeros((360,1),dtype=int)  # np.int was removed from NumPy
    y[180:] = 1
    y = y.ravel()
    prng = np.random.RandomState(233233)
    # sample 180 class-0 lines by index (np.random.choice cannot sample
    # directly from a ragged list of token lists on recent NumPy)
    idx0 = prng.choice(len(strategy_instance.class0), 180)
    train0 = [strategy_instance.class0[i] for i in idx0]
    x_trainAll = train0 + strategy_instance.class1
    
    def createVocabList(dataSet):
        vocabSet=set([])
        for document in dataSet:
            vocabSet=vocabSet|set(document)
        return list(vocabSet)
    
    
    def setOfWords2Vec(vocabSet,inputSet):
        returnVec=[0]*len(vocabSet)
        for word in inputSet:
            if word in vocabSet:
                returnVec[vocabSet.index(word)]=1
        return returnVec
    
    data = createVocabList(x_trainAll)
    trainAll=[]
    for postinDoc in x_trainAll :
        trainAll.append(setOfWords2Vec(data,postinDoc))
    
    
    trainAll = np.array(trainAll)
    with open('test_data.txt','r') as test1:
            test1=[line.strip().split(' ') for line in test1]
    
    testAll=[]
    for postinDoc in test1:
        testAll.append(setOfWords2Vec(data,postinDoc))
    
    parameters={} 
    parameters['C'] = 0.02
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 1
    parameters['coef0'] = 1
    clf = strategy_instance.train_svm(parameters,trainAll,y)
    w = clf.coef_
    index = np.where(w[0] < 0)[0]
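    # negative weights mark class-0-indicative features; appending those words
    # to a line pushes the classifier's prediction towards class 0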
    
    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
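    # keep the 200 most negative weights (the strongest class-0 indicators)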
    dic_w = sorted(dic_w.items(), key=lambda d: d[1])[0:200]
    
    index = [dic_w[i][0] for i in range(len(dic_w))]

    for i in index:
        print('index: ',i)
    add_word = []
    for i in index:
        add_word.append(data[i])
    
    # append up to 20 class-0-indicative words that a line does not already contain
    for i in range(len(test1)):
        n = 0
        for w in add_word:
            if n == 20:
                break
            if w not in test1[i]:
                test1[i].append(w)
                n = n + 1

    print(len(test1))
    with open('./modified_data.txt', 'w+') as file:
        for i in range(len(test1)):
            file.write(" ".join(test1[i]))
            file.write("\n")
    line_num = 0
    with open('./modified_data.txt') as mod:
        for line in mod:
            line_num += 1
    print('line num of modified_data', line_num)

    ## You can check that the modified text is within the modification limits.
    modified_data='./modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
Esempio n. 24
0
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    from sklearn import svm
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    strategy_instance = helper.strategy()
    parameters = {}

    parameters['C'] = 1
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 'auto'
    parameters['coef0'] = 0

    vec = CountVectorizer(tokenizer=lambda x: x.split())
    # tfidf = TfidfVectorizer()
    data = []

    for line in strategy_instance.class0:
        data.append(' '.join(i for i in line))
    for line in strategy_instance.class1:
        data.append(' '.join(i for i in line))

    TFID = TfidfTransformer()
    X = TFID.fit_transform(vec.fit_transform(data))

    y = [0] * len(strategy_instance.class0) + [1] * len(
        strategy_instance.class1)

    clf = svm.SVC(kernel='linear', C=1)
    # clf = svm.SVC(kernel='linear', C=0.030999999999999996, class_weight="balanced")
    clf.fit(X, y)
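    # note: the 'parameters' dict above mirrors these settings, but the model
    # here is fitted directly with svm.SVC rather than via
    # strategy_instance.train_svm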
    # print(clf.coef_)

    coef_dict = dict()
    coef = clf.coef_[0]
    for i in range(len(coef.indices)):
        coef_dict[coef.indices[i]] = coef.data[i]

    coef_pos = [i for i in coef_dict.items() if i[1] > 0]
    coef_neg = [i for i in coef_dict.items() if i[1] < 0]
    coef_pos = sorted(coef_pos, key=lambda x: x[1], reverse=True)
    coef_neg = sorted(coef_neg, key=lambda x: x[1], reverse=False)

    # invert the vocabulary once instead of scanning it for every coefficient
    feature_names = {v: k for k, v in vec.vocabulary_.items()}
    # delete: word_pos (positive weight -> indicates class 1)
    word_pos = [feature_names[i[0]] for i in coef_pos][:1000]
    # add: word_neg (negative weight -> indicates class 0)
    word_neg = [feature_names[i[0]] for i in coef_neg]

    # modify
    modified_data = './modified_data.txt'
    # modify operation
    with open(test_data, 'r') as test_file:
        with open(modified_data, 'w') as modified_file:
            for line in test_file:
                use_add = []
                use_dele = []
                words = line.strip().split(' ')
                words = list(set(words))
                dele = [i for i in word_pos if i in words]
                # print('d', len(dele), dele)
                count = 0
                for w in dele[:10]:
                    words.remove(w)
                    use_dele.append(w)
                    count += 1

                add_new = [i for i in word_neg if i not in words]

                for i in range(20 - count):
                    words.append(add_new[i])
                    use_add.append(add_new[i])
                # print(use_dele)
                # print(use_add)
                use_dele = []
                use_add = []
                line = ' '.join(words)

                modified_file.write(line + '\n')

    # predict test
    with open(test_data, 'r') as file1:
        data1 = [line.strip().split(' ') for line in file1]
    testdata = []
    for line in data1:
        testdata.append(' '.join(i for i in line))

    X_test = vec.transform(testdata)
    # X_test = tfidf.transform(testdata)
    y1 = clf.predict(X_test)
    ##print(y1)
    result = sum(y1) * 100 / 200  # percentage of test lines predicted as class 1
    ##print('Success = {}%'.format(result))

    # predict modify
    with open(modified_data, 'r') as file2:
        data2 = [line.strip().split(' ') for line in file2]
    testdata2 = []
    for line in data2:
        testdata2.append(' '.join(i for i in line))

    X_test2 = vec.transform(testdata2)
    # X_test2 = tfidf.transform(testdata2)
    ##print(X_test2.shape)
    # print(X_test.toarray())
    y2 = clf.predict(X_test2)
    ##print(y2)
    result2 = sum(y2) * 100 / 200  # percentage of modified lines predicted as class 1
    ##print('Success = {}%'.format(result2))

    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
Esempio n. 25
0
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...


    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    from sklearn.feature_extraction.text import TfidfVectorizer

    strategy_instance = helper.strategy()
    import numpy as np
    y = np.zeros((380,1),dtype=int)  # np.int was removed from NumPy
    y[200:] = 1
    y = y.ravel()
    prng = np.random.RandomState(233233)
    # sample 200 class-0 lines by index (np.random.choice cannot sample
    # directly from a ragged list of token lists on recent NumPy)
    idx0 = prng.choice(len(strategy_instance.class0), 200)
    train0 = [strategy_instance.class0[i] for i in idx0]
    x_trainAll = train0 + strategy_instance.class1
    corpus = []
    for para in x_trainAll:
        corpus.append(' '.join(para))        # training set as space-joined text


    def createVocabList(dataSet):
        vocabSet=set([])
        for document in dataSet:
            vocabSet=vocabSet|set(document)
        return list(vocabSet)


    def setOfWords2Vec(vocabSet,inputSet):
        returnVec=[0]*len(vocabSet)
        for word in inputSet:
            if word in vocabSet:
                returnVec[vocabSet.index(word)]=1
        #print('type of returnVec',type(returnVec))
        return returnVec

    data = createVocabList(x_trainAll)
    trainAll=[]
    for postinDoc in x_trainAll :
        trainAll.append(setOfWords2Vec(data,postinDoc))


    trainAll = np.array(trainAll)
    with open('test_data.txt','r') as test1:           # test1 is the test_data file
            test1=[line.strip().split(' ') for line in test1]



    ####################### START OF TRANSFORM ############

    # build TF-IDF features over whitespace-separated tokens (punctuation kept)
    idf = TfidfVectorizer(token_pattern=r'\S+')
    xtrain_tfm = idf.fit_transform(corpus)  # turn the corpus into TF-IDF values
    weight = xtrain_tfm.toarray()

    ####################### TEST PART #####################
    testAll=[]
    for postinDoc in test1:
        testAll.append(setOfWords2Vec(data,postinDoc))
    test = []
    test2 = []
    for i in range(len(test1)):
        test2.append(' '.join(test1[i]))
        test.append(' '.join(test1[i]))

    parameters={}
    parameters['C'] = 0.02
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 1
    parameters['coef0'] = 1
    clf = strategy_instance.train_svm(parameters,weight,y)
    w = clf.coef_

    index = np.where(w[0] > 0)[0]
    #print('show type of index: ',index)

    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
    dic_w = sorted(dic_w.items(), key=lambda d: d[1],reverse = True)[0:-1]
    #dic_w_reverse = sorted(dic_w.items(), key=lambda d: d[1])[0:200]


    index = [dic_w[i][0] for i in range(len(dic_w))]
    #index_reverse = [dic_w_reverse[i][0] for i in range(len(dic_w))]

    # map weight indices back to the vectorizer's vocabulary (w is aligned with
    # the TF-IDF features, not with createVocabList's word ordering)
    feature_names = idf.get_feature_names()
    delete_word = []
    for i in index:
        delete_word.append(feature_names[i])
    #print('number of delete_word: ',len(delete_word))

    with open('test_data.txt', 'r') as f:
        f = [line.strip().split(' ') for line in f]
        #print('len of set(f[0])',len(set(f[0])))
    # delete words
    n = 0
    for i in range(len(f)):         # loop over the lines
        n = 0
        #print(i)
        deleted = set()
        added = set()
        for w in delete_word:          # iterate over the words to delete
            #print(w)
            if len(deleted) == 20:
                break
            if w in test1[i]:
                n = n + 1
                for index, s in enumerate(f[i]):
                    #print(index,s)
                    if s == w:
                        test1[i].remove(s)
                        deleted.add(s)
                        # one occurrence deleted
        print('number of deletions', len(deleted))

        '''   
        if len(deleted) != 20:
            print('len of set(deleted)',len(deleted))
            print('show set of deleted: ',deleted)
    print(len(set(f[0])))
    print(len(set(test1[0])))
        '''

    with open('./modified_data.txt', 'w+') as file:
        for i in range(len(test1)):
            file.write(" ".join(test1[i]))
            file.write("\n")



    ## You can check that the modified text is within the modification limits.
    modified_data='./modified_data.txt'
    if False:
        with open(modified_data, 'r') as mod:
            final_version = [line.strip().split(' ') for line in mod]
        data_final = createVocabList(final_version)
        final_ALL = []
        for postinDoc in final_version:
            final_ALL.append(setOfWords2Vec(data_final, postinDoc))

        final_ALL = np.array(final_ALL)




    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
Esempio n. 26
0
def fool_classifier(
        test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    with open(test_data, 'r') as infile:
        data = [line.strip().split(' ') for line in infile]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

    strategy_instance = helper.strategy()
    parameters = {}

    ##..................................#
    #
    #
    #
    ## Your implementation goes here....#
    #
    #
    #
    ##..................................#

    # It seems silly to have to re-concatenate the data that's just been split, only to have
    # CountVectorizer split it again. However, there doesn't seem to be a way to pass vectors
    # of tokens to CountVectorizer. It requires the examples to be strings.
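    # 'Concatenate' is defined elsewhere in the original file; presumably
    # something like: def Concatenate(examples): return [' '.join(e) for e in examples]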
    training_data = Concatenate(strategy_instance.class0) + Concatenate(
        strategy_instance.class1)
    training_labels = [0] * len(strategy_instance.class0) + [1] * len(
        strategy_instance.class1)
    # Note that we use a custom tokenizer with CountVectorizer to prevent it from removing
    # punctuation.
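    # 'SimpleTokenize' is likewise defined elsewhere; given the comment above,
    # it is presumably a plain whitespace split, e.g. lambda text: text.split(' ')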
    count_vect = CountVectorizer(tokenizer=SimpleTokenize).fit(training_data)
    training_counts = count_vect.transform(training_data)
    tfidf_transformer = TfidfTransformer()
    training_idf = tfidf_transformer.fit_transform(training_counts)
    # Train a linear SVM using a tf-idf representation of the training data.
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 2,
        'coef0': 0
    }
    classifier = strategy_instance.train_svm(parameters, training_idf,
                                             training_labels)
    # Use our SVM to determine the best words to remove, and possibly add, to fool the classifier.
    to_replace, replacements = construct_replace_list(
        classifier, count_vect.get_feature_names())
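    # 'construct_replace_list' is not shown here; from its use below it appears
    # to return (to_replace, replacements): a dict mapping each word to its rank
    # by SVM weight (lowest rank = most class-1-indicative) and a list of words
    # ordered from most class-0-indicative onwards.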

    for lineNo in range(len(data)):
        line = data[lineNo]
        wordset = set(line)
        # Look up the rank for each distinct word in the example, and construct a list of
        # (rank, word) tuples.
        word_ranks = []
        for word in wordset:
            if word in to_replace:
                word_ranks.append((to_replace[word], word))
        # Sort the list so that the words with the lowest rank, which most strongly indicate
        # class 1, are at the beginning.
        word_ranks.sort()
        # Construct a set of the 20 words that most strongly indicate class 1, and remove
        # these words from the example.
        to_remove = set([wi[1] for wi in word_ranks[:20]])
        new_line = []
        for i in range(len(line)):
            if line[i] not in to_remove:
                new_line.append(line[i])
        # If we couldn't find 20 words to remove then add words until the total number of changes
        # is 20. We add the words which most strongly indicate class 0.
        if len(to_remove) < 20:
            ri = 0
            for _ in range(20 - len(to_remove)):
                # Don't add a word if it's already in the example.
                while replacements[ri] in wordset:
                    ri += 1
                new_line.append(replacements[ri])
                ri += 1
        data[lineNo] = new_line

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    modified_data = './modified_data.txt'
    with open(modified_data, 'w') as outfile:
        for line in data:
            print(' '.join(line), file=outfile)

    ## You can check that the modified text is within the modification limits.
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
Esempio n. 27
0
"""
@author: junshuaizhang, monaithang
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
import helper
from collections import defaultdict
#import numpy as np
#def fool_classifier(test_data): ## Please do not change the function definition...
## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
#  and modifications limit checking
strategy_instance = helper.strategy()
parameters = {
    'gamma': 'auto',
    "C": 0.1,
    "degree": 10,
    "kernel": "linear",
    "coef0": -100
}
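# note: with the linear kernel, 'degree' and 'coef0' are ignored by sklearn's
# SVC; only C (and the data representation) affects the model here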

##..................................#
#
#
#
## Your implementation goes here....#
#
#
Esempio n. 28
0
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    # Read from the `test_data` argument rather than a hard-coded filename.
    with open(test_data, 'r') as infile:
        test_dt = [line.strip().split(' ') for line in infile]
    
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking
    # Padding tokens ('#', '##', ...) used when fewer than 20 real changes
    # were found; start at 1 so the empty string is never added as a word.
    constants = ['#' * i for i in range(1, 100)]
    strategy_instance = helper.strategy() 
    parameters = {
        'C': 1,
        'gamma': 'auto',
        'kernel': 'linear',
        'coef0': 0.0,
        'degree': 3
    }
    lines = [' '.join(line) for line in strategy_instance.class0] \
            + [' '.join(line) for line in strategy_instance.class1]
    
    cv = CountVectorizer()
    X_train = cv.fit_transform(lines)
    model = strategy_instance.train_svm(parameters, X_train,
                                        np.array([0] * 360 + [1] * 180))
    # coef_ comes back sparse for a linear SVC trained on sparse input;
    # densify it so indices line up with the vectorizer's feature columns
    # (indexing .data directly would skip the zero-weight features).
    coefs = model.coef_.toarray().ravel()
    top_coef_sorted = np.argsort(coefs)[::-1]
    top_features = np.array(cv.get_feature_names())
    ##..................................#
    modified_list = []

    for record in test_dt:
        record_new = list(record)  # work on a copy so `record` stays intact
        for coef_index in top_coef_sorted[:1000]:
            feature = top_features[coef_index]
            feature_coef = coefs[coef_index]

            if feature_coef > 0 and feature in record_new:
                # A positive weight pushes towards class 1, so remove the word.
                record_new = [word for word in record_new if word != feature]

            if len(set(record) ^ set(record_new)) == 20:  # no more modifications
                break

        for coef_index in top_coef_sorted[-1000:]:
            feature = top_features[coef_index]
            feature_coef = coefs[coef_index]
            if len(set(record) ^ set(record_new)) == 20:  # no more modifications
                break
            if feature_coef < 0 and feature not in record_new:
                # A negative weight pushes towards class 0, so add the word.
                record_new = record_new + [feature]

        if len(set(record) ^ set(record_new)) != 20:
            # Pad with '#' tokens until exactly 20 changes have been made.
            for const in constants:
                if const not in record_new:
                    record_new += [const]
                if len(set(record) ^ set(record_new)) == 20:
                    break

        modified_list.append(record_new)
    
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    with open("modified_data.txt", "w") as new_file:
        for record in modified_list:
            new_file.write(' '.join(record))
            new_file.write('\n')
    
    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
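A note on the repeated cap check in the example above: the number of modifications is the size of the symmetric set difference between the original and modified records. A tiny helper (hypothetical, shown only to make that intent explicit) would capture it as:

def num_changes(original, modified):
    # Words removed from `original` plus words added in `modified`.
    return len(set(original) ^ set(modified))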
Example n. 29
import numpy as np
import helper


def fool_classifier(test_data):  ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    #  and modifications limit checking

    strategy_instance = helper.strategy()
    parameters = {
        'gamma': 'auto',
        'C': 0.021,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0
    }

    test_file = test_data

    class_0 = strategy_instance.class0
    class_1 = strategy_instance.class1
    with open(test_file) as f:
        test = [line.strip().split(' ') for line in f]

    class_all = class_0 + class_1

    vocabulary = set()
    for sentence in class_all:
        vocabulary.update(sentence)

    word_list = sorted(vocabulary)

    # Binary word-presence features over the training vocabulary.
    train_data_matrix = []
    for sample in class_all:
        sample_words = set(sample)
        train_data_matrix.append(
            [1 if word in sample_words else 0 for word in word_list])

    train_data_matrix = np.array(train_data_matrix)

    test_data_matrix = []
    for sample in test:
        sample_words = set(sample)
        test_data_matrix.append(
            [1 if word in sample_words else 0 for word in word_list])

    test_data_matrix = np.array(test_data_matrix)

    train_label = np.array([0] * 360 + [1] * 180)

    test_label = np.array([1] * 200)
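    # Note: test_label is built here but never used below in this excerpt;
    # the attack operates directly on test_data_matrix.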

    ##clf_start = strategy_instance.train_svm(parameters, train_data_matrix, train_label)
    ####
    ##param_range = np.arange(0.001,1,0.01)
    ##
    ##param_grid = [{'C': param_range, 'kernel': ['linear']}]
    ##grid = GridSearchCV(clf_start, param_grid)
    ##grid.fit(train_data_matrix, train_label)
    ##clf = grid.best_estimator_
    ##print(clf)

    clf = strategy_instance.train_svm(parameters, train_data_matrix,
                                      train_label)

    dual_coef = clf.dual_coef_[0]

    class_0_dual_coef = dual_coef[:clf.n_support_[0]]
    class_1_dual_coef = dual_coef[clf.n_support_[0]:]

    support_vector_index = clf.support_

    class_0_dual_sv_index = []
    class_1_dual_sv_index = []
    for i in range(len(dual_coef)):
        if i < clf.n_support_[0]:
            class_0_dual_sv_index.append(
                (dual_coef[i], support_vector_index[i]))
        else:
            class_1_dual_sv_index.append(
                (dual_coef[i], support_vector_index[i]))

    class_0_dual_sv_index = sorted(class_0_dual_sv_index,
                                   key=lambda x: abs(x[0]),
                                   reverse=True)

    class_1_dual_sv_index = sorted(class_1_dual_sv_index,
                                   key=lambda x: x[0],
                                   reverse=True)
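    # Rationale: for a linear kernel the decision value is
    #   f(x) = sum_i dual_coef_i * K(sv_i, x) + b,
    # where dual_coef_i is negative for class-0 support vectors and positive
    # for class-1 ones. Zeroing a feature that x shares with a class-1
    # support vector shrinks that positive kernel term and pushes f(x)
    # towards class 0; setting a feature shared with a class-0 support
    # vector grows a negative term, pushing f(x) the same way.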

    for test_instance in test_data_matrix:

        change_count = set()

        for d1 in class_1_dual_sv_index:
            index = d1[1]
            train_instance = train_data_matrix[index]

            for i in range(len(test_instance)):
                if test_instance[i] == 1 and train_instance[i] == 1:
                    if i in change_count:
                        continue

                    test_instance[i] = 0
                    change_count.add(i)
                    if len(change_count) >= 20:
                        break

            if len(change_count) >= 20:
                break

        if len(change_count) >= 20:
            continue

        for d0 in class_0_dual_sv_index:
            index = d0[1]
            train_instance = train_data_matrix[index]

            for i in range(len(test_instance)):
                if test_instance[i] == 0 and train_instance[i] == 1:
                    if i in change_count:
                        continue

                    test_instance[i] = 1
                    # Record the addition; otherwise the 20-change cap
                    # below can never be reached and words keep being added.
                    change_count.add(i)

                    if len(change_count) >= 20:
                        break

            if len(change_count) >= 20:
                break

        if len(change_count) >= 20:
            continue

    modified_data = './modified_data.txt'

    # Open in write mode so a re-run does not append duplicate lines.
    with open(modified_data, 'w') as f:
        training_words = set(word_list)
        word_index = {word: idx for idx, word in enumerate(word_list)}
        for i in range(len(test)):
            words_all = sorted(set(test[i]) | training_words)
            modified_test_instance = test_data_matrix[i]

            for word in words_all:
                # Words outside the training vocabulary are kept verbatim;
                # vocabulary words are written only if their bit is still
                # set in the modified feature vector.
                if word not in training_words:
                    f.write(f'{word} ')
                elif modified_test_instance[word_index[word]] == 1:
                    f.write(f'{word} ')

            f.write('\n')

    ##..................................#
    #
    #
    #
    ## Your implementation goes here....#
    #
    #
    #
    ##..................................#

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.

    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
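As a comparison (a sketch, not part of the original): with a linear kernel, the per-word influence exploited above can be read directly from the primal weight vector that sklearn derives from the dual coefficients, avoiding the walk over individual support vectors:

import numpy as np

def strongest_class1_words(clf, word_list, k=20):
    # For a linear kernel, clf.coef_ equals dual_coef_ @ support_vectors_.
    w = clf.coef_
    if hasattr(w, 'toarray'):  # sparse when trained on sparse input
        w = w.toarray()
    w = np.ravel(w)
    # Largest weights push the decision towards class 1.
    return [word_list[i] for i in np.argsort(w)[::-1][:k]]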
Example n. 30
import helper


# Instantiate the strategy once and reuse it for both classes.
strategy_instance = helper.strategy()
class0 = strategy_instance.class0
class1 = strategy_instance.class1

feature_list = []
for line in class0:
    feature_list.extend(line)

for line in class1:
    feature_list.extend(line)

# Deduplicate to form the training vocabulary.
feature_list = list(set(feature_list))

with open('test_data.txt', 'r') as infile:
    test_data = [line.strip().split(' ') for line in infile]


# Bag-of-words count vector for each test line over the training vocabulary.
feature_index = {word: idx for idx, word in enumerate(feature_list)}
test = []
for line in test_data:
    counts = [0] * len(feature_list)
    for word in line:
        if word in feature_index:
            counts[feature_index[word]] += 1
    test.append(counts)
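For reference, the hand-rolled counting above matches what sklearn's CountVectorizer produces when given a fixed vocabulary and a pass-through analyzer (a sketch for comparison only; `test_counts` is a hypothetical name, not part of the original script):

from sklearn.feature_extraction.text import CountVectorizer

# Pass-through analyzer: each document is already a list of tokens.
cv = CountVectorizer(vocabulary=feature_list, analyzer=lambda doc: doc)
test_counts = cv.transform(test_data).toarray().tolist()
# test_counts should equal `test` as built by the loop above.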