def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()

    # gamma : float, optional (default='auto')  2**-15 ~ 2**3
    #     Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
    #     If gamma is 'auto' then 1/n_features will be used instead.
    # C : float, optional (default=1.0)  2**-5 ~ 2**15
    #     Penalty parameter C of the error term.
    # kernel : string, optional (default='rbf')
    #     Specifies the kernel type to be used in the algorithm.
    #     It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
    #     a callable.
    #     If none is given, 'rbf' will be used. If a callable is given it is
    #     used to pre-compute the kernel matrix from data matrices; that matrix
    #     should be an array of shape ``(n_samples, n_samples)``.
    # degree : int, optional (default=3)
    #     Degree of the polynomial kernel function ('poly').
    #     Ignored by all other kernels.
    # coef0 : float, optional (default=0.0)
    #     Independent term in kernel function.
    #     It is only significant in 'poly' and 'sigmoid'.
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'rbf',
        'degree': 3,
        'coef0': 0.0
    }
    do_log = False
    do_norm = None
    do_bern = True

    ##..................................#
    # get vocabulary
    vocabulary = get_vocabulary(strategy_instance)
    # get y_train
    y_train = get_y_train(strategy_instance)

    # compare different parameters: sweep C over 2**-5..2**15 and
    # gamma over 2**-15..2**3 (21 * 19 = 399 combinations)
    remained_iteration = 399
    for i in range(-5, 16):
        c = 2**i
        parameters['C'] = c
        for j in range(-15, 4):
            parameters['gamma'] = 2**j
            calculate_parameter(strategy_instance, vocabulary, parameters,
                                y_train, do_bern, do_log, do_norm,
                                remained_iteration)
            remained_iteration -= 1
    # debug('y_train:', type(y_train), y_train.shape)
    # clf = strategy_instance.train_svm(parameters, x_train, y_train)
    # log_result(clf, vocabulary, do_bern, do_log, do_norm)
    ##..................................#

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    # assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
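
# A minimal sketch (not part of any original solution) of the same C/gamma
# sweep that the double loop above performs, expressed with scikit-learn's
# GridSearchCV. The training matrices are assumed to come from the helpers
# used elsewhere in this file (get_x_train / get_y_train); this function
# itself is hypothetical.
def sketch_grid_search(x_train, y_train):
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    param_grid = {
        'C': [2**i for i in range(-5, 16)],      # 2**-5 ~ 2**15
        'gamma': [2**j for j in range(-15, 4)],  # 2**-15 ~ 2**3
        'kernel': ['rbf'],
    }
    grid = GridSearchCV(SVC(), param_grid, cv=5)
    grid.fit(x_train, y_train)
    return grid.best_params_, grid.best_score_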
def train_target_and_validate_changed_txt(test_file, modified): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    np.set_printoptions(threshold=np.nan)

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    with open(test_file, 'r') as test_text:
        test_samples = [line.strip().split(' ') for line in test_text]
    parameters = {
        'C': 1,
        'kernel': 'linear',
        'degree': 3,
        'gamma': 'auto',
        'coef0': 1
    }
    investigate_train_results(
        train(strategy_instance, parameters, test_samples), modified)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    np.set_printoptions(threshold=np.nan)

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    with open(test_data, 'r') as test_text:
        test_samples = [line.strip().split(' ') for line in test_text]
    class_0 = strategy_instance.class0
    class_1 = strategy_instance.class1
    # print(full_dict)
    # print(len(full_dict))
    parameters = {
        'C': 0.4,
        'kernel': 'linear',
        'degree': 3,
        'gamma': 'auto',
        'coef0': 0.0
    }
    investigate_train_results(
        train(strategy_instance, parameters, 220, test_samples))

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    vectorizer = CountVectorizer()
    strategy_instance = helper.strategy()
    x_train = []
    x_train_ = []
    list_x_train = strategy_instance.class0 + strategy_instance.class1
    for i in list_x_train:
        print(i)
        x_train.append(' '.join(i))  # CountVectorizer expects one string per document
        x_train_ += i                # flat token list, kept for debugging
    print('x_train_: \n')
    print(x_train_)
    X = vectorizer.fit_transform(x_train)
    print(X.toarray())
    print(vectorizer.get_feature_names())
    y = np.zeros((540, 1), dtype=np.int)
    y[360:] = 1
    y = y.ravel()  # Convert a multidimensional array to a one-dimensional array
    # print(y)
    print('end of printing y')
def fool_classifier(test_data):
    with open(test_data, 'r') as file:
        data = [line.strip().split(' ') for line in file]
    strategy_instance = helper.strategy()
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }
    x_train, y_train, word_list = train_generator(strategy_instance.class0,
                                                  strategy_instance.class1)
    clf = strategy_instance.train_svm(parameters, x_train, y_train)
    data_modified(word_list, clf.coef_[0], data)
    with open('modified_data.txt', 'w') as file:
        file.write('\n'.join([' '.join(a) for a in data]))
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    test_dt = None
    with open(test_data, 'r') as infile:
        test_dt = [line.strip().split(' ') for line in infile]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {
        'C': 1,
        'gamma': 'auto',
        'kernel': 'linear',
        'coef0': 0.0,
        'degree': 3
    }
    lines = [' '.join(line) for line in strategy_instance.class0] \
        + [' '.join(line) for line in strategy_instance.class1]

    # prepare X (samples) and y (targets)
    # bag-of-words vectorizer
    cv = CountVectorizer()
    cv.fit(lines)
    X_train = cv.transform(lines)
    model = strategy_instance.train_svm(parameters, X_train,
                                        np.array([0] * 360 + [1] * 180))

    # sort the coefficients from positive to negative
    top_coef_sorted = np.argsort(model.coef_.toarray()[0])[::-1]
    top_features = np.array(cv.get_feature_names())

    ##..................................#
    modified_list = []
    # go through each test record and delete the features in the record
    # that correspond to the most positive coefficients of the trained model
    for record in test_dt:
        record_new = record
        for coef_index in top_coef_sorted:
            feature = top_features[coef_index]
            record_new = [word for word in record_new if word != feature]
            if len((set(record) - set(record_new)) |
                   (set(record_new) - set(record))) == 20:
                # no more modifications
                break
        modified_list.append(record_new)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    new_file = open("modified_data.txt", "w")
    for i in modified_list:
        new_file.write(' '.join(i))
        new_file.write('\n')
    new_file.close()

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}

    ##..................................#
    #
    #
    ## Your implementation goes here....#
    #
    #
    ##..................................#

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}
    pre, list_name, test_data1, class_1, class_0 = extract(test_data)
    fool(pre, list_name, test_data1, class_1, class_0)
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}

    # initial parameters
    parameters['gamma'] = 'auto'
    parameters['C'] = 100
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['coef0'] = 0.0

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import CountVectorizer
    for i in range(150, 450, 5):
        vectorizer = CountVectorizer(stop_words='english',
                                     min_df=10,
                                     preprocessor=None,
                                     max_features=i)
        try:
            vectorizer.fit(merge(['class-0.txt', 'class-1.txt']))
        except Exception:
            continue
        X_train = vectorizer.transform(merge(['class-0.txt', 'class-1.txt']))
        X_test = vectorizer.transform(open('test_data.txt', 'r'))
        Y_train = ['class-0'] * 360 + ['class-1'] * 180
        Y_test = ['class-1'] * 200
        # print(X_train.shape)
        # print(vectorizer.get_feature_names())
        clf = strategy_instance.train_svm(parameters, X_train, Y_train)
        # print(clf.predict(X_test))
        print('c = ', 100, 'mindf = ', 10, 'i = ', i, end='')
        print(correctrate(clf.predict(X_test), Y_test),
              correctrate(clf.predict(X_train), Y_train))
        # print(X_train)

    ##..................................#
    #
    #
    ## Your implementation goes here....#
    #
    #
    ##..................................#

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...

    ## You can check that the modified text is within the modification limits.
    # modified_data = './modified_data.txt'
    # assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    test_dt = None
    with open(test_data, 'r') as infile:
        test_dt = [line.strip().split(' ') for line in infile]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {
        'C': 2**-5,
        'gamma': 'auto',
        'kernel': 'linear',
        'coef0': 0.0,
        'degree': 3
    }
    lines = [' '.join(line) for line in strategy_instance.class0] \
        + [' '.join(line) for line in strategy_instance.class1]
    cv = CountVectorizer()
    cv.fit(lines)
    X_train = cv.transform(lines)
    model = strategy_instance.train_svm(parameters, X_train,
                                        np.array([0] * 360 + [1] * 180))
    # argsort over the dense coefficient row so the indices align with the
    # vocabulary columns (sorting sparse .data would misalign them)
    top_positive_coef = np.argsort(model.coef_.toarray()[0])[::-1]
    top_features = np.array(cv.get_feature_names())

    ##..................................#
    modified_list = []
    for record in test_dt:
        del_count = 0
        record_new = record
        for feature in top_features[top_positive_coef]:
            if feature in record_new:
                if del_count == 20:
                    break
                record_new = [word for word in record_new if word != feature]
                del_count += 1
        modified_list.append(record_new)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    new_file = open("modified_data.txt", "w")
    for i in modified_list:
        new_file.write(' '.join(i))
        new_file.write('\n')
    new_file.close()

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
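
# Hypothetical sanity check, mirroring what helper.py's check_data
# presumably enforces: each modified line may differ from the original by at
# most 20 distinct tokens (symmetric difference). The 20-token budget is
# inferred from the "== 20" / "del_count == 20" tests in the snippets above,
# not from an official spec.
def within_modification_limit(orig_path, mod_path, limit=20):
    with open(orig_path) as f1, open(mod_path) as f2:
        for orig_line, mod_line in zip(f1, f2):
            if len(set(orig_line.split()) ^ set(mod_line.split())) > limit:
                return False
    return True

# Usage: within_modification_limit('test_data.txt', 'modified_data.txt')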
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}
    x_train = strategy_instance.class0 + strategy_instance.class1
    train = []
    for i in range(len(x_train)):
        train.append(' '.join(x_train[i]))
    with open(test_data, 'r') as test1:
        test1 = [line.strip().split(' ') for line in test1]
    test = []
    for i in range(len(test1)):
        test.append(' '.join(test1[i]))

    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    tfv = CountVectorizer()
    tfv.fit(list(train))
    word = tfv.get_feature_names()
    xtrain_tfv = tfv.transform(train)
    xvalid_tfv = tfv.transform(test)
    print('len of word', len(word))

    import numpy as np
    y = np.zeros((540, 1), dtype=np.int)
    y[360:] = 1
    y = y.ravel()

    from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
    parameters['C'] = 1.0
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 10
    parameters['coef0'] = 1
    clf = strategy_instance.train_svm(parameters, xtrain_tfv, y)
    predict_test = clf.predict(xvalid_tfv)
    count_1 = 0
    count_0 = 0
    for x in predict_test:
        if x == 1:
            count_1 += 1
        if x == 0:
            count_0 += 1
    print('count_1: ', count_1)
    print('count_0: ', count_0)
    # done

    # collect the 100 most negative (class-0-leaning) feature weights
    w = clf.coef_.toarray()
    index = np.where(w[0] < 0)[0]
    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
    dic_w = sorted(dic_w.items(), key=lambda d: d[1])[0:100]
    index = [dic_w[i][0] for i in range(len(dic_w))]
    add_word = []
    for i in index:
        add_word.append(tfv.get_feature_names()[i])
    n = 0
    for i in range(len(test)):
        n = 0
        for w in add_word:
            if n == 20:
                break
            if w not in test1[i]:
                test[i] = test[i] + " " + w
            else:
                continue
            n = n + 1

    file = open('./modified_data.txt', 'w')
    for i in range(len(test)):
        file.write(test[i])
        file.write("\n")
    file.close()

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data): ## Please do not change the function definition...
    strategy_instance = helper.strategy()
    parameters = {}
    list_class0 = strategy_instance.class0
    list_class1 = strategy_instance.class1
    vertical_dim_of_trainx = len(list_class0) + len(list_class1)
    paragraphs = []
    for para in list_class0:
        little_paragraph = ""
        for word in para:
            little_paragraph += word
            little_paragraph += " "
        paragraphs.append(little_paragraph)
    for para in list_class1:
        little_paragraph = ""
        for word in para:
            little_paragraph += word
            little_paragraph += " "
        paragraphs.append(little_paragraph)
    y_train = []
    for i in range(len(list_class0)):
        y_train.append(0)
    for j in range(len(list_class1)):
        y_train.append(1)
    x_vector = TfidfVectorizer(token_pattern=r'[^\s]+')
    x_train = x_vector.fit_transform(paragraphs)
    words_bag = x_vector.vocabulary_

    # looking for the best 'C'
    C_parameter = np.arange(0.01, 1.2, 0.01)
    parameters_for_grid = {'kernel': ['linear'], 'C': C_parameter}
    clf_for_grid = GridSearchCV(svm.SVC(), parameters_for_grid)
    clf_for_grid.fit(x_train, y_train)
    c_best = clf_for_grid.best_params_
    word_list = x_vector.get_feature_names()
    parameters = {
        'kernel': 'linear',
        'C': c_best['C'],
        'degree': 1,
        'coef0': 0,
        'gamma': 'auto'
    }
    clf = strategy_instance.train_svm(parameters, x_train, y_train)
    weight_list = clf.coef_.toarray().tolist()[0]
    for i in range(len(weight_list)):
        if weight_list[i] > 0:
            weight_list[i] = weight_list[i] * 2
    x_data, data_list = transform_data(test_data, words_bag)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    dict_for_word = {}
    for idx in range(len(word_list)):
        dict_for_word[word_list[idx]] = weight_list[idx]
    sorted_dict_for_word = sorted(dict_for_word.items(), key=lambda x: x[1])
    reversed_dict_for_word = sorted(dict_for_word.items(),
                                    key=lambda x: x[1], reverse=True)
    list_for_test_dict = []
    sorted_test_dict = []
    rsorted_test_dict = []
    for idx in range(len(data_list)):
        paragraph_list = data_list[idx]
        test_data_dict = {}
        for word in paragraph_list:
            if word in dict_for_word:
                test_data_dict[word] = dict_for_word[word]
        list_for_test_dict.append(test_data_dict)
        sorted_test_dict.append(
            sorted(test_data_dict.items(), key=lambda x: x[1]))
        rsorted_test_dict.append(
            sorted(test_data_dict.items(), key=lambda x: x[1], reverse=True))
    for i in range(len(list_for_test_dict)):
        time = 20
        s_j = 0
        add_index = 0
        while time > 0:
            if sorted_dict_for_word[add_index][1] + rsorted_test_dict[i][s_j][1] > 0:
                rm_all(data_list[i], rsorted_test_dict[i][s_j][0])
                time -= 1
                s_j += 1
            else:
                if sorted_dict_for_word[add_index][0] not in data_list[i]:
                    data_list[i].append(sorted_dict_for_word[add_index][0])
                    time -= 1
                add_index += 1
    f = open('modified_data.txt', 'w')
    for i in data_list:
        line = ''
        for word in i:
            line += word
            line += ' '
        line += '\n'
        f.write(line)
    f.close()
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    modify_x, m_list = transform_data('modified_data.txt', words_bag)
    return strategy_instance
def fool_classifier(test_data):
    ## Please do not change the function definition...
    strategy_instance = helper.strategy()
    parameters = {'kernel': 'linear',
                  'C': 1.0,
                  'gamma': 'auto',
                  'degree': 3,
                  'coef0': 0.0}

    # generate training dictionary
    training_set = set()
    for line in strategy_instance.class0:
        for token in line:
            training_set.add(token)
    for line in strategy_instance.class1:
        for token in line:
            training_set.add(token)
    dictionary = {}
    for token in training_set:
        dictionary[token] = 0
    dictionary_index = list(dictionary.keys())

    # generate x_train and y_train
    x = []
    y = []
    for line in strategy_instance.class0:
        tmp_dic = copy.deepcopy(dictionary)
        for token in line:
            tmp_dic[token] += 1
        x.append(list(tmp_dic.values()))
        y.append(0)
    for line in strategy_instance.class1:
        tmp_dic = copy.deepcopy(dictionary)
        for token in line:
            tmp_dic[token] += 1
        x.append(list(tmp_dic.values()))
        y.append(1)
    x = np.array(x)

    # document frequency: number of samples that contain each token
    ti = []
    for col in range(len(x[0])):
        count = 0
        for row in range(len(x)):
            if x[row][col] >= 1:
                count += 1
        ti.append(count)

    # compute tf-idf to generate x_train
    tf_vector = []
    idf_vector = []
    for i in range(len(x)):
        tf = []
        idf = []
        row_sum = sum(x[i])
        for j in range(len(x[0])):
            tf.append(x[i][j] / row_sum)
            idf.append(math.log(x.shape[0] / (ti[j] + 1), 2))
        tf_vector.append(tf)
        idf_vector.append(idf)
    x_train = np.array(tf_vector) * np.array(idf_vector)
    y_train = np.array(y)

    # training
    clf = strategy_instance.train_svm(parameters, x_train, y_train)

    # get the weights of the features and sort them
    support_vector = clf.coef_[0]
    weighted_dictionary = copy.deepcopy(dictionary)
    for i in range(len(dictionary_index)):
        weighted_dictionary[dictionary_index[i]] = support_vector[i]
    sorted_weight = sorted(weighted_dictionary.items(),
                           key=lambda x: x[1], reverse=False)

    ## modify test.txt based on sorted_weight
    with open(test_data, 'r') as f:
        data = [line.strip().split(' ') for line in f]
    for n_line in range(len(data)):
        count = 0
        while count < 20:
            for i in range(len(sorted_weight)):
                if sorted_weight[-i - 1][0] in data[n_line]:
                    tmp = [word for word in data[n_line]
                           if word != sorted_weight[-i - 1][0]]
                    data[n_line] = tmp
                    count += 1
                    break
    # for n_line in range(len(data)):
    #     count = 0
    #     while count < 10:
    #         for i in range(len(sorted_weight)):
    #             if sorted_weight[i][0] not in data[n_line]:
    #                 data[n_line] = data[n_line] + [sorted_weight[i][0]]
    #                 count += 1
    #                 break

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory
    modified_data = './modified_data.txt'
    path = os.getcwd()
    os.chdir(path)
    with open(modified_data, "w") as f:
        for line in data:
            f.write(" ".join(sorted(line)) + "\n")

    # #### my test script ########################
    # with open(modified_data, 'r') as f:
    #     data = [line.strip().split(' ') for line in f]
    # x = []
    # for line in data:
    #     tmp_dic = copy.deepcopy(dictionary)
    #     for token in line:
    #         if token in tmp_dic:
    #             tmp_dic[token] += 1
    #     x.append(list(tmp_dic.values()))
    # x_test = np.array(x)
    # y_test = np.array([0 for _ in range(len(data))])
    # print(clf.predict(x_test))
    # print(clf.predict(x_test).shape)
    # print(clf.score(x_test, y_test))
    # ############################################

    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance
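
# For comparison only: a sketch of the same featurisation with scikit-learn.
# Note this is NOT numerically identical to the hand-rolled version above:
# TfidfVectorizer computes idf as ln((1 + n) / (1 + df)) + 1 (smooth_idf) and
# l2-normalises each row by default, whereas the loop above uses
# tf = count / doc_len and idf = log2(n / (df + 1)) with no normalisation.
# This helper is hypothetical and not part of the original solution.
def sketch_tfidf_features(strategy_instance):
    from sklearn.feature_extraction.text import TfidfVectorizer
    docs = [' '.join(line) for line in strategy_instance.class0] + \
           [' '.join(line) for line in strategy_instance.class1]
    tfv = TfidfVectorizer(token_pattern=r'\S+')  # keep every whitespace-separated token
    return tfv.fit_transform(docs), tfv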
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {
        'gamma': 'auto',
        'C': 0.02,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }

    # clear modified_data.txt in case it already has content
    modified_data = './modified_data.txt'
    f = open(modified_data, "w")
    f.close()

    ##..................................#
    # first step: read strategy().class0 and strategy().class1
    class0_data = strategy_instance.class0  # type is list, 2 dimensions
    class1_data = strategy_instance.class1
    # second step: open the test_data file
    with open(test_data, 'r') as test_d:
        test = test_d.read()

    # transform class0_data/class1_data (lists) into dictionaries and label them
    features_and_labels = []
    store_test_features = []
    for line in class0_data:
        # if line != '':
        class0_dict = get_freq_of_tokens_from_list(line)
        features_and_labels.append((class0_dict, 0))
    for line in class1_data:
        # if line != '':
        class1_dict = get_freq_of_tokens_from_list(line)
        features_and_labels.append((class1_dict, 1))
    # print('len(features_and_labels)', len(features_and_labels))

    # transform test (txt file) into dictionaries and label them
    for line in test.split('\n'):
        if line != '':
            test_dict = get_freq_of_tokens_from_file(line)
            store_test_features.append((test_dict, 1))

    # transform features_and_labels into x_train (matrix) and y_train (array)
    encoder = LabelEncoder()
    vectorizer = DictVectorizer(dtype=int, sparse=True)
    x_train, y_train = list(zip(*features_and_labels))
    x_train = vectorizer.fit_transform(x_train)
    y_train = encoder.fit_transform(y_train)
    x_test, y_test = list(zip(*store_test_features))
    x_test = vectorizer.transform(x_test)
    # print(x_train.shape)

    # train the SVM on the training data (540, 5178)
    training = strategy_instance.train_svm(parameters, x_train, y_train)
    # print(training)
    # result = training.predict(x_test)  # the test data's predicted labels

    # map each word in each test sample to its appearance frequency
    store_each_test_sample_frequency_dict_list = test_samples_word_to_frequency(
        x_test, vectorizer)
    # from the SVM training result, find each feature's weight,
    # i.e. the mapping between a word and its weight
    sorted_traing_data_word_weight_dict = training_data_coeffecient(
        training, vectorizer)
    # for each test sample, get its mapping between word and weight
    sorted_whole_test_sample_word_weight_dict_list = dict_between_test_words_and_weight(
        store_each_test_sample_frequency_dict_list,
        sorted_traing_data_word_weight_dict, vectorizer)

    ## Your implementation goes here....#
    # training-data word weights (2-dimensional list), used to compare
    # against each test sample's word weights
    sorted_traing_data_word_weight_list = dict_to_list(
        sorted_traing_data_word_weight_dict)
    # test-data word weights (3-dimensional list), used to compare
    # against the training samples' word weights
    sorted_whole_test_sample_word_weight_3_dimensions_list = []
    for each_sample in sorted_whole_test_sample_word_weight_dict_list:
        sorted_whole_test_sample_word_weight_3_dimensions_list.append(
            dict_to_list(each_sample))
    #
    print('sorted_whole_test_sample_word_weight_3_dimensions_list----------',
          sorted_whole_test_sample_word_weight_3_dimensions_list)
    #
    final_result = final_add_or_delete(
        sorted_whole_test_sample_word_weight_3_dimensions_list,
        sorted_traing_data_word_weight_list, test)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    with open(modified_data, 'w') as modified:
        modified.write(final_result)

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool(pre, list_name, test_data, class_1, class_0):
    strategy = helper.strategy()
    weight_ = list(pre.coef_)[0]
    # print(pre.coef_)
    word_weight = dict.fromkeys(list_name, 0)
    n_w = 0
    for word in list_name:
        word_weight[word] = weight_[n_w]
        n_w += 1
    weight_list = []
    for word, val in word_weight.items():
        weight_list.append([val, word])
    weight_class1 = sorted(weight_list, reverse=True)
    weight_class0 = sorted(weight_list, reverse=False)
    print(weight_class1[:10])
    print(weight_class0[:10])

    # split the 20-change budget between deletions and additions
    # according to the class-size ratio
    len_class_1 = len(class_1)
    len_class_0 = len(class_0)
    ratio = 0
    de_r = 0
    if len_class_1 < len_class_0:
        r = float(len_class_0 / len_class_1)
        if r == 1:
            ratio = 0.5
        if r > 1 and r < 1.5:
            ratio = 0.7
        if r > 1.5 and r < 2:
            ratio = 0.8
        if r == 2:
            ratio = 0.85
        if r > 2 and r < 3:
            ratio = 0.9
        if r > 3:
            ratio = 0.95
        de_r = int(20 * ratio)
    if len_class_1 > len_class_0:
        r = float(len_class_1 / len_class_0)
        if r == 1:
            ratio = 0.5
        if r > 1 and r < 1.5:
            ratio = 0.3
        if r > 1.5 and r < 2:
            ratio = 0.2
        if r == 2:
            ratio = 0.15
        if r > 2 and r < 3:
            ratio = 0.1
        if r > 3:
            ratio = 0.05
        de_r = int(20 * ratio)

    with open('log.txt', 'w') as l_f:
        with open('modified_data.txt', 'w') as t_f:
            for t_line in test_data:
                count = de_r
                add_change = []
                delete_change = []
                add_index = 0
                t_delete = []
                t_add = []
                delete = []
                add = []
                # --------------- delete
                for index in range(len(weight_class1)):
                    if count > 0:
                        if weight_class1[index][1] in t_line:
                            count -= 1
                            delete_change.append(weight_class1[index][1])
                for t_data in t_line:
                    if t_data in delete_change:
                        delete.append(t_data)
                        continue
                    else:
                        t_delete.append(t_data)
                        t_f.write(t_data + ' ')
                # --------------- add
                count = 20 - de_r
                for index in range(len(weight_class0)):
                    if count > 0:
                        if weight_class0[index][1] not in t_line:
                            count -= 1
                            add_change.append(weight_class0[index][1])
                            add_index = index
                for a_data in add_change:
                    t_add.append(a_data)
                    add.append(a_data)
                    t_f.write(a_data + ' ')
                total = t_add + t_delete
                # --------------- check add: top up additions while staying within the limit
                T = set(total)
                L = set(t_line)
                if len(set(T - L) | set(L - T)) < 20:
                    for index in range(add_index, len(weight_class0)):
                        T = set(total)
                        if len(set(T - L) | set(L - T)) > 20:
                            break
                        else:
                            total.append(weight_class0[index][1])
                            T = set(total)
                            if len(set(T - L) | set(L - T)) <= 20:
                                add.append(weight_class0[index][1])
                                t_f.write(weight_class0[index][1] + ' ')
                # print('total', total_change)
                # print('c0', class_0_list)
                # print('c1', class_1_list)
                # print(len(set(T - L)))
                t_f.write('\n')
                l_f.write('-----------' + 'delete:' + '\n' + str(set(delete)) + '\n')
                l_f.write('-----------' + 'add:' + '\n' + str(set(add)) + '\n')
                l_f.write('\n')
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()

    ########################### define parameter ###########################
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }

    ########################### feature extraction #########################
    # debug_matrix('class0', strategy_instance.class0)
    # debug_matrix('class1', strategy_instance.class1)
    y_train = get_y_train(strategy_instance)
    # debug('y_train =\n', y_train)
    x_train, vectorizer = get_x_train(strategy_instance)
    # debug('x_train =\n', x_train)

    ############################## train model #############################
    clf = strategy_instance.train_svm(parameters, x_train, y_train)
    # debug(clf)
    # grid search
    # param_range = [2**i for i in range(-5, 16)]
    # param_grid = [{'C': param_range, 'kernel': ['linear']}]
    # grid = GridSearchCV(clf_start, param_grid)
    # grid.fit(x_train, y_train)
    # clf = grid.best_estimator_
    vocabulary = vectorizer.get_feature_names()
    # debug('vocabulary =\n', vocabulary)
    weight_list = clf.coef_.tolist()[0]
    # debug('weight_list =\n', weight_list)

    ############################## modify file #############################
    modified_data = './modified_data.txt'
    # read file
    test_matrix = read_test_matrix(test_data)
    # get modified matrix
    modified_matrix = []
    for test_vector in test_matrix:
        modified_vector = get_modified_vector(
            test_vector, vocabulary, weight_list, vectorizer)
        modified_matrix.append(modified_vector)
    # write file
    write_modified_matrix(modified_matrix, modified_data)

    ################################## test ################################
    # show_test_result(clf, vectorizer)
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance
def extract(test_data1):
    strategy = helper.strategy()
    # class0 = strategy.class0
    # class1 = strategy.class1
    class_0 = strategy.class0
    class_1 = strategy.class1
    # ---------------------------------------------------------------
    # (random data-augmentation experiment, kept for reference)
    # class_0_a = []
    # class_1_a = []
    # count_0 = 360
    # count_1 = 180
    # while count_0:
    #     count_line = randint(4, 8)
    #     t = []
    #     while count_line:
    #         line_index = randint(0, 359)
    #         index_data = randint(0, len(class0[line_index]) - 20)
    #         for index in range(index_data, index_data + 20):
    #             t.append(class0[line_index][index])
    #         count_line -= 1
    #     class_0_a.append(t)
    #     count_0 -= 1
    # while count_1:
    #     count_line = randint(4, 8)
    #     t = []
    #     while count_line:
    #         line_index = randint(0, 179)
    #         index_data = randint(0, len(class1[line_index]) - 20)
    #         for index in range(index_data, index_data + 20):
    #             t.append(class1[line_index][index])
    #         count_line -= 1
    #     class_1_a.append(t)
    #     count_1 -= 1
    # class_0 = class0 + class_0_a
    # class_1 = class1 + class_1_a
    # print(len(class_0))
    # print(len(class_1))
    # ---------------------------------------------------------------
    with open(test_data1, 'r') as test_file:
        test_data = [line.strip().split(' ') for line in test_file]
    vectorizer = TfidfVectorizer(max_features=5720,
                                 use_idf=True,
                                 norm='l2',
                                 analyzer='word',
                                 token_pattern=r'[^\s]+')
    # class_0 = class_0[int(len(class_0) / 2):]
    # data = vectorizer.fit_transform([' '.join(line) for line in class_0 + class_1 + test_data])
    data = vectorizer.fit_transform(
        [' '.join(line) for line in class_0 + class_1])
    # data1 = vectorizer.fit_transform([' '.join(line) for line in class_1])
    print(data.toarray())
    # use transform (not fit_transform) here, so the test matrix and the
    # feature names returned below stay aligned with the vocabulary the
    # SVM was trained on
    t_data = vectorizer.transform([' '.join(line) for line in test_data])
    parameters = {
        'gamma': 'auto',
        'C': 1,
        'kernel': 'linear',
        'degree': 0,
        'coef0': 0
    }
    x_train = data.toarray()[:len(class_0 + class_1)]
    y_train = [0 for _ in range(len(class_0))] \
        + [1 for _ in range(len(class_1))]
    pre = strategy.train_svm(parameters, x_train, y_train)
    #######################################################
    # print(pre.predict(data.toarray()[len(class_0 + class_1):]))
    # total = pre.predict(data.toarray()[len(class_0 + class_1):])
    # one = [_ for _ in total if _ == 1]
    # print(len(one) / len(total))
    #######################################################
    # final_predict = pre.predict(data.toarray()[len(class_0 + class_1):])
    # final_predict = pre.predict(t_data.toarray()[:])
    list_name = vectorizer.get_feature_names()
    return pre, list_name, test_data, class_1, class_0
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()

    ############################# define parameter #############################
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0.0
    }
    # gamma : float, optional (default='auto')  2^-15 ~ 2^3
    #     Kernel coefficient for 'rbf', 'poly' and 'sigmoid'.
    #     If gamma is 'auto' then 1/n_features will be used instead.
    # C : float, optional (default=1.0)  2^-5 ~ 2^15
    #     Penalty parameter C of the error term.
    # kernel : string, optional (default='rbf')
    #     Specifies the kernel type to be used in the algorithm.
    #     It must be one of 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed' or
    #     a callable.
    #     If none is given, 'rbf' will be used. If a callable is given it is
    #     used to pre-compute the kernel matrix from data matrices; that matrix
    #     should be an array of shape ``(n_samples, n_samples)``.
    # degree : int, optional (default=3)
    #     Degree of the polynomial kernel function ('poly').
    #     Ignored by all other kernels.
    # coef0 : float, optional (default=0.0)
    #     Independent term in kernel function.
    #     It is only significant in 'poly' and 'sigmoid'.

    ############################### train data #################################
    debug_matrix('class0', strategy_instance.class0)
    debug_matrix('class1', strategy_instance.class1)
    # get vocabulary
    vocabulary = get_vocabulary(strategy_instance)
    debug('vocabulary =\n', vocabulary)
    # get y_train
    y_train = get_y_train(strategy_instance)
    debug('y_train =\n', y_train)
    # get x_train
    x_train = get_x_train(strategy_instance, vocabulary)
    debug('x_train =\n', x_train)
    # training
    clf = strategy_instance.train_svm(parameters, x_train, y_train)

    ############################### modify file ################################
    # generate weight_table indexed by vocabulary
    # weight_table = clf.coef_.tolist()[0]
    # class0_vocabulary = []
    # for i in range(2):
    #     class0_word = vocabulary[weight_table.index(sorted(weight_table)[i])]
    #     class0_vocabulary.append(class0_word)
    # class1_vocabulary = []
    # for i in range(2):
    #     class1_word = vocabulary[weight_table.index(sorted(weight_table)[-i - 1])]
    #     class1_vocabulary.append(class1_word)
    # # debug(vocabulary)
    # # debug(weight_table)
    # debug(class0_vocabulary)
    # debug(class1_vocabulary)

    # # read file
    # with open('./test_data.reduced', 'r') as test_data_file:
    #     test_data_matrix = [line.strip().split(' ') for line in test_data_file]
    # # debug(test_data_matrix)

    # # generate weight by index
    # # exchange_data_matrix = [
    # #     sample_exchange_data = [
    # #         word_exchange_data = [
    # #             [weight, index, word_test_data]]]]
    # # Note: word_exchange_data is sorted by weight
    # exchange_data_matrix = []
    # for sample_test_data in test_data_matrix:
    #     sample_exchange_data = []
    #     for index in range(len(sample_test_data)):
    #         word_test_data = sample_test_data[index]
    #         try:
    #             weight = weight_table[vocabulary.index(word_test_data)]
    #         except ValueError:
    #             weight = 0
    #         word_exchange_data = [weight, index, word_test_data]
    #         sample_exchange_data.append(word_exchange_data)
    #     sample_exchange_data = sorted(sample_exchange_data, key=lambda l: l[0])
    #     exchange_data_matrix.append(sample_exchange_data)
    # # debug(exchange_data_matrix)

    # # generate the index of class1 feature words
    # class1_feature_word_matrix = []
    # for vector_exchange in exchange_data_matrix:
    #     vector_feature = []
    #     for word_exchange in vector_exchange[0:2]:
    #         # vector_feature.append(word_exchange[1])
    #         pass
    #     class1_feature_word_matrix.append(vector_feature)
    # # debug(class1_feature_word_matrix)

    # # write modified data
    # with open('./modified_data.txt', 'w') as modified_data_file:
    #     for i in range(len(test_data_matrix)):
    #         modified_data_list = test_data_matrix[i][:]
    #         for j in range(len(class1_feature_word_matrix[i])):
    #             index = class1_feature_word_matrix[i][j]
    #             modified_data_list[j] = class0_vocabulary[i]
    #         modified_data_str = ' '.join(modified_data_list)
    #         # debug(modified_data_str)
    #         modified_data_file.write(modified_data_str + '\n')

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    # assert strategy_instance.check_data(test_data, modified_data)
    print_test(clf, vocabulary)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    x_train = strategy_instance.class0 + strategy_instance.class1

    import numpy as np
    y = np.zeros((540, 1), dtype=np.int)
    y[360:] = 1
    y = y.ravel()  # Convert a multidimensional array to a one-dimensional array
    # print(y)
    # print('end of printing y')

    def createVocabList(dataSet):
        vocabSet = set([])
        for document in dataSet:
            vocabSet = vocabSet | set(document)
        return list(vocabSet)

    def setOfWords2Vec(vocabSet, inputSet):
        returnVec = [0] * len(vocabSet)
        for word in inputSet:
            if word in vocabSet:
                returnVec[vocabSet.index(word)] = 1
        return returnVec

    data = createVocabList(x_train)
    trainAll = []
    for postinDoc in x_train:
        trainAll.append(setOfWords2Vec(data, postinDoc))
    trainAll = np.array(trainAll)

    with open('test_data.txt', 'r') as test1:
        test1 = [line.strip().split(' ') for line in test1]
    testAll = []
    for postinDoc in test1:
        testAll.append(setOfWords2Vec(data, postinDoc))

    parameters = {}
    parameters['C'] = 0.05
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 1
    parameters['coef0'] = 1
    clf = strategy_instance.train_svm(parameters, trainAll, y)
    # -----------------
    # print('end of setting parameters')
    # collect the 200 most negative (class-0-leaning) feature weights
    w = clf.coef_
    index = np.where(w[0] < 0)[0]
    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
    dic_w = sorted(dic_w.items(), key=lambda d: d[1])[0:200]
    index = [dic_w[i][0] for i in range(len(dic_w))]
    add_word = []
    for i in index:
        add_word.append(data[i])
    n = 0
    for i in range(len(test1)):
        n = 0
        for w in add_word:
            if n == 20:
                break
            if w not in test1[i]:
                test1[i].append(w)
            else:
                continue
            n = n + 1
    # print('before open')
    file = open('./modified_data.txt', 'w')
    for i in range(len(test1)):
        file.write(" ".join(test1[i]))
        file.write("\n")
    file.close()

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    # print('before assert')
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
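
# A sketch of the same binary bag-of-words features that
# createVocabList/setOfWords2Vec build by hand above: with binary=True,
# CountVectorizer clips all non-zero counts to 1, so only the column order
# differs (sorted vocabulary instead of arbitrary set order). Hypothetical
# helper, not part of the original solution.
def sketch_binary_bow(token_lists):
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer(binary=True, token_pattern=r'\S+')
    X = cv.fit_transform(' '.join(doc) for doc in token_lists)
    return X, cv  # sparse 0/1 matrix plus the fitted vectorizer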
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    # read into a separate variable so the original path in `test_data`
    # stays available for check_data() at the end
    with open(test_data, 'r') as file:
        data = [line.strip().split(' ') for line in file]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}

    # use Naive Bayes to estimate the probabilities P(word|class0) and P(word|class1)
    class0 = strategy_instance.class0
    class1 = strategy_instance.class1
    p_class0 = len(class0) / (len(class0) + len(class1))
    p_class1 = len(class1) / (len(class0) + len(class1))
    class0_dic = {}
    class1_dic = {}
    for i in range(len(class0)):
        for j in range(len(class0[i])):
            if class0[i][j] not in class0_dic:
                class0_dic[class0[i][j]] = 1
            else:
                class0_dic[class0[i][j]] += 1
    for m in range(len(class1)):
        for n in range(len(class1[m])):
            if class1[m][n] not in class1_dic:
                class1_dic[class1[m][n]] = 1
            else:
                class1_dic[class1[m][n]] += 1
    class0_key_list = class0_dic.keys()
    class1_key_list = class1_dic.keys()
    class0_value_list = class0_dic.values()
    class1_value_list = class1_dic.values()
    # size of the combined vocabulary, for Laplace smoothing
    j_number = len(class1_key_list)
    for k in class0_key_list:
        if k not in class1_key_list:
            j_number += 1
    class0_probability = {}
    class1_probability = {}
    smooth = 1
    for g in class0_key_list:
        class0_probability[g] = (class0_dic[g] + smooth) / (sum(class0_value_list) + j_number)
    for h in class1_key_list:
        class1_probability[h] = (class1_dic[h] + smooth) / (sum(class1_value_list) + j_number)

    # Remove the interference: a word that has a high probability in both
    # class0 and class1 is not a useful marker for either class.
    characteristic_class0 = {}
    characteristic_class1 = {}
    for value in class0_key_list:
        if value in class1_key_list:
            characteristic_class0[value] = class0_probability[value] - class1_probability[value]
        else:
            characteristic_class0[value] = class0_probability[value]
    char_class0 = sorted(characteristic_class0.items(), key=lambda d: d[1], reverse=True)
    char_class0 = returntodic(char_class0)
    insert_list = list(char_class0.keys())
    for value in class1_key_list:
        if value in class0_key_list:
            characteristic_class1[value] = class1_probability[value] - class0_probability[value]
        else:
            characteristic_class1[value] = class1_probability[value]
    char_class1 = sorted(characteristic_class1.items(), key=lambda d: d[1], reverse=True)
    char_class1 = returntodic(char_class1)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    for x in range(len(data)):
        indx = 0
        fix_list = []
        final_insert_list = []
        for value in insert_list:
            if value not in data[x]:
                final_insert_list.append(value)
        for values in data[x]:
            if values in char_class1 and values not in final_insert_list:
                fix_list.append(values)
        a = set(fix_list)
        fix_list = [b for b in a]
        if len(fix_list) < 10:
            fix_dic = {}
            for value in fix_list:
                fix_dic[value] = char_class1[value]
            fix_dic = sorted(fix_dic.items(), key=lambda d: d[1], reverse=True)
            fix_dic = returntodic(fix_dic)
            final_list = list(fix_dic.keys())
            count = 0
            for y in range(len(final_list)):
                for z in range(len(data[x])):
                    if data[x][z] == final_list[y]:
                        index = data[x].index(final_list[y])
                        data[x][index] = final_insert_list[indx]
                        indx += 1
                        count += 1
            rest = 20 - count * 2
            for g in range(rest):
                data[x].append(final_insert_list[indx])
                indx += 1
        if len(fix_list) >= 10:
            fix_dic = {}
            for value in fix_list:
                fix_dic[value] = char_class1[value]
            fix_dic = sorted(fix_dic.items(), key=lambda d: d[1], reverse=True)
            fix_dic = returntodic(fix_dic)
            final_list = list(fix_dic.keys())
            count = 0
            # print("The line is: " + str(x))
            # print(final_list)
            # print(data[x])
            for y in range(len(final_list)):
                if count != 10:
                    for z in range(len(data[x])):
                        if data[x][z] == final_list[y]:
                            index = data[x].index(final_list[y])
                            data[x][index] = final_insert_list[indx]
                            indx += 1
                            count += 1
                else:
                    break
    with open("modified_data.txt", "w") as file:
        for i in range(len(data)):
            for j in range(len(data[i])):
                file.write(str(data[i][j]) + " ")
            file.write("\n")

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance ## NOTE: You are required to return the instance of this class.
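
# Toy illustration (made-up counts) of the add-one (Laplace) estimate used
# above: P(word|class) = (count + smooth) / (total_tokens + vocab_size).
# A word seen 3 times out of 100 class-0 tokens with a 50-word combined
# vocabulary (j_number above) gets (3 + 1) / (100 + 50) ~= 0.0267, and an
# unseen word gets 1/150 rather than a hard zero.
def sketch_laplace_estimate(count, total_tokens, vocab_size, smooth=1):
    return (count + smooth) / (total_tokens + vocab_size)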
def fool_classifier(test_data): ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}

    # my code:
    # train the model using class_0 and class_1
    class_0 = [' '.join(i) for i in strategy_instance.class0]
    class_1 = [' '.join(i) for i in strategy_instance.class1]
    class_all = class_0 + class_1
    vectorizer = CountVectorizer()
    count = vectorizer.fit_transform(class_all)
    l = [0] * 360 + [1] * 180
    X = count.toarray()
    y = np.array(l)

    # find the best parameters
    param_grid = [{
        'kernel': ['rbf'],
        'gamma': [0.01],
        'C': [1, 10]
    }, {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000]
    }]
    # grid_search = GridSearchCV(svm.SVC(), param_grid, cv=5)
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
    # grid_search.fit(X_train, y_train)
    parameters['C'] = 10
    parameters['kernel'] = 'rbf'
    parameters['gamma'] = 0.01
    parameters['degree'] = 3
    parameters['coef0'] = 1

    # train the model
    clf = strategy_instance.train_svm(parameters, X, y)
    '''
    # predict the rate for the original test data
    with open(test_data, 'r') as test:
        t = [line.strip().split(' ') for line in test]
    features = vectorizer.get_feature_names()
    d = dict(zip(features, [0] * len(features)))
    predict = []
    for sample in t:
        a = []
        for i in sample:
            if i in d:
                d[i] += 1
        for key in d.keys():
            a.append(d[key])
        predict.append(a)
        for key in d.keys():
            d[key] = 0
    '''

    # change the dataset
    with open(test_data, 'r') as test:
        t = [line.strip().split(' ') for line in test]
    t1 = []
    x = 0
    # hard-coded list of class-1 marker words, factored into one constant
    # because the original code repeated the same literal twice below
    marker_words = [
        'david', 'yeltsin', 'replacement', 'mubarak', 'phone', 'powell',
        'wimbledon', 'brazilian', 'title', 'm.', 'bulgaria', 'confront',
        'hosni', 'inspire', 'lap', 'denial', 'benedict', 'maryland', 'giant',
        'midfielder', 'uribe', 'maliki', 'insult', 'cypriot', 'gyanendra',
        'bhutto', 'autonomy', 'apparently', 'wild', 'jan', 'tunisia',
        'social', 'magazine', 'xvi', 'sick', 'pence', 'migratory',
        'multiparty', 'unite', 'postal', 'upcom', 'universal', 'actress',
        'highly', 'colin', 'penalty', 'aol', 'huber', 'dissolve', 'augusto',
        'lauck', 'internal', '22nd', 'christopher', 'visegrad', 'takeda',
        'bratislava', 'fallujah', 'rush', 'whom', 'hat', 'topple', 'online',
        'suspicious', 'irreparable', 'funeral', 'mbeki', 'karimov', 'heat',
        'bar', 'walid', 'chris', 'these', 'curb', 'vs.', 'institute',
        'unnamed', 'forum', 'reluctant', 'strength', 'tribute', 'recovery',
        'puk', 'zvornik', 'dutroux', 'forma', 'koizumi', 'guy', 'view',
        'plo', 'eighth', 'indict', 'merkel', 'nervous', 'kamal', 'zionism',
        'maleeva', '350', 'sponsor', 'missionary', 'industrialize',
        'reconstruction', 'cheney', 'lynch', 'grimsby', 'morales',
        'constitute', 'congratulate', 'initiate', 'hansa', 'cancel',
        'awareness', 'performance', 'sing', 'savic', 'cape', 'veto',
        'choose', 'abdullahi', 'nathalie', 'jose', 'contribution',
        'wildlife', 'stake', 'francisco', 'gunfight', 'tibetan', 'levee',
        'personal', 'transfer', 'kevin', 'sean', 'belgian', 'goss',
        'wiesner', 'slovenia', 'hardline', 'indigenous', 'compound',
        'contractor', 'trick', 'medal', 'stick', 'championship', 'voice',
        'abdullah', 'rostock', 'suleiman', '1975', 'really', 'angela',
'nouri', 'j.', 'society', 'offset', 'reunification', 'levy', 'nationwide', 'exceed', 'emirate', 'tribe', 'conte', 'ill', 'chancellor', 'bin', 'bhutan', 'm', 'worship', 'marseille', 'pauli', 'pinochet', 'flush', 'aggressive', 'atoll', 'delegation', 'judith', 'thani', 'quickly', 'blessing', 'meadows', 'kyrgyzstan', 'enrich', 'avenge', 'inc.', 'croatia', 'moratorium', 'inability', 'surayud', 'plea', 'roe', 'armenia', 'philippine', 'wang', 'vogts', 'lebed', 'outgoing', 'hiddink', 'abdel', 'hampshire', 'demolition', 'sundown', 'gnassingbe', 'google', 'moya', 'ball', 'emanuele', 'sprint', '1971', 'clayton', '275', 'leash', 'aoki', 'chirac', 'intervene', 'togolese', 'olsza', 'lower', 'carolina', 'coulthard', 'hearing', 'optimistic', 'chaudhry', 'andre', 'safin', 'jerry', 'wheat', 'rallying', 'akram', 'matt', 'solve', '3.8', 'subsidize', 'dash', 'nationalize', 'motive', 'ira', '280', 'willingness', 'alex', 'estrada', 'humenne', 'left', 'depress', 'hymn', 'mob', 'candle', '7/8', 'metal', 'plummet', 'wedding', 'rebate', 'okuda', 'guitar', 'stab', 'bristol', 'ruutel', 'w.', 'europeans', 'dealing', 'abnormal', 'golf', 'iftikhar', 'effigy', 'abstention', 'prevention', 'compromise', 'sphere', 'calderon', 'fitzgerald', 'ting', 'barroso', 'handful', 'arlington', 'restoration', 'prove', 'prachanda', 'departure', 'oz', 'montpelli', 'relegate', 'confinement', 'magdalena', 'conscience', 'wilma', 'zapatero', 'hosseini', 'clarence', 'penrose', 'rumor', 'quarterfinal', 'revolt', 'engineering', 'felipe', 'ransom', 'seventy', '260', 'hyderabad', 'chang', 'erdogan', 'guus', 'principe', 'gear', 'milf', 'khaled', 'erosion', 'partition', 'eta', 'atp', 'bossa', 'at&t', 'anabel', 'pipe', 'advertising', 'ski', 'mediterranean', 'ernesto', 'botham', 'hitter', 'hole', 'sampras', 'roger', 'vacancy', 'manhattan', 'salih', 'rid', 'pure', 'ieng', '10-year', 'kohl', 'chesnot', 'shetty', 'fifty', 'porter', '36th', 'studio', 'patrick', 'buffett', 'justin', 'bellerive', 'cemetery', 'slovan', 'r.', 'revitalize', 'joseph', 'significant', 'ideology', 'caldwell', 'acquire', 'lodge', 'coetzer', 'rba', 'nursultan', 'lg', 'remarkable', 'gonzales', 'jockey', 'pretoria', 'barry', 'dissent', 'modest', 'ioc', 'arc', 'scholar', "'d", 'blind', 'lahoud', 'sieg', 'canonica', 'archipelago', 'kabir', 'hakimi', 'wwf', 'segment', 'falter', 'prodi', 'ceo', 'haarlem', 'panic', 'rafik', 'cola', 'rauf', 'amir', 'continuous', 'fortuna', 'staffer', 'scandinavian', 'kubis', 'retaliation', 'bankruptcy', 'consistent', 'jesus', 'mashhadani', 'federer', 'proper', 'paedophile', 'hyun', 'shed', 'repeatedly', 'osama', 'anguilla', 'religion', 'kilometre', 'debut', 'harsh', '756', 'guiana', 'abubakar', 'maynard', 'evacuation', 'tactic', 'falkland', 'ad', 'lethal', '430', 'barter', 'gratitude', 'centre', 'mercosur', 'hispanic', 'sympathizer', 'prolific', 'perth', 'bandar', 'retail', 'axum', 'injection', 'h5n2', 'kuchma', 'annan', 'boris', 'deterioration', 'myth', '83', 'cannes', 'f.w.', 'salesman', 'da', 'sanader', 'volkova', 'radulescu', 'stir', '15-member', 'benzene', 'talha', 'arnold', 'medalist', 'attorney', 'danny', '...', 'cotton', 'riel', '2020', 'stein', 'mccormack', 'incomplete', 'secondhand', 'tool', 'turki', 'youth', 'reassure', 'lien', 'calm', 'reformist', 'impeachment', 'tablet', 'elizabeth', 'hamburg', 'korda', 'improvised', 'addis', 'manuel', 'zelaya', 'postponement', 'jorge', '165', 'popov', 'citgo', 'powder', 'feyenoord', 'lage', 'web', 'effect', 'ravine', 'beit', 'justify', 'apologise', 'cocu', 'broad', 'sole', 'professor', 'nicholas', 'georges', 
'nagin', 'mccain', '1956', 'haga', 'faith', '06-jan', 'clan', 'rebellion', 'cameraman', 'jazz', 'lukoil', 'junior', 'motivated', 'erik', 'punishment', 'taylor', 'pearson', 'nepali', 'forgery', 'ackerman', 'jamie', 'bernd', 'argentina', '5,000', 'icy', 'male', 'terrorize', 'rough', '600,000', 'frederic', '1978', 'aleksandra', 'punjab', 'zardari', 'offence', 'mini', 'twitter', 'labour', 'mottaki', '286', 'nwr', 'yoshikawa', 'defender', 'edwin', 'wolfowitz', 'haringa', 'nazarbayev', 'mainstream', 'richard', 'medina', 'retreat', 'tower', 'lament', 'baeron', 'lovato', 'hitch', 'miller', 'milan', 'hans', 'gm', 'grave', 'portsmouth', 'sven', 'stagger', 'supplier', 'shilpa', 'sarkozy', 'multiple', 'gunship', 'reshuffle', 'kucera', '110', 'protectorate', 'forward', 'rescuer', 'stormy', 'aug', 'icac', 'bargain', 'slalom', 'regulation', 'baghdadi', 'expos', 'fade', 'advisory', 'ridge', 'henri', '18-man', '6.4', 'koninklijke', 'agassi', 'garrigue', 'romano', 'astros', 'karami', '199', 'hossein', 'sussex', 'indebted', 'actor', 'clearly', 'livestock', 'joyful', 'ass', 'insert', 'apology', 'ghad', 'jennifer', 'henry', 'veiled', 'transcript', 'manila', 'tranmere', 'renegotiate', 'assumption', 'java', 'shallah', 'milton', 'semifinalist', 'faisal', 'lucrative', 'restriction', 'iva', 'mclaren', 'upswing', 'leopold', 'unocal', 'ion', 'riddled', 'willem', 'confidence', 'accidentally', 'bakool', 'gough', 'rabinovich', 'prosecute', 'sisco', 'scoreboard', 'mary', 'narrowly', 'exhume', 'confess', 'concede', 'burst', 'chisinau', '195', 'laugh', 'toyota', 'metz', 'karsten', 'native', 'manufacture', 'reverse', 'eager', 'dining', 'juba', 'credible', 'backer', 'soft', 'veteran', 'directly', 'siphon', 'eliminate', 'donation', 'burns', 'greeting', 'leslie', 'italians', 'amy', 'askar', 'oliver', 'confer', 'bluefin', 'saeed', 'constructorul', 'interpreter', 'jim', 'sofia', 'infant', 'basuki', 'bernard', 'favorable', 'exact', 'gwyneth', 'trainer', 'difficult', '54', 'sultan', 'caraballo', 'hefty', 'steve', 'phnom', 'kdp', 'verizon', 'facilitate', 'adams', 'telecom', 'boardman', 'cologne', 'luiz', 'breda', 'martic', 'application', 'rose', 'puerto', 'subdue', 'disappointed', 'competent', 'klerk', 'bode', 'garang', 'jewelry', 'lakes', 'dundee', 'laden', 'character', 'genoa', 'relic', 'sorry', 've', 'vanuatu', 'sack', 'batticaloa', 'empty', 'gray', 'wielgus', 'oncin', 'ignacio', 'johnstone', 'gilford', 'commentary', 'lak', 'martin', 'opec', 'karlsruhe', 'clearance', 'joaquin', '2.6', 'kai', '86', 'ramadan', 'deepen', 'majoli', 'holocaust', 'ceremonial', 'u.n', 'film', 'lumpur', 'kuala', 'beg', 'snap', 'harass', 'celica', 'spla', 'saran', 'architect', 'composer', 'huot', 'boss', '7.4', 'partisan', 'countryside', 'solution', 'emile', 'tauziat', 'compensation', 'define', 'brand', 'representation', 'barricade', 'groenefeld', 'cattle', 'secede', 'chair', 'frenchman', 'melinda', 'fragmented', 'sittard', 'antibiotic', 'exhibition', 'nv', 'balkans', 'artillery', 'alvaro', 'skier', 'takemura', 'pork', 'monaco', 'graham', 'chemlon', 'briefly', 'breakaway', 'nour', 'lady', 'batchelor', 'mock', 'dazzle', 'perform', 'principle', 'baltimore', 'partially', 'automatic', 'disburse', 'kocinski', 'patriot', 'perez', 'occasion', 'remnant', 'conspire', 'counsel', 'medvedev', '6-', 'skid', 'sergeant', 'roque', 'caution', 'jumblatt', 'almere', 'scope', 'lula', 'stride', 'ringleader', 'cosmonaut', 'stumble', 'nist', 'invade', 'assignment', 'kumaratunga', 'smoke', 'exploration', 'apostolic', 'gloucestershire', 'ivo', 'cement', 'razuri', 'jabaliya', 
'teen', 'meshaal', 'guide', 'framework', 'jans', 'afterward', 'nec', 'minnesota', 'marcus', 'assistant', 'mind', 'hemisphere', 'seiki', 'kingdom', 'stretch', 'super', 'shadab', 'uncensored', 'sorensen', 'importance', 'lancashire', 'arroyo', 'rodriguez', 'taking', 'wood', 'roh', 'delp', 'djindjic', 'preval', 'vacuum', 'wrongdoing', 'benazir', 'mofaz', 'vierklau', 'error', 'buyer', 'grower', 'gary', 'petr', 'gronholm', 'estonian', 'else', 'ghanaian', 'axe', 'arson', 'victor', 'precaution', 'rene', 'corporate', 'jericho', 'eld', 'basque', 'crush', 'tsang', 'hillary', 'hoax', '84', 'chela', 'witschge', 'coffin', 'tim', 'yesterday', 'd.', 'blogger', 'wipe', 'dechy', 'panis', 'thermal', 'checkpoint', 'nijmegen', 'stephane', 'bail', 'bride', 'decree', 'tome', 'prix', 'hudson', 'momentum', 'forget', 'idema', 'northeastern', 'costello', 'quite', 'exclusive', 'teutenberg', 'submission', 'constantinople', 'hipc', 'silva', 'aggravate', 'nonetheless', 'sino', 'ethnicity', 'thabo', 'jalalabad', 'smuggler', 'marketing', 'memorandum', 'ariane', 'deplete', 'floor', 'rivalry', 'claire', 'eduard', 'mortar', 'khan', 'eide', 'robot', 'caller', 'label', 'solitary', 'implementation', 'barno', 'impeach', 'solo', 'kaluwitharana', 'wrack', 'soul', 'sharapova', 'crippled', 'evelyn', 'interfax', 'bakara', 'angry', 'smith', 'cheat', 'caucasus', 'yusuf', 'congressional', 'egeland', 'blair', '*', 'disabled', 'brad', 'vitesse', 'debbie', 'marriage', 'vincent', 'courthouse', 'kph', 'sport', 'airway', 'operational', 'barton', 'outsider', 'jeff', 'hero', 'royalty', 'barbara', 'ruegen', 'harald', 'zapatista', 'n.', 'stanford', 'portugal', 'nac', 'litigation', 'makinen', 'shake', 'tennessee', 'villepin', 'durham', '26-year', 'mercantile', 'bennett', 'luis', 'hog', 'extraordinary', 'sight', 'attach', 'ligi', 'azahari', 'abortion', 'clock', 'actual', 'fein', 'respective', 'adjust', 'turin', 'bleed', 'bob', 'greg', 'finland', 'grab', 'injunction', 'micheletti', 'karol', 'bashir', 'commend', 'copy', '21st', 'boraine', 'restrictive', 'except', 'cyclone', 'automobile', 'tire', 'porto', 'culminate', 'hanoun', 'leonard', 'ingrid', 'richards', 'organisation', 'erase', 'trans', 'ghazni', 'mukasey', 'element', 'wasim', 'vancouver', '19-year', 'earmark', 'oval', '176', 'anderson', 'amend', 'malbrunot', 'treasure', 'luge', 'shaul', 'kiss', 'empower', 'ravi', 'harare', 'slovakia', 'vouch', 'behavior', 'ugandan', 'tilburg', '8th', 'frazier', 'archbishop', 'dmitry', 'samper', 'mistake', 'itamar', 'junichiro', 'precedent', '83rd', '17th', 'optimism', 'dell', 'bot', 'vision', 'olivier', 'rag', 'radar', 'striker', 'h.i.v.', 'wikileaks', 'franz', 'greet', 'baluch', 'ratner', 'smile', 'ababa', '210', 'container', 'veldman', 'harvesting', 'tendulkar', 'skating', 'flore', 'hometown', 'serie', 'penh', 'dick', 'dye', 'incumbent', 'consultation', 'science', 'pilot', 'oppressed', 'kanyarukiga', 'object', 'plantation', 'liaon', 'eyewitness', 'karim', 'anwar', 'palmans', 'yankees', 'baker', 'annex', '111', 'fate', 'endanger', 'suitable', 'mehrtens', 'famous', 'dominguez', 'spin', 'jabalya', 'carol', 'howes', 'zimbabwean', 'drc', 'moo', 'oic', 'impasse', "o'brien", 'moussaoui', 'rodrigo', 'bedside', '5.1', 'jemaah', 'ashore', 'accuracy', 'gloria', 'guerrero', 'preparation', 'belarus', 'publishing', '48', 'gujral', 'accra', 'marie', 'breed', 'najaf', 'craig', 'courage', 'brazilians', 'hunter', '1972', 'netanyahu', 'ortiz', 'pickens', 'cage', 'enqvist', 'larijani', 'zacarias', 'poisoning', 'russell', 'firewood', 'credibility', 'midway', 'provider', '87', 
'dark', 'apartheid', 'lawlessness', 'armenian', 'bourlet', 'adamkus', 'mutola', 'tiananmen', 'incur', 'inter', 'initial', 'publicize', '103', 'fellow', 'achievement', 'tarango', 'cable', 'koert', 'starve', 'zambia', 'administrator', 'erkinbayev', 'lithuanian', '941', 'mike', 'reaffirm', 'sinn', 'motorcycle', 'aig', 'northeast', 'spiritual', 'cheer', 'nova', 'casey', 'embark', 'kharrazi', 'akerson', 'fiery', 'nangarhar', 'irregularity', 'silver', 'inacio', 'pire' ]: if i in line: line = [j for j in line if j != i] c += 1 if c == 20: t1.append(line) flag = 1 break if flag: continue for i in original: i = i[0] if i not in [ 'david', 'yeltsin', 'replacement', 'mubarak', 'phone', 'powell', 'wimbledon', 'brazilian', 'title', 'm.', 'bulgaria', 'confront', 'hosni', 'inspire', 'lap', 'denial', 'benedict', 'maryland', 'giant', 'midfielder', 'uribe', 'maliki', 'insult', 'cypriot', 'gyanendra', 'bhutto', 'autonomy', 'apparently', 'wild', 'jan', 'tunisia', 'social', 'magazine', 'xvi', 'sick', 'pence', 'migratory', 'multiparty', 'unite', 'postal', 'upcom', 'universal', 'actress', 'highly', 'colin', 'penalty', 'aol', 'huber', 'dissolve', 'augusto', 'lauck', 'internal', '22nd', 'christopher', 'visegrad', 'takeda', 'bratislava', 'fallujah', 'rush', 'whom', 'hat', 'topple', 'online', 'suspicious', 'irreparable', 'funeral', 'mbeki', 'karimov', 'heat', 'bar', 'walid', 'chris', 'these', 'curb', 'vs.', 'institute', 'unnamed', 'forum', 'reluctant', 'strength', 'tribute', 'recovery', 'puk', 'zvornik', 'dutroux', 'forma', 'koizumi', 'guy', 'view', 'plo', 'eighth', 'indict', 'merkel', 'nervous', 'kamal', 'zionism', 'maleeva', '350', 'sponsor', 'missionary', 'industrialize', 'reconstruction', 'cheney', 'lynch', 'grimsby', 'morales', 'constitute', 'congratulate', 'initiate', 'hansa', 'cancel', 'awareness', 'performance', 'sing', 'savic', 'cape', 'veto', 'choose', 'abdullahi', 'nathalie', 'jose', 'contribution', 'wildlife', 'stake', 'francisco', 'gunfight', 'tibetan', 'levee', 'personal', 'transfer', 'kevin', 'sean', 'belgian', 'goss', 'wiesner', 'slovenia', 'hardline', 'indigenous', 'compound', 'contractor', 'trick', 'medal', 'stick', 'championship', 'voice', 'abdullah', 'rostock', 'suleiman', '1975', 'really', 'angela', 'nouri', 'j.', 'society', 'offset', 'reunification', 'levy', 'nationwide', 'exceed', 'emirate', 'tribe', 'conte', 'ill', 'chancellor', 'bin', 'bhutan', 'm', 'worship', 'marseille', 'pauli', 'pinochet', 'flush', 'aggressive', 'atoll', 'delegation', 'judith', 'thani', 'quickly', 'blessing', 'meadows', 'kyrgyzstan', 'enrich', 'avenge', 'inc.', 'croatia', 'moratorium', 'inability', 'surayud', 'plea', 'roe', 'armenia', 'philippine', 'wang', 'vogts', 'lebed', 'outgoing', 'hiddink', 'abdel', 'hampshire', 'demolition', 'sundown', 'gnassingbe', 'google', 'moya', 'ball', 'emanuele', 'sprint', '1971', 'clayton', '275', 'leash', 'aoki', 'chirac', 'intervene', 'togolese', 'olsza', 'lower', 'carolina', 'coulthard', 'hearing', 'optimistic', 'chaudhry', 'andre', 'safin', 'jerry', 'wheat', 'rallying', 'akram', 'matt', 'solve', '3.8', 'subsidize', 'dash', 'nationalize', 'motive', 'ira', '280', 'willingness', 'alex', 'estrada', 'humenne', 'left', 'depress', 'hymn', 'mob', 'candle', '7/8', 'metal', 'plummet', 'wedding', 'rebate', 'okuda', 'guitar', 'stab', 'bristol', 'ruutel', 'w.', 'europeans', 'dealing', 'abnormal', 'golf', 'iftikhar', 'effigy', 'abstention', 'prevention', 'compromise', 'sphere', 'calderon', 'fitzgerald', 'ting', 'barroso', 'handful', 'arlington', 'restoration', 'prove', 'prachanda', 'departure', 
'oz', 'montpelli', 'relegate', 'confinement', 'magdalena', 'conscience', 'wilma', 'zapatero', 'hosseini', 'clarence', 'penrose', 'rumor', 'quarterfinal', 'revolt', 'engineering', 'felipe', 'ransom', 'seventy', '260', 'hyderabad', 'chang', 'erdogan', 'guus', 'principe', 'gear', 'milf', 'khaled', 'erosion', 'partition', 'eta', 'atp', 'bossa', 'at&t', 'anabel', 'pipe', 'advertising', 'ski', 'mediterranean', 'ernesto', 'botham', 'hitter', 'hole', 'sampras', 'roger', 'vacancy', 'manhattan', 'salih', 'rid', 'pure', 'ieng', '10-year', 'kohl', 'chesnot', 'shetty', 'fifty', 'porter', '36th', 'studio', 'patrick', 'buffett', 'justin', 'bellerive', 'cemetery', 'slovan', 'r.', 'revitalize', 'joseph', 'significant', 'ideology', 'caldwell', 'acquire', 'lodge', 'coetzer', 'rba', 'nursultan', 'lg', 'remarkable', 'gonzales', 'jockey', 'pretoria', 'barry', 'dissent', 'modest', 'ioc', 'arc', 'scholar', "'d", 'blind', 'lahoud', 'sieg', 'canonica', 'archipelago', 'kabir', 'hakimi', 'wwf', 'segment', 'falter', 'prodi', 'ceo', 'haarlem', 'panic', 'rafik', 'cola', 'rauf', 'amir', 'continuous', 'fortuna', 'staffer', 'scandinavian', 'kubis', 'retaliation', 'bankruptcy', 'consistent', 'jesus', 'mashhadani', 'federer', 'proper', 'paedophile', 'hyun', 'shed', 'repeatedly', 'osama', 'anguilla', 'religion', 'kilometre', 'debut', 'harsh', '756', 'guiana', 'abubakar', 'maynard', 'evacuation', 'tactic', 'falkland', 'ad', 'lethal', '430', 'barter', 'gratitude', 'centre', 'mercosur', 'hispanic', 'sympathizer', 'prolific', 'perth', 'bandar', 'retail', 'axum', 'injection', 'h5n2', 'kuchma', 'annan', 'boris', 'deterioration', 'myth', '83', 'cannes', 'f.w.', 'salesman', 'da', 'sanader', 'volkova', 'radulescu', 'stir', '15-member', 'benzene', 'talha', 'arnold', 'medalist', 'attorney', 'danny', '...', 'cotton', 'riel', '2020', 'stein', 'mccormack', 'incomplete', 'secondhand', 'tool', 'turki', 'youth', 'reassure', 'lien', 'calm', 'reformist', 'impeachment', 'tablet', 'elizabeth', 'hamburg', 'korda', 'improvised', 'addis', 'manuel', 'zelaya', 'postponement', 'jorge', '165', 'popov', 'citgo', 'powder', 'feyenoord', 'lage', 'web', 'effect', 'ravine', 'beit', 'justify', 'apologise', 'cocu', 'broad', 'sole', 'professor', 'nicholas', 'georges', 'nagin', 'mccain', '1956', 'haga', 'faith', '06-jan', 'clan', 'rebellion', 'cameraman', 'jazz', 'lukoil', 'junior', 'motivated', 'erik', 'punishment', 'taylor', 'pearson', 'nepali', 'forgery', 'ackerman', 'jamie', 'bernd', 'argentina', '5,000', 'icy', 'male', 'terrorize', 'rough', '600,000', 'frederic', '1978', 'aleksandra', 'punjab', 'zardari', 'offence', 'mini', 'twitter', 'labour', 'mottaki', '286', 'nwr', 'yoshikawa', 'defender', 'edwin', 'wolfowitz', 'haringa', 'nazarbayev', 'mainstream', 'richard', 'medina', 'retreat', 'tower', 'lament', 'baeron', 'lovato', 'hitch', 'miller', 'milan', 'hans', 'gm', 'grave', 'portsmouth', 'sven', 'stagger', 'supplier', 'shilpa', 'sarkozy', 'multiple', 'gunship', 'reshuffle', 'kucera', '110', 'protectorate', 'forward', 'rescuer', 'stormy', 'aug', 'icac', 'bargain', 'slalom', 'regulation', 'baghdadi', 'expos', 'fade', 'advisory', 'ridge', 'henri', '18-man', '6.4', 'koninklijke', 'agassi', 'garrigue', 'romano', 'astros', 'karami', '199', 'hossein', 'sussex', 'indebted', 'actor', 'clearly', 'livestock', 'joyful', 'ass', 'insert', 'apology', 'ghad', 'jennifer', 'henry', 'veiled', 'transcript', 'manila', 'tranmere', 'renegotiate', 'assumption', 'java', 'shallah', 'milton', 'semifinalist', 'faisal', 'lucrative', 'restriction', 'iva', 'mclaren', 'upswing', 'leopold', 
'unocal', 'ion', 'riddled', 'willem', 'confidence', 'accidentally', 'bakool', 'gough', 'rabinovich', 'prosecute', 'sisco', 'scoreboard', 'mary', 'narrowly', 'exhume', 'confess', 'concede', 'burst', 'chisinau', '195', 'laugh', 'toyota', 'metz', 'karsten', 'native', 'manufacture', 'reverse', 'eager', 'dining', 'juba', 'credible', 'backer', 'soft', 'veteran', 'directly', 'siphon', 'eliminate', 'donation', 'burns', 'greeting', 'leslie', 'italians', 'amy', 'askar', 'oliver', 'confer', 'bluefin', 'saeed', 'constructorul', 'interpreter', 'jim', 'sofia', 'infant', 'basuki', 'bernard', 'favorable', 'exact', 'gwyneth', 'trainer', 'difficult', '54', 'sultan', 'caraballo', 'hefty', 'steve', 'phnom', 'kdp', 'verizon', 'facilitate', 'adams', 'telecom', 'boardman', 'cologne', 'luiz', 'breda', 'martic', 'application', 'rose', 'puerto', 'subdue', 'disappointed', 'competent', 'klerk', 'bode', 'garang', 'jewelry', 'lakes', 'dundee', 'laden', 'character', 'genoa', 'relic', 'sorry', 've', 'vanuatu', 'sack', 'batticaloa', 'empty', 'gray', 'wielgus', 'oncin', 'ignacio', 'johnstone', 'gilford', 'commentary', 'lak', 'martin', 'opec', 'karlsruhe', 'clearance', 'joaquin', '2.6', 'kai', '86', 'ramadan', 'deepen', 'majoli', 'holocaust', 'ceremonial', 'u.n', 'film', 'lumpur', 'kuala', 'beg', 'snap', 'harass', 'celica', 'spla', 'saran', 'architect', 'composer', 'huot', 'boss', '7.4', 'partisan', 'countryside', 'solution', 'emile', 'tauziat', 'compensation', 'define', 'brand', 'representation', 'barricade', 'groenefeld', 'cattle', 'secede', 'chair', 'frenchman', 'melinda', 'fragmented', 'sittard', 'antibiotic', 'exhibition', 'nv', 'balkans', 'artillery', 'alvaro', 'skier', 'takemura', 'pork', 'monaco', 'graham', 'chemlon', 'briefly', 'breakaway', 'nour', 'lady', 'batchelor', 'mock', 'dazzle', 'perform', 'principle', 'baltimore', 'partially', 'automatic', 'disburse', 'kocinski', 'patriot', 'perez', 'occasion', 'remnant', 'conspire', 'counsel', 'medvedev', '6-', 'skid', 'sergeant', 'roque', 'caution', 'jumblatt', 'almere', 'scope', 'lula', 'stride', 'ringleader', 'cosmonaut', 'stumble', 'nist', 'invade', 'assignment', 'kumaratunga', 'smoke', 'exploration', 'apostolic', 'gloucestershire', 'ivo', 'cement', 'razuri', 'jabaliya', 'teen', 'meshaal', 'guide', 'framework', 'jans', 'afterward', 'nec', 'minnesota', 'marcus', 'assistant', 'mind', 'hemisphere', 'seiki', 'kingdom', 'stretch', 'super', 'shadab', 'uncensored', 'sorensen', 'importance', 'lancashire', 'arroyo', 'rodriguez', 'taking', 'wood', 'roh', 'delp', 'djindjic', 'preval', 'vacuum', 'wrongdoing', 'benazir', 'mofaz', 'vierklau', 'error', 'buyer', 'grower', 'gary', 'petr', 'gronholm', 'estonian', 'else', 'ghanaian', 'axe', 'arson', 'victor', 'precaution', 'rene', 'corporate', 'jericho', 'eld', 'basque', 'crush', 'tsang', 'hillary', 'hoax', '84', 'chela', 'witschge', 'coffin', 'tim', 'yesterday', 'd.', 'blogger', 'wipe', 'dechy', 'panis', 'thermal', 'checkpoint', 'nijmegen', 'stephane', 'bail', 'bride', 'decree', 'tome', 'prix', 'hudson', 'momentum', 'forget', 'idema', 'northeastern', 'costello', 'quite', 'exclusive', 'teutenberg', 'submission', 'constantinople', 'hipc', 'silva', 'aggravate', 'nonetheless', 'sino', 'ethnicity', 'thabo', 'jalalabad', 'smuggler', 'marketing', 'memorandum', 'ariane', 'deplete', 'floor', 'rivalry', 'claire', 'eduard', 'mortar', 'khan', 'eide', 'robot', 'caller', 'label', 'solitary', 'implementation', 'barno', 'impeach', 'solo', 'kaluwitharana', 'wrack', 'soul', 'sharapova', 'crippled', 'evelyn', 'interfax', 'bakara', 'angry', 'smith', 
'cheat', 'caucasus', 'yusuf', 'congressional', 'egeland', 'blair', '*', 'disabled', 'brad', 'vitesse', 'debbie', 'marriage', 'vincent', 'courthouse', 'kph', 'sport', 'airway', 'operational', 'barton', 'outsider', 'jeff', 'hero', 'royalty', 'barbara', 'ruegen', 'harald', 'zapatista', 'n.', 'stanford', 'portugal', 'nac', 'litigation', 'makinen', 'shake', 'tennessee', 'villepin', 'durham', '26-year', 'mercantile', 'bennett', 'luis', 'hog', 'extraordinary', 'sight', 'attach', 'ligi', 'azahari', 'abortion', 'clock', 'actual', 'fein', 'respective', 'adjust', 'turin', 'bleed', 'bob', 'greg', 'finland', 'grab', 'injunction', 'micheletti', 'karol', 'bashir', 'commend', 'copy', '21st', 'boraine', 'restrictive', 'except', 'cyclone', 'automobile', 'tire', 'porto', 'culminate', 'hanoun', 'leonard', 'ingrid', 'richards', 'organisation', 'erase', 'trans', 'ghazni', 'mukasey', 'element', 'wasim', 'vancouver', '19-year', 'earmark', 'oval', '176', 'anderson', 'amend', 'malbrunot', 'treasure', 'luge', 'shaul', 'kiss', 'empower', 'ravi', 'harare', 'slovakia', 'vouch', 'behavior', 'ugandan', 'tilburg', '8th', 'frazier', 'archbishop', 'dmitry', 'samper', 'mistake', 'itamar', 'junichiro', 'precedent', '83rd', '17th', 'optimism', 'dell', 'bot', 'vision', 'olivier', 'rag', 'radar', 'striker', 'h.i.v.', 'wikileaks', 'franz', 'greet', 'baluch', 'ratner', 'smile', 'ababa', '210', 'container', 'veldman', 'harvesting', 'tendulkar', 'skating', 'flore', 'hometown', 'serie', 'penh', 'dick', 'dye', 'incumbent', 'consultation', 'science', 'pilot', 'oppressed', 'kanyarukiga', 'object', 'plantation', 'liaon', 'eyewitness', 'karim', 'anwar', 'palmans', 'yankees', 'baker', 'annex', '111', 'fate', 'endanger', 'suitable', 'mehrtens', 'famous', 'dominguez', 'spin', 'jabalya', 'carol', 'howes', 'zimbabwean', 'drc', 'moo', 'oic', 'impasse', "o'brien", 'moussaoui', 'rodrigo', 'bedside', '5.1', 'jemaah', 'ashore', 'accuracy', 'gloria', 'guerrero', 'preparation', 'belarus', 'publishing', '48', 'gujral', 'accra', 'marie', 'breed', 'najaf', 'craig', 'courage', 'brazilians', 'hunter', '1972', 'netanyahu', 'ortiz', 'pickens', 'cage', 'enqvist', 'larijani', 'zacarias', 'poisoning', 'russell', 'firewood', 'credibility', 'midway', 'provider', '87', 'dark', 'apartheid', 'lawlessness', 'armenian', 'bourlet', 'adamkus', 'mutola', 'tiananmen', 'incur', 'inter', 'initial', 'publicize', '103', 'fellow', 'achievement', 'tarango', 'cable', 'koert', 'starve', 'zambia', 'administrator', 'erkinbayev', 'lithuanian', '941', 'mike', 'reaffirm', 'sinn', 'motorcycle', 'aig', 'northeast', 'spiritual', 'cheer', 'nova', 'casey', 'embark', 'kharrazi', 'akerson', 'fiery', 'nangarhar', 'irregularity', 'silver', 'inacio', 'pire' ]: line = [j for j in line if j != i] c += 1 if c == 20: t1.append(line) break #write the data to a modified file with open('modified_data.txt', 'w') as file: for line in t1: file.write(' '.join(line)) file.write('\n') #predict the data ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory... ## You can check that the modified text is within the modification limits. modified_data = './modified_data.txt' assert strategy_instance.check_data(test_data, modified_data) return strategy_instance ## NOTE: You are required to return the instance of this class.
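# The two hardcoded word lists above appear to be the top positive-weight features of
# a previously trained linear SVM, pasted in as literals. A minimal sketch of how such
# a removal list can be regenerated instead of hardcoded, assuming a fitted
# CountVectorizer `vec` and a linear-kernel SVC `clf` trained earlier in the file
# (both names are assumptions here, not objects the code above defines):
def top_class1_words(clf, vec, k=400):
    """The k vocabulary words with the largest positive SVM weights."""
    import numpy as np
    names = np.array(vec.get_feature_names())
    coef = clf.coef_
    coef = coef.toarray().ravel() if hasattr(coef, 'toarray') else np.ravel(coef)
    top = np.argsort(coef)[::-1][:k]  # most class-1-indicative first
    return [names[i] for i in top if coef[i] > 0]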
def fool_classifier(test_data):
    ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    strategy_instance = helper.strategy()

    # Term-frequency dicts for every training line, labelled by class.
    features_and_labels = []
    for i in strategy_instance.class0:
        features_and_labels.append((get_freq_of_tokens(i), 0))
    for i in strategy_instance.class1:
        features_and_labels.append((get_freq_of_tokens(i), 1))
    encoder = LabelEncoder()
    vectorizer = DictVectorizer(dtype=int, sparse=True)
    x, y = list(zip(*features_and_labels))
    x = vectorizer.fit_transform(x)
    y = encoder.fit_transform(y)

    parameters = {
        'C': 10.0,
        'coef0': 0.0,
        'degree': 3,
        'gamma': 'auto',
        'kernel': 'linear'
    }
    # x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(x, y, random_state=1, train_size=0.8)
    clf = strategy_instance.train_svm(parameters, x, y)

    # Pair every non-zero weight with its feature index; negative weights point to
    # class 0, positive weights to class 1.
    coef = clf.coef_
    data = coef.data.tolist()
    idx = coef.indices.tolist()
    _list = [(vectorizer.get_feature_names()[idx[e]], data[e])
             for e in range(len(data))]  # (word, weight) pairs, kept for inspection
    words = sorted(((idx[i], data[i]) for i in range(len(data))), key=lambda x: x[1])
    class0_dict = {}
    class1_dict = {}
    for index, weight in words:
        if weight < 0:
            class0_dict[index] = abs(weight)
        else:
            class1_dict[index] = abs(weight)

    # Top-100 feature indices per class by absolute weight.
    word_0 = np.array(sorted(class0_dict.items(), key=lambda x: x[1], reverse=True)[:100])
    word_1 = np.array(sorted(class1_dict.items(), key=lambda x: x[1], reverse=True)[:100])
    word_0_index = word_0[:, 0].tolist()
    word_1_index = word_1[:, 0].tolist()

    # Note: these two lists accumulate over all test lines; they are never reset
    # inside the loop below.
    class0_weight = []
    class1_weight = []
    with open('modified_data.txt', 'w') as modified_data:
        with open(test_data, 'r') as test:
            for line in test:
                l = line.strip().split(' ')
                global l_mod_top10  # read by the module-level filter in_it()
                l = list(set(l))
                l_new = copy.deepcopy(l)
                for j in range(len(l)):
                    if l[j] not in vectorizer.feature_names_:
                        continue
                    index = vectorizer.feature_names_.index(l[j])
                    if index in class0_dict:
                        class0_weight.append((index, class0_dict[index]))
                    if index in class1_dict:
                        class1_weight.append((index, class1_dict[index]))
                class0_weight = sorted(class0_weight, key=lambda x: x[1], reverse=True)
                class1_weight = sorted(class1_weight, key=lambda x: x[1], reverse=True)

                # Candidate class-1 words to drop from this line.
                l_mod = []
                for m in range(len(class1_weight)):
                    if class1_weight[m][0] not in word_0_index:  # and class1_weight[m][0] in word_1_index:
                        l_mod.append(vectorizer.feature_names_[int(class1_weight[m][0])])
                l_mod_top10 = l_mod[:10] if len(l_mod) >= 10 else copy.deepcopy(l_mod)

                # Drop the top class-1 words, then pad with class-0 words until
                # 20 changes have been made.
                count = 0
                original_len = len(l_new)
                l_new = list(filter(in_it, l_new))  # in_it is defined elsewhere in this file
                count_1 = original_len - len(l_new)
                for n in range(len(l_mod)):
                    if count_1 < 20:
                        for x in range(count, len(word_0_index)):
                            aa = vectorizer.feature_names_[int(word_0_index[x])]
                            count += 1
                            if aa not in l_new:
                                l_new.append(aa)
                                break
                        count_1 += 1
                while count_1 < 20:
                    for x in range(count, len(word_0_index)):
                        if vectorizer.feature_names_[int(word_0_index[x])] not in l:
                            l_new.append(vectorizer.feature_names_[int(word_0_index[x])])
                            count += 1
                            count_1 += 1
                            if count_1 == 20:
                                break  # stop exactly at the 20-change budget
                p = ' '.join(str(i) for i in l_new)
                modified_data.write(p + '\n')

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    ## You can check that the modified text is within the modification limits.
    # result = clf.predict(modified_data).tolist()
    # print(result.count(1) / len(result))
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
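# get_freq_of_tokens is called above but defined elsewhere in this file. Given that
# its result is fed straight into DictVectorizer, it presumably maps a token list to
# a token -> frequency dict; a minimal sketch (the real helper may differ):
def get_freq_of_tokens(tokens):
    from collections import Counter
    return dict(Counter(tokens))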
def fool_classifier(test_data):
    ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    import numpy as np

    # Labels: 180 (down-sampled) class-0 lines followed by the 180 class-1 lines.
    y = np.zeros((360, 1), dtype=int)  # np.int is deprecated; plain int behaves the same
    y[180:] = 1
    y = y.ravel()
    prng = np.random.RandomState(233233)
    train0 = prng.choice(strategy_instance.class0, 180).tolist()
    x_trainAll = train0 + strategy_instance.class1

    def createVocabList(dataSet):
        # Union of every token seen in the training lines.
        vocabSet = set([])
        for document in dataSet:
            vocabSet = vocabSet | set(document)
        return list(vocabSet)

    def setOfWords2Vec(vocabSet, inputSet):
        # Binary set-of-words vector: 1 where the vocabulary word occurs, else 0.
        returnVec = [0] * len(vocabSet)
        for word in inputSet:
            if word in vocabSet:
                returnVec[vocabSet.index(word)] = 1
        return returnVec

    data = createVocabList(x_trainAll)
    trainAll = []
    for postinDoc in x_trainAll:
        trainAll.append(setOfWords2Vec(data, postinDoc))
    trainAll = np.array(trainAll)

    with open('test_data.txt', 'r') as test1:
        test1 = [line.strip().split(' ') for line in test1]
    testAll = []
    for postinDoc in test1:
        testAll.append(setOfWords2Vec(data, postinDoc))

    parameters = {
        'C': 0.02,
        'kernel': 'linear',
        'degree': 3,
        'gamma': 1,
        'coef0': 1
    }
    clf = strategy_instance.train_svm(parameters, trainAll, y)

    # Negative weights push a line towards class 0, so the 200 most negative
    # features are the cheapest words to append.
    w = clf.coef_
    index = np.where(w[0] < 0)[0]
    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
    dic_w = sorted(dic_w.items(), key=lambda d: d[1])[0:200]
    index = [dic_w[i][0] for i in range(len(dic_w))]
    # debug: for i in index: print('index: ', i)
    add_word = [data[i] for i in index]

    # Append up to 20 class-0 words that the line does not already contain.
    for i in range(len(test1)):
        n = 0
        for word in add_word:
            if n == 20:
                break
            if word not in test1[i]:
                test1[i].append(word)
                n = n + 1

    file = open('./modified_data.txt', 'w+')
    for i in range(len(test1)):
        file.write(' '.join(test1[i]))
        file.write('\n')
    file.close()
    # debug: count the lines written out
    # line_num = 0
    # with open('./modified_data.txt') as mod:
    #     for line in mod:
    #         line_num += 1
    # print('line num of modified_data', line_num)

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
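# A tiny worked example of the set-of-words encoding defined above: with the
# vocabulary ['iraq', 'said', 'poll'], the line ['poll', 'said', 'said'] encodes as
# [0, 1, 1] -- presence only, so the repeated 'said' still contributes a single 1:
# >>> setOfWords2Vec(['iraq', 'said', 'poll'], ['poll', 'said', 'said'])
# [0, 1, 1]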
def fool_classifier(test_data):
    ## Please do not change the function definition...
    strategy_instance = helper.strategy()
    parameters = {}
    parameters['C'] = 1
    parameters['kernel'] = 'linear'
    parameters['degree'] = 3
    parameters['gamma'] = 'auto'
    parameters['coef0'] = 0

    vec = CountVectorizer(tokenizer=lambda x: x.split())
    # tfidf = TfidfVectorizer()
    data = []
    for line in strategy_instance.class0:
        data.append(' '.join(i for i in line))
    for line in strategy_instance.class1:
        data.append(' '.join(i for i in line))
    TFID = TfidfTransformer()
    X = TFID.fit_transform(vec.fit_transform(data))
    y = [0] * len(strategy_instance.class0) + [1] * len(strategy_instance.class1)

    clf = svm.SVC(kernel='linear', C=1)
    # clf = svm.SVC(kernel='linear', C=0.030999999999999996, class_weight="balanced")
    clf.fit(X, y)

    # Split the weight vector into positive (class-1) and negative (class-0) terms.
    coef_dict = dict()
    coef = clf.coef_[0]
    for i in range(len(coef.indices)):
        coef_dict[coef.indices[i]] = coef.data[i]
    coef_pos = [i for i in coef_dict.items() if i[1] > 0]
    coef_neg = [i for i in coef_dict.items() if i[1] < 0]
    coef_pos = sorted(coef_pos, key=lambda x: x[1], reverse=True)
    coef_neg = sorted(coef_neg, key=lambda x: x[1], reverse=False)

    # delete: word_pos (the strongest class-1 words)
    word_pos = [j[0] for i in coef_pos for j in vec.vocabulary_.items() if j[1] == i[0]][:1000]
    # add: word_neg (the strongest class-0 words)
    word_neg = [j[0] for i in coef_neg for j in vec.vocabulary_.items() if j[1] == i[0]]

    # modify
    modified_data = './modified_data.txt'
    with open(test_data, 'r') as test_file:
        with open(modified_data, 'w') as modified_file:
            for line in test_file:
                use_add = []
                use_dele = []
                words = line.strip().split(' ')
                words = list(set(words))
                # Remove up to 10 of the strongest class-1 words present in the line...
                dele = [i for i in word_pos if i in words]
                count = 0
                for i in dele:
                    words.remove(dele[count])
                    use_dele.append(dele[count])
                    count += 1
                    if count == 10:
                        break
                # ...then pad with the strongest class-0 words until 20 changes are made.
                add_new = [i for i in word_neg if i not in words]
                for i in range(20 - count):
                    words.append(add_new[i])
                    use_add.append(add_new[i])
                ## print(use_dele)
                ## print(use_add)
                use_dele = []
                use_add = []
                line = ' '.join(i for i in words)
                modified_file.write(line + '\n')

    # predict on the original test data
    with open(test_data, 'r') as file1:
        data1 = [line.strip().split(' ') for line in file1]
    testdata = [' '.join(i for i in line) for line in data1]
    X_test = TFID.transform(vec.transform(testdata))  # apply the same tf-idf scaling used for training
    y1 = clf.predict(X_test)
    ##print(y1)
    result = sum(y1) * 100 / 200  # percentage of test lines predicted as class 1 (assumes 200 lines)
    ##print('Success = {}%'.format(result))

    # predict on the modified data
    with open(modified_data, 'r') as file2:
        data2 = [line.strip().split(' ') for line in file2]
    testdata2 = [' '.join(i for i in line) for line in data2]
    X_test2 = TFID.transform(vec.transform(testdata2))
    ##print(X_test2.shape)
    y2 = clf.predict(X_test2)
    ##print(y2)
    result2 = sum(y2) * 100 / 200  # percentage of modified lines predicted as class 1
    ##print('Success = {}%'.format(result2))

    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
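# The word_pos/word_neg comprehensions above rescan vec.vocabulary_ once per
# coefficient, which is O(n_coefficients * n_vocabulary). Inverting the vocabulary
# dict once gives the same words with O(1) lookups; a minimal sketch (standard
# CountVectorizer API, hypothetical helper name):
def invert_vocabulary(vec):
    # CountVectorizer.vocabulary_ maps word -> column index; flip it around.
    return {index: word for word, index in vec.vocabulary_.items()}

# usage inside the function above:
#   index_to_word = invert_vocabulary(vec)
#   word_pos = [index_to_word[i] for i, w in coef_pos][:1000]
#   word_neg = [index_to_word[i] for i, w in coef_neg]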
def fool_classifier(test_data):
    ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    import numpy as np

    # Labels: 200 (down-sampled) class-0 lines followed by the 180 class-1 lines.
    y = np.zeros((380, 1), dtype=int)  # np.int is deprecated; plain int behaves the same
    y[200:] = 1
    y = y.ravel()
    prng = np.random.RandomState(233233)
    train0 = prng.choice(strategy_instance.class0, 200).tolist()
    x_trainAll = train0 + strategy_instance.class1
    corpus = [' '.join(para) for para in x_trainAll]

    # train set
    def createVocabList(dataSet):
        vocabSet = set([])
        for document in dataSet:
            vocabSet = vocabSet | set(document)
        return list(vocabSet)

    def setOfWords2Vec(vocabSet, inputSet):
        returnVec = [0] * len(vocabSet)
        for word in inputSet:
            if word in vocabSet:
                returnVec[vocabSet.index(word)] = 1
        return returnVec

    data = createVocabList(x_trainAll)
    trainAll = []
    for postinDoc in x_trainAll:
        trainAll.append(setOfWords2Vec(data, postinDoc))
    trainAll = np.array(trainAll)

    with open('test_data.txt', 'r') as test1:  # test1 is the test_data file
        test1 = [line.strip().split(' ') for line in test1]

    ####################### START OF TRANSFORM ############
    # Disabled alternative: tf-idf on the binary set-of-words matrix.
    # idf = TfidfTransformer()
    # idf.fit(trainAll)
    # xtrain_tfm = idf.transform(trainAll)

    # Fit tf-idf on the raw text. token_pattern is ignored by sklearn whenever
    # analyzer != 'word', so the word analyzer is required here; it also keeps the
    # feature columns aligned with whitespace-separated tokens, which the deletion
    # step below depends on.
    idf = TfidfVectorizer(token_pattern=r'\S+', analyzer='word')
    xtrain_tfm = idf.fit_transform(corpus)  # term-frequency matrix scaled to tf-idf values
    weight = xtrain_tfm.toarray()

    ####################### TEST PART #####################
    testAll = []
    for postinDoc in test1:
        testAll.append(setOfWords2Vec(data, postinDoc))
    test = []
    test2 = []
    for i in range(len(test1)):
        test2.append(' '.join(test1[i]))
        test.append(' '.join(test1[i]))

    parameters = {
        'C': 0.02,
        'kernel': 'linear',
        'degree': 3,
        'gamma': 1,
        'coef0': 1
    }
    clf = strategy_instance.train_svm(parameters, weight, y)

    # Positive weights push a line towards class 1: those are the words to delete.
    w = clf.coef_
    index = np.where(w[0] > 0)[0]
    dic_w = {}
    for i in index:
        dic_w[i] = w[0][i]
    dic_w = sorted(dic_w.items(), key=lambda d: d[1], reverse=True)[0:-1]
    index = [dic_w[i][0] for i in range(len(dic_w))]
    # Map coefficient columns back to words through the fitted vectorizer's own
    # vocabulary; an independently built word list is in a different order.
    feature_names = idf.get_feature_names()
    delete_word = [feature_names[i] for i in index]

    with open('test_data.txt', 'r') as f:
        f = [line.strip().split(' ') for line in f]

    # Delete words, at most 20 distinct words per line.
    for i in range(len(f)):                # loop over the lines
        deleted = set()
        for word in delete_word:           # iterate over the deletion candidates
            if len(deleted) == 20:
                break
            if word in test1[i]:
                for idx2, s in enumerate(f[i]):
                    if s == word:
                        test1[i].remove(s)  # removes one occurrence per hit, i.e. all of them
                        deleted.add(s)
        # print('number of deleting', len(deleted))

    file = open('./modified_data.txt', 'w+')
    for i in range(len(test1)):
        file.write(' '.join(test1[i]))
        file.write('\n')
    file.close()

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    # Disabled: re-vectorise the modified file for inspection.
    # with open(modified_data, 'r') as mod:
    #     final_version = [line.strip().split(' ') for line in mod]
    # data_final = createVocabList(final_version)
    # final_ALL = np.array([setOfWords2Vec(data_final, postinDoc) for postinDoc in final_version])
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
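# The same index-to-word pitfall applies to every tf-idf variant in this file: weight
# positions must be resolved through the fitted vectorizer, never through a separately
# built vocabulary list, because the two orderings differ. A minimal generic sketch
# (hypothetical helper; works for dense or sparse coef_):
def words_by_weight(clf, vectorizer):
    """(word, weight) pairs sorted from most class-1-indicative to most class-0."""
    import numpy as np
    names = vectorizer.get_feature_names()
    coef = clf.coef_
    coef = coef.toarray().ravel() if hasattr(coef, 'toarray') else np.ravel(coef)
    order = np.argsort(coef)[::-1]
    return [(names[i], coef[i]) for i in order]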
def fool_classifier(test_data):
    ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    with open(test_data, 'r') as infile:
        data = [line.strip().split(' ') for line in infile]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {}
    ##..................................#
    #                                   #
    ## Your implementation goes here....#
    #                                   #
    ##..................................#

    # It seems silly to have to re-concatenate the data that's just been split, only to have
    # CountVectorizer split it again. However, there doesn't seem to be a way to pass vectors
    # of tokens to CountVectorizer. It requires the examples to be strings.
    training_data = Concatenate(strategy_instance.class0) + Concatenate(strategy_instance.class1)
    training_labels = [0] * len(strategy_instance.class0) + [1] * len(strategy_instance.class1)

    # Note that we use a custom tokenizer with CountVectorizer to prevent it from removing
    # punctuation.
    count_vect = CountVectorizer(tokenizer=SimpleTokenize).fit(training_data)
    training_counts = count_vect.transform(training_data)
    tfidf_transformer = TfidfTransformer()
    training_idf = tfidf_transformer.fit_transform(training_counts)

    # Train a linear SVM using a tf-idf representation of the training data.
    parameters = {
        'gamma': 'auto',
        'C': 1.0,
        'kernel': 'linear',
        'degree': 2,
        'coef0': 0
    }
    classifier = strategy_instance.train_svm(parameters, training_idf, training_labels)

    # Use our SVM to determine the best words to remove, and possibly add, to fool the classifier.
    to_replace, replacements = construct_replace_list(classifier, count_vect.get_feature_names())

    for lineNo in range(len(data)):
        line = data[lineNo]
        wordset = set(line)

        # Look up the rank for each distinct word in the example, and construct a list of
        # (rank, word) tuples.
        word_ranks = []
        for word in wordset:
            if word in to_replace:
                word_ranks.append((to_replace[word], word))

        # Sort the list so that the words with the lowest rank, which most strongly indicate
        # class 1, are at the beginning.
        word_ranks.sort()

        # Construct a set of the 20 words that most strongly indicate class 1, and remove
        # these words from the example.
        to_remove = set([wi[1] for wi in word_ranks[:20]])
        new_line = []
        for i in range(len(line)):
            if line[i] not in to_remove:
                new_line.append(line[i])

        # If we couldn't find 20 words to remove then add words until the total number of changes
        # is 20. We add the words which most strongly indicate class 0.
        if len(to_remove) < 20:
            ri = 0
            for _ in range(20 - len(to_remove)):
                # Don't add a word if it's already in the example.
                while replacements[ri] in wordset:
                    ri += 1
                new_line.append(replacements[ri])
                ri += 1

        data[lineNo] = new_line

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    modified_data = './modified_data.txt'
    with open(modified_data, 'w') as outfile:
        for line in data:
            print(' '.join(line), file=outfile)

    ## You can check that the modified text is within the modification limits.
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
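# Concatenate, SimpleTokenize and construct_replace_list are used above but live
# elsewhere in this file. Minimal sketches consistent with how they are called
# (assumptions, not necessarily the authors' actual helpers):
def Concatenate(examples):
    # List of token lists -> list of space-joined strings, as CountVectorizer expects.
    return [' '.join(tokens) for tokens in examples]

def SimpleTokenize(text):
    # Split on spaces only, so punctuation survives as part of the tokens.
    return text.split(' ')

def construct_replace_list(classifier, feature_names):
    # Rank features by descending SVM weight. Returns a word -> rank map (rank 0 is
    # the strongest class-1 word, i.e. the best removal candidate) and a list of
    # class-0-indicating words, most negative weight first, to add.
    import numpy as np
    coef = classifier.coef_
    coef = coef.toarray().ravel() if hasattr(coef, 'toarray') else np.ravel(coef)
    order = np.argsort(coef)[::-1]
    to_replace = {feature_names[idx]: rank for rank, idx in enumerate(order)}
    replacements = [feature_names[idx] for idx in order[::-1]]
    return to_replace, replacements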
"""
@author: junshuaizhang, monaithang
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
import helper
from collections import defaultdict
#import numpy as np

#def fool_classifier(test_data):
## Please do not change the function definition...
## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
# and modifications limit checking
strategy_instance = helper.strategy()
parameters = {
    'gamma': 'auto',
    "C": 0.1,
    "degree": 10,
    "kernel": "linear",
    "coef0": -100
}
##..................................#
#                                   #
## Your implementation goes here....#
#                                   #
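# Two notes on the fragment above: sklearn's SVC ignores 'degree' and 'coef0' when
# kernel='linear', so the unusual values (10 and -100) have no effect on training;
# and the LinearSVC import, unused here, hints at the liblinear-based alternative,
# which trains faster on sparse bag-of-words data and exposes the same coef_
# attribute. A minimal sketch of that alternative (an assumption, not what this
# fragment actually trains):
def train_linear_svc(X_train, y_train, C=0.1):
    # Same decision-function family as SVC(kernel='linear'), different solver.
    return LinearSVC(C=C).fit(X_train, y_train)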
def fool_classifier(test_data):
    ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    test_dt = None
    with open('test_data.txt', 'r') as infile:
        test_dt = [line.strip().split(' ') for line in infile]

    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    # Filler tokens ('#', '##', ...) used to pad out the change count; start at 1 so
    # the empty string is never appended as a "word".
    constants = ['#' * i for i in range(1, 100)]
    strategy_instance = helper.strategy()
    parameters = {
        'C': 1,
        'gamma': 'auto',
        'kernel': 'linear',
        'coef0': 0.0,
        'degree': 3
    }
    lines = [' '.join(line) for line in strategy_instance.class0] \
        + [' '.join(line) for line in strategy_instance.class1]
    cv = CountVectorizer()
    cv.fit(lines)
    X_train = cv.transform(lines)
    model = strategy_instance.train_svm(parameters, X_train,
                                        np.array([0] * 360 + [1] * 180))

    # model.coef_ is sparse, so positions in .data must be mapped back to column
    # indices through .indices before they can be used to look up feature names.
    top_coef_sorted = np.argsort(model.coef_.data)[::-1]
    coef_columns = model.coef_.indices
    top_features = np.array(cv.get_feature_names())
    ##..................................#
    modified_list = []
    for record in test_dt:
        record_new = record
        # First pass: drop words with large positive weights (they indicate class 1).
        for coef_index in top_coef_sorted[:1000]:
            feature = top_features[coef_columns[coef_index]]
            feature_coef = model.coef_.data[coef_index]
            if feature_coef > 0 and feature in record_new:
                record_new = [word for word in record_new if word != feature]
            if len(set(record) ^ set(record_new)) == 20:
                # no more modifications
                break
        # Second pass: add words with large negative weights (they indicate class 0).
        for coef_index in top_coef_sorted[-1000:]:
            feature = top_features[coef_columns[coef_index]]
            feature_coef = model.coef_.data[coef_index]
            if len(set(record) ^ set(record_new)) == 20:
                # no more modifications
                break
            if feature_coef < 0 and feature not in record_new:
                record_new = record_new + [feature]
        # Pad with filler tokens until exactly 20 changes have been made.
        if len(set(record) ^ set(record_new)) != 20:
            for const in constants:
                if const not in record_new:
                    record_new += [const]
                if len(set(record) ^ set(record_new)) == 20:
                    break
        modified_list.append(record_new)

    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    with open("modified_data.txt", "w") as new_file:
        for i in modified_list:
            new_file.write(' '.join(i))
            new_file.write('\n')

    ## You can check that the modified text is within the modification limits.
    modified_data = './modified_data.txt'
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
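# The budget bookkeeping above measures the symmetric difference of the original and
# modified token sets, and the '#'-token padding suggests helper.check_data wants the
# count to land exactly on 20. A sketch of that reading (an assumption; the real
# checker lives in helper.py and is not shown here):
def count_changes(original_tokens, modified_tokens):
    # Tokens that appear in exactly one of the two lines.
    return len(set(original_tokens) ^ set(modified_tokens))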
def fool_classifier(test_data):
    ## Please do not change the function definition...
    ## Read the test data file, i.e., 'test_data.txt' from Present Working Directory...
    ## You are supposed to use pre-defined class: 'strategy()' in the file `helper.py` for model training (if any),
    # and modifications limit checking
    strategy_instance = helper.strategy()
    parameters = {
        'gamma': 'auto',
        'C': 0.020999999999999998,
        'kernel': 'linear',
        'degree': 3,
        'coef0': 0
    }
    test_file = test_data
    class_0 = strategy_instance.class0
    class_1 = strategy_instance.class1
    test = []
    with open(test_file) as f:
        test = [line.strip().split(' ') for line in f]

    # Binary word-presence matrices over the combined training vocabulary.
    class_all = class_0 + class_1
    vocabulary = set()
    for sentence in class_all:
        for word in sentence:
            vocabulary.add(word)
    word_list = sorted(vocabulary)

    train_data_matrix = []
    for sample in class_all:
        temp_list = []
        for word in word_list:
            temp_list.append(1 if word in sample else 0)
        train_data_matrix.append(temp_list)
    train_data_matrix = np.array(train_data_matrix)

    test_data_matrix = []
    for sample in test:
        temp_list = []
        for word in word_list:
            temp_list.append(1 if word in sample else 0)
        test_data_matrix.append(temp_list)
    test_data_matrix = np.array(test_data_matrix)

    train_label = np.array([0] * 360 + [1] * 180)
    test_label = np.array([1] * 200)

    ## Disabled grid search over C:
    ##clf_start = strategy_instance.train_svm(parameters, train_data_matrix, train_label)
    ##param_range = np.arange(0.001, 1, 0.01)
    ##param_grid = [{'C': param_range, 'kernel': ['linear']}]
    ##grid = GridSearchCV(clf_start, param_grid)
    ##grid.fit(train_data_matrix, train_label)
    ##clf = grid.best_estimator_
    ##print(clf)
    clf = strategy_instance.train_svm(parameters, train_data_matrix, train_label)

    # Split the dual coefficients by class; the n_support_[0] class-0 support
    # vectors come first in dual_coef_ and support_.
    dual_coef = clf.dual_coef_[0]
    class_0_dual_coef = dual_coef[:clf.n_support_[0]]
    class_1_dual_coef = dual_coef[clf.n_support_[0]:]
    support_vector_index = clf.support_
    class_0_dual_sv_index = []
    class_1_dual_sv_index = []
    for i in range(len(dual_coef)):
        if i < clf.n_support_[0]:
            class_0_dual_sv_index.append((dual_coef[i], support_vector_index[i]))
        else:
            class_1_dual_sv_index.append((dual_coef[i], support_vector_index[i]))
    class_0_dual_sv_index = sorted(class_0_dual_sv_index,
                                   key=lambda x: abs(x[0]), reverse=True)
    class_1_dual_sv_index = sorted(class_1_dual_sv_index,
                                   key=lambda x: x[0], reverse=True)

    for test_instance in test_data_matrix:
        change_count = set()
        # Remove words shared with the most influential class-1 support vectors.
        for d1 in class_1_dual_sv_index:
            train_instance = train_data_matrix[d1[1]]
            for i in range(len(test_instance)):
                if test_instance[i] == 1 and train_instance[i] == 1:
                    if i in change_count:
                        continue
                    test_instance[i] = 0
                    change_count.add(i)
                    if len(change_count) >= 20:
                        break
            if len(change_count) >= 20:
                break
        if len(change_count) >= 20:
            continue
        # Add words from the most influential class-0 support vectors.
        for d0 in class_0_dual_sv_index:
            train_instance = train_data_matrix[d0[1]]
            for i in range(len(test_instance)):
                if test_instance[i] == 0 and train_instance[i] == 1:
                    if i in change_count:
                        continue
                    test_instance[i] = 1
                    change_count.add(i)  # count additions too, or the 20-change cap is never reached
                    if len(change_count) >= 20:
                        break
            if len(change_count) >= 20:
                break

    modified_data = './modified_data.txt'
    # Open with 'w', not 'a': a stale file from a previous run must not be appended to.
    with open(modified_data, 'w') as f:
        for i in range(len(test)):
            words_in_original = test[i]
            words_all = sorted(set(words_in_original) | set(word_list))
            modified_test_instance = test_data_matrix[i]
            for word in words_all:
                if word not in vocabulary:
                    # Out-of-vocabulary words are untouched by the matrix edits; keep them.
                    f.write(f'{word} ')
                else:
                    word_index = word_list.index(word)
                    if modified_test_instance[word_index] == 0:
                        continue
                    f.write(f'{word} ')
            f.write('\n')

    ##..................................#
    #                                   #
    ## Your implementation goes here....#
    #                                   #
    ##..................................#
    ## Write out the modified file, i.e., 'modified_data.txt' in Present Working Directory...
    ## You can check that the modified text is within the modification limits.
    assert strategy_instance.check_data(test_data, modified_data)
    return strategy_instance  ## NOTE: You are required to return the instance of this class.
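# The dual-coefficient heuristic above rests on the linear-kernel identity
# w = sum_i alpha_i * y_i * x_i over the support vectors: sklearn's dual_coef_ already
# stores the products alpha_i * y_i, and support_ indexes the support vectors in the
# training matrix. A minimal sketch showing the two views agree (assumes the dense
# train_data_matrix built in the function above):
def primal_from_dual(clf, X_train):
    # Reconstruct the primal weight vector from the support vectors; for
    # kernel='linear' this equals clf.coef_, so np.allclose(...) should hold.
    return clf.dual_coef_ @ X_train[clf.support_]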
import helper

strategy = helper.strategy()
class0 = strategy.class0
class1 = strategy.class1

# Build the combined feature vocabulary from both training classes.
feature_list = []
for i in class0:
    for m in i:
        feature_list.append(m)
for i in class1:
    for m in i:
        feature_list.append(m)
feature_list = list(set(feature_list))

with open('test_data.txt', 'r') as test_data:
    test_data = [line.strip().split(' ') for line in test_data]

# Count-vector representation of each test line over the training vocabulary.
test = []
for line in test_data:
    indexlist = [0] * len(feature_list)
    for j in feature_list:
        if j in line:
            indexlist[feature_list.index(j)] += line.count(j)
    test.append(indexlist)
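# The nested membership scan above costs O(|vocabulary| * |line|) per test line;
# collections.Counter builds the same count vector in a single pass over the line.
# A minimal sketch over the feature_list built above:
from collections import Counter

def count_vector(line, feature_list, feature_index=None):
    # feature_index: optional precomputed {word: position} map for O(1) lookups.
    if feature_index is None:
        feature_index = {w: k for k, w in enumerate(feature_list)}
    vec = [0] * len(feature_list)
    for word, count in Counter(line).items():
        if word in feature_index:
            vec[feature_index[word]] = count
    return vec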