def normalize_attributes(test_set, training_set):
    '''
    Normalize every column of the test and training tables.

    For each column index of the first training row, the minimum and maximum
    are taken from the TRAINING column only, and that same pair is handed to
    normalize() for both the training column and the test column.
    (normalize() is defined elsewhere; the earlier documentation states that
    the result lies in [0, 1] -- presumably only guaranteed for the training
    data, confirm against normalize().)

    Param test_set: A table of testing data
    Param training_set: A table of training data (needs at least one row,
                        because the column count is read from its first row)
    Returns: the tuple (normalized_test_data, normalized_training_data);
             note that the test table comes first, as in the parameter list
    '''

    def rows_from_columns(columns):
        # The columns were collected one by one, so transpose them back into
        # rows and run every row through utils.convert_to_numeric.
        return [
            utils.convert_to_numeric(list(row))
            for row in np.array(columns).T
        ]

    scaled_training_columns = []
    scaled_test_columns = []
    for column_index in range(len(training_set[0])):
        training_values = utils.get_column(training_set, column_index)
        test_values = utils.get_column(test_set, column_index)
        low = min(training_values)
        high = max(training_values)
        scaled_training_columns.append(normalize(low, high, training_values))
        scaled_test_columns.append(normalize(low, high, test_values))

    normalized_test_data = rows_from_columns(scaled_test_columns)
    normalized_training_data = rows_from_columns(scaled_training_columns)
    return normalized_test_data, normalized_training_data
def transform_rows(input_file, num_tests):
    """Split an input file into test inputs, test answers and training rows.

    The first ``num_tests`` lines are read one at a time. Lines whose path
    column is empty are dropped (they still count towards ``num_tests``).
    For every other line a test input row is built from the first eight
    comma-separated fields plus a randomly truncated prefix of the path, and
    an answer row is built from the trip id and the last point of the path.
    All remaining lines of the file become training data unchanged.

    Returns a 3-tuple of strings:
    (training lines concatenated, input rows joined by newlines,
    answer rows joined by newlines).
    """
    random.seed()
    input_results = []
    output_results = []
    with open(input_file) as f:
        for _ in range(num_tests):
            line = f.readline()
            path = utils.get_column(utils.Columns.path, line)
            if len(path) == 0:
                continue
            # randint is inclusive at both ends, so the kept prefix may be
            # empty or may be the complete path.
            end = random.randint(0, len(path))
            leading_fields = line.split(",", 8)[:8]
            input_results.append(
                ",".join(leading_fields) + "," + utils.path_to_csv(path[:end]))
            trip_id = utils.get_column(utils.Columns.trip_id, line)
            last_point = path[-1]
            output_results.append(",".join([
                "\"T" + str(trip_id) + "\"",
                str(last_point[0]),
                str(last_point[1]),
            ]))
        # Everything after the test lines is training data.
        train_results = list(f)
    print(len(train_results))
    return ("".join(train_results), "\n".join(input_results),
            "\n".join(output_results))
def get_training_infos(training_data):
    """Collect (trip id, path array) pairs from the training lines.

    Prints the running line count every 100000 lines as a progress marker.
    Lines whose path column is empty are skipped.

    Returns a list of tuples ``(trip_id, utils.path_to_array(path))``.
    """
    training_infos = []
    for line_number, line in enumerate(training_data):
        if line_number % 100000 == 0:
            print(line_number)
        trip_id = utils.get_column(utils.Columns.trip_id, line)
        path = utils.get_column(utils.Columns.path, line)
        # Keep only lines that actually carry a path.
        if path:
            training_infos.append((trip_id, utils.path_to_array(path)))
    return training_infos
def main():
    """Clean the raw eBay watch data and rewrite 'ebay_data_clean.csv'.

    Steps:
    1. Read 'ebay_data_dirty.csv', delete four columns from every row and
       write the result to 'ebay_data_dirty_temp.csv'.
    2. Run remove_modelnums_not_applicable() on the temp file (presumably
       this is what produces 'ebay_data_clean.csv' -- confirm).
    3. Read 'ebay_data_clean.csv', replace columns 5, 4, 6 and 7 (in that
       order) by the stringified output of their cleaning function, and
       write the table back to 'ebay_data_clean.csv'.
    """
    table = u.read_csv('ebay_data_dirty.csv')
    for row in table:
        # Positions are applied one after another, so each index refers to
        # the row as it looks after the previous deletion.
        for position in (0, 0, 9, -1):
            del row[position]
    u.write_to_file(table, 'ebay_data_dirty_temp.csv')
    remove_modelnums_not_applicable('ebay_data_dirty_temp.csv')

    table_w_modelnums = u.read_csv('ebay_data_clean.csv')
    cleaning_steps = (
        (5, clean_case_material),
        (4, clean_watch_movement),
        (6, clean_band_material),
        (7, clean_watch_model),
    )
    for column_index, cleaner in cleaning_steps:
        cleaned = cleaner(u.get_column(table_w_modelnums, column_index))
        for row_number, row in enumerate(table_w_modelnums):
            row[column_index] = str(cleaned[row_number])
    u.write_to_file(table_w_modelnums, "ebay_data_clean.csv")
def load_variants(filename, extra=""):
    """Return a dict with the variants in a filename.

    The file is read as tab-separated text. The first line is stored under
    the key ``"header"``. Every following line is stored under a tuple key
    made of its first five fields plus one entry per extra column.

    ``extra`` is a comma-separated list of column names; each is resolved
    with ``utils.get_column(filename, name)`` and the result is used as an
    index into the split line. For an extra column the key entry is:
    ``""`` when the field is ".", the value rounded to two decimals when it
    parses as a float, and the raw string otherwise.
    """
    extra_columns = []
    if extra:
        extra_columns = [
            utils.get_column(filename, name) for name in extra.split(",")
        ]

    with open2(filename, encoding="utf-8", errors="replace") as f1:
        variants = {"header": f1.readline().rstrip().split("\t")}
        for line in f1:
            fields = line.split("\t")
            # Only the last field is stripped, so trailing empty columns
            # are all kept.
            fields[-1] = fields[-1].rstrip()
            key = fields[:5]
            for column in extra_columns:
                value = fields[column]
                if value == ".":
                    # "." is a placeholder and is forced to be empty.
                    key.append("")
                    continue
                try:
                    key.append(round(float(value), 2))
                except ValueError:
                    # Not a number: keep it as a string.
                    key.append(value)
            variants[tuple(key)] = fields
    return variants
def compute_naive_bayes_accuracy(train, test, header, attributes_list,
                                 class_label):
    '''
    for a training/test set pair, computes the accuracy of the naive bayes
    classification
    parameter train is the training set of data (a table)
    parameter test is the test set of data (a table) to classify
    parameter header is a list of the attributes of the tables, in order
    parameter attributes_list is a list of the attributes to use for
        classification
    parameter class_label is the name (string) of the class attribute; it
        must appear in header
    returns the accuracy computed by acc() from the confusion matrix
    '''
    # classify every instance of the TEST set using the training set.
    # class_label is now honoured; previously the name "math score class"
    # was hard-coded and the parameter was silently ignored.
    predictions = [
        classify_using_naive_bayes(train, header, instance, class_label,
                                   attributes_list)
        for instance in test
    ]

    # get actual values
    actuals = utils.get_column(test, header.index(class_label))

    # compute accuracy using predicted values and actuals
    con_mat = create_confusion_matrix(predictions, actuals)
    return acc(con_mat[0], con_mat[1], con_mat[2], con_mat[3])
def classify_using_naive_bayes(table, header, test_instance, class_label,
                               attr_list):
    '''
    classifies one unseen instance with Naive Bayes
    parameter table is the training data to use
    parameter header is a list of the attributes in table
    parameter test_instance is the instance to be classified
    parameter class_label is the name (string) of the class attribute
    parameter attr_list is a list of the attributes to use for classification
    returns the class value whose entry in the probability list is largest
        (the first one in case of a tie)
    '''
    class_column = utils.get_column(table, header.index(class_label))
    classes = utils.get_unique_items(class_column)

    # one slot per class, parallel to classes; calculate_posterior receives
    # the list together with the slot index (its return value is not used,
    # so it presumably fills the slot in place -- confirm)
    class_probabilities = [0] * len(classes)
    for class_value in classes:
        calculate_posterior(table, header, class_label, class_value,
                            test_instance, attr_list,
                            classes.index(class_value), class_probabilities)

    best_probability = max(class_probabilities)
    return classes[class_probabilities.index(best_probability)]
def encrypt(self):
    """Scramble the image matrix, save the result and write the key file.

    Steps:
    1. Every row i is shifted by ``r_vector[i]`` when the row sum is odd and
       by ``-r_vector[i]`` when it is even.
    2. Every column j is shifted by ``-c_vector[j]`` (independent of the
       column sum).
    3. The matrix is passed through ``self.xor_scrambled``.
    4. The image is created via ``self.image.create_image`` and the two key
       vectors are written to ``self.image.key_path``, one per line.

    Note: ``scrambled`` starts out as ``self.image.matrix`` itself, not a
    copy, so the row assignments below modify that matrix in place.
    """
    print('[+] Starting encryption.')
    scrambled = self.image.matrix
    row_count = len(self.r_vector)
    column_count = len(self.c_vector)
    pbar = tqdm(total=2 * (row_count * column_count) +
                (row_count + column_count))

    for i in range(row_count):
        ith_row_sum = utils.sum_row(i, scrambled)
        # Parity of the row sum selects the sign of the shift amount.
        if ith_row_sum % 2:
            scrambled[i] = utils.shift(scrambled[i], self.r_vector[i])
        else:
            scrambled[i] = utils.shift(scrambled[i], -self.r_vector[i])
        pbar.update(1)

    for j in range(column_count):
        col = utils.get_column(scrambled, j)
        shifted = utils.shift(col, -self.c_vector[j])
        scrambled = utils.set_column(scrambled, j, shifted)
        pbar.update(1)

    scrambled = self.xor_scrambled(scrambled, pbar)
    pbar.close()
    self.image.create_image(self.encrypted, scrambled)

    # NOTE(review): the vectors are written as they are AFTER xor_scrambled
    # ran; if that method changes them in place the saved key reflects it.
    # The context manager closes the key file even if a write fails.
    with open(self.image.key_path, 'w') as f:
        f.write(str(self.r_vector) + '\n')
        f.write(str(self.c_vector))
def choose_columns(data, indexes):
    """Build a new table that holds only the columns listed in indexes.

    The new table has one row per row of ``data``; the selected columns
    appear in the order given by ``indexes``.
    """
    new_table = [[] for _ in data]
    selected_columns = (utils.get_column(data, i) for i in indexes)
    for column in selected_columns:
        # Hand out the column's values to the rows, top to bottom.
        for target_row, value in zip(new_table, column):
            target_row.append(value)
    return new_table
def xor_scrambled(self, scrambled, pbar):
    """XOR every row with c_vector and every column with r_vector.

    First pass: each row i of ``scrambled`` is replaced by the element-wise
    ``utils.xor`` of its first ``len(self.c_vector)`` entries with the key
    vector. Second pass: each column j is replaced (via
    ``utils.set_column``) by the element-wise ``utils.xor`` of its first
    ``len(self.r_vector)`` entries with ``self.r_vector``.

    ``pbar`` is forwarded to every ``utils.xor`` call and additionally
    advanced by one per row and per column.

    Returns the resulting matrix.

    NOTE(review): ``reverse = self.c_vector`` / ``reverse = self.r_vector``
    bind a second name to the SAME object; they do not copy. If the vectors
    are lists, ``reverse.reverse()`` therefore flips the key vector itself
    in place on every even index, and the odd-index branch then sees
    whatever orientation the previous even index left behind. Changing this
    would change the cipher output, so confirm against the decryption code
    before touching it.
    """
    for i in range(len(scrambled)):
        if not bool(i % 2):
            # even row: reverse the (aliased) column key, then XOR with it
            reverse = self.c_vector
            reverse.reverse()
            scrambled[i] = [utils.xor(scrambled[i][j], reverse[j], pbar) for j in range(len(self.c_vector))]
        else:
            # odd row: XOR with the column key in its current orientation
            scrambled[i] = [utils.xor(scrambled[i][j], self.c_vector[j], pbar) for j in range(len(self.c_vector))]
        pbar.update(1)
    for j in range(len(scrambled[0])):
        col = utils.get_column(scrambled, j)
        if not bool(j % 2):
            # even column: reverse the (aliased) row key, then XOR with it
            reverse = self.r_vector
            reverse.reverse()
            scrambled = utils.set_column(scrambled, j, [utils.xor(col[i], reverse[i], pbar) for i in range(len(self.r_vector))])
        else:
            # odd column: XOR with the row key in its current orientation
            scrambled = utils.set_column(scrambled, j, [utils.xor(col[i], self.r_vector[i], pbar) for i in range(len(self.r_vector))])
        pbar.update(1)
    return scrambled
def build_target_map():
    """Map the first word of every target cell to the full cell text.

    Reads column 2 of the "SDG Targets" worksheet (taken from the
    module-level ``worksheets`` mapping). When two cells start with the
    same word the later cell wins. A cell without any word raises
    IndexError.
    """
    target_cells = utils.get_column(worksheets["SDG Targets"], 2)
    return {cell.split()[0].strip(): cell for cell in target_cells}
def predict_destinations(training_data, tests, answers):
    """Print a predicted destination for every test line.

    For each test line with a non-empty path, the training paths selected by
    the nearest-neighbour predictor for the FIRST point of the test path are
    collected and handed to frechet_predictor(). The test id and prediction
    are printed on one line; when ``answers`` is not None the distance
    between the real and the predicted destination is appended.

    Nothing is returned. This block uses Python 2 print statements.
    """
    training_infos = get_training_infos(training_data)
    nearest_neighbors = create_nearest_neighbors_predictor(training_infos)
    for line in tests:
        nearest_paths = []
        test_id = utils.get_column(utils.Columns.trip_id, line)
        test_path = utils.get_column(utils.Columns.path, line)
        # Lines without a path cannot be predicted and are skipped silently.
        if not test_path:
            continue
        # kneighbors returns indices that are used to look up training_infos
        for index in nearest_neighbors.kneighbors([test_path[0]], return_distance=False)[0]:
            nearest_paths.append(training_infos[index])
        predicted_destination = frechet_predictor(test_id, test_path, nearest_paths)
        # The trailing comma keeps the cursor on the same line (Python 2).
        print test_id, "|", predicted_destination,
        if answers is not None:
            # NOTE(review): get_real_destination receives the whole
            # `answers` object and neither test_id nor line -- confirm it
            # really returns the destination belonging to THIS test.
            real_destination = get_real_destination(answers)
            print "|", utils.distance(real_destination, predicted_destination)
        else:
            print ""
def normalize_combined():
    """Write a normalized and a discretized version of combined_data.csv.

    Columns 2, 6-13 and 16 of the input are dropped. A derived column
    "Pct_Income_as_Rent" (kept column 6 * 12 * 100 / kept column 3, rounded
    to one decimal) is appended. The table is then written twice:

    * "combined_data_normalized.csv": kept columns 2, 3, 5, 6, 7 run through
      normalize_data, kept column 4 discretized into 5 bins.
    * "combined_data_discretized.csv": the already normalized columns
      2, 3, 5, 6, 7 additionally run through discretize_data; column 4 is
      left as it was after the first step.
    """
    combined_table, header = utils.read_table("combined_data.csv", True)

    dropped = [2, 6, 7, 8, 9, 10, 11, 12, 13, 16]
    kept = [x for x in range(len(header)) if x not in dropped]
    new_header = [header[x] for x in kept]
    columns = [utils.get_column(combined_table, x) for x in kept]

    row_count = len(columns[0])
    columns.append([
        round(columns[6][i] * 12 * 100 / columns[3][i], 1)
        for i in range(row_count)
    ])
    new_header.append("Pct_Income_as_Rent")

    def write_columns(filename):
        # Turn the column lists back into rows, put the header on top
        # and write the table.
        rows = [[column[r] for column in columns]
                for r in range(len(columns[0]))]
        rows.insert(0, new_header)
        utils.write_table(filename, rows)

    # 2: Poverty, 3: Median Income
    for index in (2, 3):
        columns[index] = normalize_data(columns[index])
    # 4: Crime Rate
    columns[4] = discretize_data(columns[4], 5)
    # 5: Population, 6: Rent, 7: Rent as percent of income
    for index in (5, 6, 7):
        columns[index] = normalize_data(columns[index])
    write_columns("combined_data_normalized.csv")

    # Same columns again (Crime Rate stays untouched), now discretized.
    for index, bin_count in ((2, 3), (3, 3), (5, 5), (6, 3), (7, 5)):
        columns[index] = discretize_data(columns[index], bin_count)
    write_columns("combined_data_discretized.csv")
def main():
    """Strip ',' and '$' from the price column of the Rolex price table.

    Reads 'rolex_prices_data.csv' and writes 'rolex_prices_data_clean.csv',
    which holds two columns per row: the original first column and the
    cleaned price (taken from column 1).
    """
    table = u.read_csv('rolex_prices_data.csv')
    clean_prices = [
        price.replace(',', '').replace('$', '')
        for price in u.get_column(table, 1)
    ]
    clean_table = [
        [row[0], clean_prices[position]]
        for position, row in enumerate(table)
    ]
    u.write_to_file(clean_table, 'rolex_prices_data_clean.csv')
def main():
    """Plot a histogram of the start-to-end distance of every trip.

    Reads utils.kInputFile line by line, skips lines without a path, and
    collects utils.distance(first point, last point) for the rest. The
    distances are binned with a width of 1 and shown with matplotlib.

    Raises ValueError (from min()) when no line has a path.
    """
    data = []
    # The context manager closes the input file; it used to stay open.
    with open(utils.kInputFile, "r") as f:
        for line in f:
            path = utils.get_column(utils.Columns.path, line)
            if not path:
                continue
            data.append(utils.distance(path[0], path[-1]))

    binwidth = 1
    # Bin edges run from floor(min) to ceil(max), both included.
    bins = range(int(math.floor(min(data))),
                 int(math.ceil(max(data) + binwidth)), binwidth)

    # the histogram of the data
    plt.hist(data, bins=bins, facecolor='green', alpha=0.75)
    plt.xlabel('Trip Distance(km)')
    plt.ylabel('Number of trips')
    plt.title(r'Proportion of trips')
    plt.axis([0, 20, 0, 1000000])
    plt.grid(True)
    plt.show()
def normalize_table(table, predictors):
    '''
    Returns a copy of a table in which the numeric predictor columns are
    replaced by their normalized values.
    Parameter table: The table to be normalized (it is not modified).
    Parameter predictors: List of column indexes to normalize.
    Returns: Deep copy of table; a predictor column is replaced by
             normalize(column) when all of its items are ints or all of its
             items are floats, and left untouched otherwise.
    '''
    new_table = copy.deepcopy(table)

    # One entry per predictor, in the same order: the normalized column, or
    # None when the column is not purely int / purely float.
    # (The list used to be pre-filled with empty lists AND appended to, so
    # the entries that were read back were always empty and nothing was
    # ever written into the table.)
    normalized = []
    for x in predictors:
        column = utils.get_column(new_table, x)
        if all(isinstance(item, int) for item in column) or all(
                isinstance(item, float) for item in column):
            normalized.append(normalize(column))
        else:
            normalized.append(None)

    for i, row in enumerate(new_table):
        for predictor, values in zip(predictors, normalized):
            if values is not None:
                row[predictor] = values[i]
    return new_table
def find_similar_text(wks, args, rd, title_count):
    """
    Uses similarity.py module to find similar text
    * We remove all duplicates first
    * We store row location of all indicator text
    * We apply NLP similarity on deduped list
    * In fast mode, we use sorted list and start comparing from the
      location of the first string.
    * In deep mode, we search every sentence with other

    Parameters:
    wks         -- worksheet; column 5 holds the text that is compared and
                   column 10 the QA status (read via utils.get_column).
    args        -- options object; `all_similar` disables the QA-status
                   filter, `deeply_similar` disables the early break.
    rd          -- report dict with keys "sheet", "row" and the formats
                   "red", "green", "bold"; rd["row"] is advanced for every
                   row written.
    title_count -- offset added to the 0-based position to get the reported
                   row number.

    Pairs with cosine similarity above 0.7 are reported (sheet row plus
    console output) unless every involved row has QA status 'Complete' or
    'Needs Review'. Nothing is returned.
    """
    col = utils.get_column(wks, 5)
    qacol = utils.get_column(wks, 10)
    # text -> list of reported row numbers on which that text occurs
    col_line_dict = defaultdict(list)
    for n,x in enumerate(col):
        col_line_dict[x].append(n+1+title_count)
    colunique = list(set(col))
    colunique.sort()
    all_similar_lines = []
    header_written = False
    for n1, val1 in enumerate(colunique):
        if val1 == '':
            continue
        # The slice starts at val1 itself; that first element is skipped by
        # the equality check below. n2 is not used.
        for n2, val2 in enumerate(colunique[n1:]):
            if val2 == '':
                continue
            if val1 == val2:
                continue
            similar_val = similarity.cosine_sim(val1, val2)
            if similar_val > 0.7:
                similar_lines = [col_line_dict[val1], val1, col_line_dict[val2], val2]
                # Filter by column "Indicator QA Status". If a value exists remove it from
                # the list
                qacoltest = []
                qacoltest.extend(col_line_dict[val1])
                qacoltest.extend(col_line_dict[val2])
                # Convert the reported row numbers back to 0-based positions
                # in qacol.
                allqacol_filled = all([qacol[x-1-title_count] in ('Complete', 'Needs Review') for x in qacoltest])
                # If this flag is set, do not filter by QA column
                if args.all_similar:
                    allqacol_filled = False
                if not(allqacol_filled):
                    all_similar_lines.append(similar_lines)
                # The "Failed" banner and column titles are written once,
                # right before the first reported pair.
                if len(all_similar_lines) > 0 and not header_written:
                    rd["sheet"].write(rd["row"], 0, "Test for similar indicators values")
                    rd["sheet"].write(rd["row"], 1, "Failed", rd["red"])
                    rd["row"]+=1
                    rd["sheet"].write(rd["row"], 0, None)
                    rd["row"]+=1
                    rd["sheet"].write_row(rd["row"], 0, tuple(["Rows 1", "Similar Text 1", "Rows 2", "Similar Text 2", "Similarity Score"]), rd["bold"])
                    rd["row"]+=1
                    header_written = True
                if not(allqacol_filled):
                    rd["sheet"].write_row(rd["row"], 0, tuple([
                        ','.join((str(s) for s in similar_lines[0])),
                        "'{}'".format(similar_lines[1]),
                        ','.join((str(s) for s in similar_lines[2])),
                        "'{}'".format(similar_lines[3]),
                        '{:.3f}'.format(similar_val)]))
                    rd["row"]+=1
                    print("\n Row: {}\n'{}'\n----\n Row: {}\n'{}'\nSimilarity Score = {:.3f}\n\n\n ====".format(
                        similar_lines[0], similar_lines[1], similar_lines[2], similar_lines[3], similar_val))
            else:
                # Fast mode: stop scanning for val1 at the first pair that
                # is not similar enough.
                # NOTE(review): the source's indentation was lost; this
                # `else` is assumed to pair with `if similar_val > 0.7`,
                # which matches the fast/deep description above -- confirm.
                if not args.deeply_similar:
                    break
    if len(all_similar_lines) == 0:
        print("\n\n Test for similar indicators values: Passed. Good Job!")
        rd["sheet"].write(rd["row"], 0, "Test for similar indicators values")
        rd["sheet"].write(rd["row"], 1, "Passed. Good Job!", rd["green"])
        rd["row"]+=1
def stats():
    """Print every row whose column 6 equals the smallest non-zero value.

    Reads "combined_data.csv" via utils.read_table. Zeros in column 6 are
    ignored when the minimum is determined. Nothing is printed for an empty
    table.

    Raises ValueError (from min()) when the table has rows but column 6
    holds nothing except zeros.
    """
    data, header = utils.read_table("combined_data.csv", True)
    if not data:
        # Keep the old behaviour for an empty table: do nothing.
        return
    # The minimum does not depend on the row, so compute it once instead of
    # rescanning the whole column for every row.
    smallest_nonzero = min(x for x in utils.get_column(data, 6) if x != 0)
    for row in data:
        if row[6] == smallest_nonzero:
            print(row)
def main():
    # Compares several classifiers on "small_audio_data.csv", predicting the
    # discretized popularity (last column) from ten audio attributes:
    #   1. decision tree (tree_utils.tree_classifier), 10 stratified folds
    #   2. kNN (utils.knn_classifier), 10 freshly generated stratified folds
    #   3. naive Bayes, reusing the folds from step 2
    #   4. an "ensemble" kNN with majority voting, reusing the same folds
    #   5. scikit-learn KNeighborsClassifier on a 75/25 split
    # Each accuracy is printed as a percentage. The string right below is
    # disabled scatter-plot code and is not executed.
    '''audio_data = []
    utils.read_file_to_table("small_audio_data.csv", audio_data)
    print("done reading")
    headers = ["Acousticness", "Danceability", "Duration", "Energy",
               "Instrumentalness", "Key", "Liveness", "Loudness", "Mode",
               "Speechiness", "Tempo", "Time Signature", "Valence"]
    for i in range(0, 13):
        filename = headers[i] + ".pdf"
        create_scatter_plot(utils.get_column(audio_data, i), utils.get_column(
            audio_data, 13), filename, headers[i])'''
    # kNN classifier to predict popularity (index 13)
    # using: acousticness (0), danceability (1), duration (2), energy (3), instrumentalness (4),
    # liveness (6), loudness (7), speechiness (9), tempo (10), valence (12)
    trimmed_data = []
    utils.read_file_to_table("small_audio_data.csv", trimmed_data,
                             [0, 1, 2, 3, 4, 6, 7, 9, 10, 12, 13])
    # normalize duration (new index = 2), loudness (new index = 6), tempo (new index = 8)
    duration = utils.get_column(trimmed_data, 2)
    normalized_duration = utils.normalize(duration)
    loudness = utils.get_column(trimmed_data, 6)
    normalized_loudness = utils.normalize(loudness)
    tempo = utils.get_column(trimmed_data, 8)
    normalized_tempo = utils.normalize(tempo)
    # update table with normalized values
    # and discretize popularity
    for i in range(len(trimmed_data)):
        trimmed_data[i][2] = normalized_duration[i]
        trimmed_data[i][6] = normalized_loudness[i]
        trimmed_data[i][8] = normalized_tempo[i]
        trimmed_data[i][-1] = utils.discretize_popularity(trimmed_data[i][-1])
    # decision trees
    # values for this tree are already between 0 and 1
    col_names = [
        "acousticness", "danceability", "duration", "energy",
        "instrumentalness", "liveness", "loudness", "speechiness", "tempo",
        "valence", "popularity"
    ]
    # NOTE(review): `labels` is only referenced by the disabled
    # create_dot_tree call further down.
    labels = {
        "acousticness": "Acousticness",
        "danceability": "Danceability",
        "duration": "Duration",
        "energy": "Energy",
        "instrumentalness": "Instrumentalness",
        "liveness": "Liveness",
        "loudness": "Loudness",
        "speechiness": "Speechiness",
        "tempo": "Tempo",
        "valence": "Valence",
        "popularity": "Popularity"
    }
    # Threshold labels per column index; index 10 (popularity) uses a
    # 0-100 scale, all others a 0-1 scale.
    att_domains = {
        0: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        1: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        2: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        3: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        4: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        5: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        6: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        7: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        8: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        9: [">=0.25", ">=0.50", ">=0.75", ">=1.0"],
        10: [">=25", ">=50", ">=75", ">=100"]
    }
    class_index = len(col_names) - 1
    # att_indexes is a list of attributes to use for building the tree
    att_indexes = list(range(len(col_names) - 1))
    #spotify_tree = tree_utils.tdidt(tree_data, att_indexes, att_domains, class_index, col_names)
    #tree_utils.create_dot_tree(spotify_tree, labels, "spotify_tree")
    folds = utils.stratified_cross_folds(trimmed_data, 10)
    num_correct = 0
    for i in range(0, 10):  # range had to change
        train, test = utils.set_up_train_test(i, folds)
        actual_popularities = [x[-1] for x in test]
        # Rebuilt for every fold (presumably because the classifier
        # consumes the list -- confirm against tree_utils).
        att_indexes = list(range(len(col_names) - 1))
        predicted_popularities = tree_utils.tree_classifier(
            train, test, att_indexes, att_domains, class_index, col_names)
        # The inner loop reuses the name `i`; the outer loop is unaffected
        # because range() hands out its next value regardless.
        for i in range(len(test)):
            if actual_popularities[i] == predicted_popularities[i]:
                num_correct += 1
    # Correct predictions are summed over all folds, then divided by the
    # total number of instances.
    accuracy = num_correct / len(trimmed_data)
    print("Accuracy Decision Tree: " + str(round(accuracy * 100, 2)) + "%")
    # generate 10 stratified cross folds
    folds = utils.stratified_cross_folds(trimmed_data, 10)
    num_correct = 0
    for i in range(0, 10):
        train, test = utils.set_up_train_test(i, folds)
        actual_popularities = [x[-1] for x in test]
        predicted_popularities = utils.knn_classifier(train, test)
        for i in range(len(test)):
            if actual_popularities[i] == predicted_popularities[i]:
                num_correct += 1
    print(num_correct)
    accuracy = num_correct / len(trimmed_data)
    print("Accuracy kNN: " + str(round(accuracy * 100, 2)) + "%")
    # naive bayes
    num_correct_bayes = 0
    for i in range(0, 10):
        train, test = utils.set_up_train_test(i, folds)
        priors = utils.compute_probabilities(train)
        actual_popularities_bayes = [x[-1] for x in test]
        predicted_popularities_bayes = []
        for instance in test:
            predicted_popularity_bayes = utils.naive_bayes_classifier(
                priors, instance, train)
            predicted_popularities_bayes.append(predicted_popularity_bayes)
        for i in range(len(test)):
            if actual_popularities_bayes[i] == predicted_popularities_bayes[i]:
                num_correct_bayes += 1
    print(num_correct_bayes)
    accuracy_bayes = num_correct_bayes / len(trimmed_data)
    print("Accuracy Naive Bayes: " + str(round(accuracy_bayes * 100, 2)) + "%")
    # ensemble classifier (kNN)
    # generate five weak learners, each using a different subset of attributes
    # NOTE(review): the code below runs SIX learners (range(6)) and each one
    # gets a slice of four training ROWS (train[j:j + 4]), not a subset of
    # attributes -- confirm which of the two is intended.
    num_correct_ensemble = 0
    for i in range(10):
        train, test = utils.set_up_train_test(i, folds)
        actual_popularities = [x[-1] for x in test]
        predicted_popularities = []
        for instance in test:
            predictions = []
            for j in range(6):
                training_subset = train[j:j + 4]
                prediction = utils.compute_class_knn(instance, training_subset)
                predictions.append(prediction)
            # use simple majority voting
            # (np.bincount needs non-negative integer predictions; argmax
            # returns the smallest label when counts tie)
            np_arr = np.array(predictions)
            majority_vote = np.bincount(np_arr).argmax()
            predicted_popularities.append(majority_vote)
        for i in range(len(test)):
            if predicted_popularities[i] == actual_popularities[i]:
                num_correct_ensemble += 1
    accuracy_ensemble = num_correct_ensemble / len(trimmed_data)
    print("Accuracy ensemble kNN: " +
          str(round(accuracy_ensemble * 100, 2)) + "%")
    # compare with scikit-learn kNN
    df = pd.DataFrame(trimmed_data)
    X = np.array(df.loc[:, 0:9])  # features
    y = np.array(df.loc[:, 10])  # class label (popularity)
    # split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    knn = KNeighborsClassifier(n_neighbors=8)
    knn.fit(X_train, y_train)
    prediction = knn.predict(X_test)
    print("Scikit-learn accuracy (kNN): " +
          str(round(accuracy_score(y_test, prediction) * 100, 2)) + "%")
def tdidt(instances, att_indexes, all_att_indexes, att_domains, class_index, header, tree):
    '''
    Uses the tdidt algorithm to build a decision tree based on a given set of data

    The tree is built IN PLACE inside the list `tree`, which receives
    "Attribute", the attribute name and then one ["Value", ...] entry per
    value of the chosen attribute's domain.

    parameter instances is the table of instances to split
    parameter att_indexes is the list of attribute indexes still available;
        the chosen attribute is REMOVED from this list, and the same list
        object is handed to the recursive calls
    parameter all_att_indexes is only passed through to the recursive calls
    parameter att_domains maps an attribute index to its list of values
    parameter class_index is forwarded to has_same_class_label
    parameter header is the list of attribute names; its last position is
        used as the class column when reading a partition
    parameter tree is the list to append to
    returns tree normally, None when att_indexes is empty on entry, and []
        as soon as one partition is empty
    '''
    print("Current Tree: ", tree)
    print("att_indexes = ", att_indexes)
    if att_indexes == []:
        return
    # entropy() picks the attribute to split on (its return value is used
    # as an attribute index).
    att_index = entropy(instances, header, att_domains, att_indexes)
    att_indexes.remove(att_index)
    partition = partition_instances(instances, att_index, att_domains[att_index])
    # NOTE(review): partition_keys is never used.
    partition_keys = partition.keys()
    tree.append("Attribute")
    tree.append(header[att_index])
    # count tracks the position of the current value subtree: tree[2 + count]
    count = 0
    for i in range(len(att_domains[att_index])):
        print(i)
        tree.append(["Value", att_domains[att_index][count]])
        # class labels of the instances that fall into this partition
        col = utils.get_column(partition.get(att_domains[att_index][i]), len(header) - 1)
        # distinct class labels, in order of first appearance
        items_in_col = []
        for item in col:
            if item not in items_in_col:
                items_in_col.append(item)
        if len(items_in_col) == 1:
            # every instance of the partition has the same class: leaf
            tree[2 + count].append([
                "Leaves",
                has_same_class_label(instances, header, att_index, class_index, col, items_in_col[0])
            ])
        elif len(att_indexes) == 0 and len(col) > 0:
            # no attributes left to split on: leaf built from the value
            # returned by compute_partition_stats
            majority_class = compute_partition_stats(col)
            tree[2 + count].append([
                "Leaves",
                has_same_class_label(instances, header, att_index, class_index, col, majority_class)
            ])
        elif col == []:
            # empty partition: drop the value entry just added and abort the
            # whole call, signalling the caller with []
            del tree[2 + count]
            return []
        else:
            tree[2 + count].append([])
            new_branch = [
                tdidt(partition.get(att_domains[att_index][i]), att_indexes, all_att_indexes, att_domains, class_index, header, tree[2 + count][2])
            ]
            # NOTE(review): the two assignments below always write to
            # tree[2][2] (the FIRST value entry), while every other branch
            # uses tree[2 + count] -- confirm whether tree[2 + count][2] was
            # meant.
            if new_branch == [[]]:
                majority_class = compute_partition_stats(col)
                tree[2][2] = [
                    "Leaves",
                    has_same_class_label(instances, header, att_index, class_index, col, majority_class)
                ]
            else:
                tree[2][2] = new_branch
        # NOTE(review): indentation was lost in the source; count is assumed
        # to advance once per domain value -- confirm.
        count += 1
    return tree
def compute_domains(indexes, data):
    """Map each column index to the list of distinct values in that column.

    The values pass through a set, so their order in each list is not
    defined.
    """
    return {
        index: list(set(utils.get_column(data, index)))
        for index in indexes
    }
def missing_concept_code_from_target_sheet(rd):
    """Report concept codes that exist in only one of the two worksheets.

    Column 0 of the module-level worksheets ``wks`` and
    ``wks_target_mapping`` is compared in both directions. For each
    direction a pass/fail line is printed and written to ``rd["sheet"]``;
    on failure the offending codes are listed as well (console table and
    one sheet row per code). ``rd["row"]`` is advanced for every sheet row
    used. Nothing is returned.
    """
    concept_code = set(utils.get_column(wks, 0))
    concept_code_from_target_sheet = set(
        utils.get_column(wks_target_mapping, 0))

    def spacer():
        # Write an empty cell and move on to the next sheet row.
        rd["sheet"].write(rd["row"], 0, None)
        rd["row"] += 1

    def report(title, missing):
        # Emit the pass/fail section for one direction of the comparison.
        passed = len(missing) == 0
        if passed:
            print("\n " + title + ": Passed. Good Job!")
        else:
            print("\n" + title + ": Failed")
        rd["sheet"].write(rd["row"], 0, title)
        if passed:
            rd["sheet"].write(rd["row"], 1, "Passed. Good Job!", rd["green"])
        else:
            rd["sheet"].write(rd["row"], 1, "Failed", rd["red"])
        rd["row"] += 1
        spacer()
        if passed:
            return

        table = PrettyTable(["Concept Codes"])
        table.border = True
        rd["sheet"].write_row(rd["row"], 0, tuple(["Concept Code"]),
                              rd["bold"])
        rd["row"] += 1
        for code in missing:
            table.add_row([code])
            rd["sheet"].write_row(rd["row"], 0, tuple([code]))
            rd["row"] += 1
        spacer()
        spacer()
        print(table)

    missing_in_sheet_1 = concept_code_from_target_sheet - concept_code
    missing_in_sheet_2 = concept_code - concept_code_from_target_sheet
    report(
        "Test for Concept Code missing from "
        "'BIA to SDG Target Mapping' worksheet",
        missing_in_sheet_1)
    report(
        "Test for Concept Code from this sheet but missing in "
        "'BIA to SDG Target Mapping' worksheet",
        missing_in_sheet_2)
def make_kNN_prediction(test_instance, training_set, k):
    """Predict a class label from the k closest training instances.

    The label is the median of the last column of the neighbours returned
    by get_k_closest. With two class labels the median equals the more
    common one.
    NOTE(review): for an even k with a tie, np.median returns the mean of
    the two middle labels, which may not be a valid label -- confirm that
    callers only use odd k.
    """
    neighbours = get_k_closest(test_instance, training_set, k)
    neighbour_labels = utils.get_column(neighbours, -1)
    return np.median(neighbour_labels)
# Builds the regression input files from the raw trip data.
# Part 1 turns every usable line of train.csv into one comma-separated row
# of utils.kRegressionTrainFile. Part 2 starts doing the same for test.csv;
# NOTE(review): the visible code stops partway through that second loop.
import utils
from random import randint

# NOTE(review): neither the input nor the output file is closed explicitly.
f = open("train.csv", "r")
out = open(utils.kRegressionTrainFile, "w")
for line in f:
    # A missing origin_call / origin_stand is written as 0.
    origin_call = utils.get_column(2, line)
    if origin_call is None:
        origin_call = 0
    origin_stand = utils.get_column(3, line)
    if origin_stand is None:
        origin_stand = 0
    taxi_id = utils.get_column(4, line)
    timestamp = utils.get_column(5, line)
    missing_data = utils.get_column(7, line)
    path = utils.get_column(8, line)
    # Skip trips flagged as having missing data and trips with fewer than
    # two points.
    if len(path) < 2 or missing_data:
        continue
    # Random point of the path other than the first one (randint includes
    # both bounds, so the last point can be chosen).
    point = path[randint(1, len(path) - 1)]
    # Row layout: origin_call, origin_stand, taxi_id, timestamp,
    # first point, random point, last point (two floats each).
    out.write("%s,%s,%s,%s,%f,%f,%f,%f,%f,%f\n" %
              (origin_call, origin_stand, taxi_id, timestamp, path[0][0],
               path[0][1], point[0], point[1], path[-1][0], path[-1][1]))
# The names f and out are rebound here; the previous file objects are not
# closed first.
f = open("test.csv", "r")
out = open(utils.kRegressionTestFile, "w")
for line in f:
    id = utils.get_column(0, line)
    origin_call = utils.get_column(2, line)
    # Here the missing marker that is checked is the string "NA", not None
    # as in the training loop.
    if origin_call == "NA":
        origin_call = 0
    origin_stand = utils.get_column(3, line)
def data_vis():
    """Draw the "Poverty Levels v. Crime Rate" scatter plot.

    Reads "combined_data.csv", uses column 3 as x values and column 5 as
    y values, and passes them to dv.scatter_plot together with the title,
    the axis labels and the output file name "Poverty_v_Crime_graphed.png".
    """
    data, header = utils.read_table("combined_data.csv", True)
    x_data = utils.get_column(data, 3)
    y_data = utils.get_column(data, 5)
    title = "Poverty Levels v. Crime Rate"
    x_label = "Poverty Levels (%)"
    y_label = "Crime Rate per 100,000 people"
    output_file = "Poverty_v_Crime_graphed.png"
    dv.scatter_plot(x_data, y_data, title, x_label, y_label, 10,
                    output_file, 100, True)