def selectAttibutesGA():
    matrix = csv.readCsvRaw(csv.headered_name_pca_corr)
    num_attributes = len(matrix[0]) - 1
    if False:
        num_subset = 5
        num_trials = max(100, num_attributes*2)
        results = findBestOfSize(matrix, num_subset, num_trials)
        order = orderByResults(results, num_attributes)
    if True:
        sort_order = [i for i in range(num_attributes)]
        for num_subset in range(5, num_attributes, 5):
            num_trials = max(100, num_attributes*2)
            csv_matrix_name  = csv.makeCsvPath('subset.matrix'  + ('%03d' % num_subset))
            csv_results_name = csv.makePath('subset.results' + ('%03d' % num_subset))
            csv_best_name    = csv.makeCsvPath('subset.best'    + ('%03d' % num_subset))
            csv_summary_name = csv.makeCsvPath('subset.summary' + ('%03d' % num_subset))
            ordered_matrix = pca.reorderMatrix(matrix, sort_order)
            csv.writeCsv(csv_matrix_name, ordered_matrix)
            results = findBestOfSize(ordered_matrix, num_subset, num_trials, csv_summary_name)
            sort_order = orderByResults(results, num_attributes)
            #c_x = results[0].columns + [-1]   # include outcome
            #sub_matrix = [[row[i] for i in c_x] for row in ordered_matrix]
            #csv.writeCsv(csv_best_name, sub_matrix)
            if not is_testing:
                shutil.copyfile(results[0]['csv'], csv_best_name)
                shutil.copyfile(results[0]['results'], csv_results_name)
def findAutoCorrelations(time_series_csv, max_lag, fraction_training):
    """ Run auto-correlations independently on each column of a 2 column time series
        by converting it into a regression with max_lag x and y lags per instance.
        fraction_training is the fraction of the sample used for training. """
    base_name = os.path.splitext(time_series_csv)[0]
    auto_correlation_matrix_csv = base_name + '.autocorrelation.csv'

    time_series_data, header = csv.readCsvFloat2(time_series_csv, True)
    number_training = int(float(len(time_series_data))*fraction_training)
    print 'number_training', number_training, 'fraction_training', fraction_training, 'len(time_series_data)', len(time_series_data)
    assert(number_training > max_lag)

    time_series = NP.transpose(NP.array(time_series_data))
    days_downloads = getDaysOfWeekToKeep(time_series[0,:number_training])
    days_purchases = getDaysOfWeekToKeep(time_series[1,:number_training])
    print days_downloads
    print days_purchases
    exit()

    removeOutlierDaysOfWeek(time_series[1,:number_training])
    removeOutiers(time_series[1,:number_training], 0.8)
    downloads = time_series[0,:number_training]
    purchases = time_series[1,:number_training]
    #auto_correlations = [getAutoCorrelation(time_series[i,:number_training], max_lag) for i in range(time_series.shape[0])]
    #return (getAutoCorrelation(downloads, max_lag), getAutoCorrelation(purchases, max_lag))
    auto_correlation_data = NP.hstack([getAutoCorrelation(downloads, max_lag), getAutoCorrelation(purchases, max_lag)])
    csv.writeCsv(auto_correlation_matrix_csv, list(auto_correlation_data), header)
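# --- Illustrative sketch, not part of the original pipeline ---
# A minimal, self-contained example of the auto-correlation idea used above: correlate a
# series with lagged copies of itself for lags 1..max_lag. The project's getAutoCorrelation()
# may differ in detail; this only shows the concept.
# Example: simple_auto_correlation(NP.sin(NP.arange(100)*2.0*NP.pi/7.0), 10)
import numpy as NP

def simple_auto_correlation(x, max_lag):
    """Return [corr(x[t], x[t-lag]) for lag in 1..max_lag] for a 1-D array x."""
    x = NP.asarray(x, dtype=float)
    return [NP.corrcoef(x[lag:], x[:-lag])[0, 1] for lag in range(1, max_lag + 1)]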
def getPCAProjections(input_filename):
    """ Run PCA on the Kushmerick ad data
        Stop when there are sufficient PCA components to explain threshold_variance
        Project input data onto the top PCA components that explain threshold_variance
        Normalize this data
        Sort attributes by their correlation with output
        - input_filename : preprocessed data
        - Output
            'pca':   data projected onto PCA components
            'norm':  pca data normalized to std deviation 1
            'corr':  normalized data sorted by correlation with output
            'index': correlation ranking of the attributes """
    print 'getPCAProjections:', input_filename
    explained_variance = 0.99
    root_name = 'pca%03d' % round(explained_variance * 100.0)
    pca_filename = csv.makeCsvPath(root_name)
    pca_norm_filename = csv.makeCsvPath(root_name + '.norm')
    pca_norm_corr_filename = csv.makeCsvPath(root_name + '.norm.corr')
    corr_index_filename = csv.makeCsvPath(root_name + '.corr.idx')

    pca.pcaAdData(explained_variance, input_filename, pca_filename)
    pca.normalizeData(pca_filename, pca_norm_filename)
    sort_order, corr_index = pca.rankByCorrelationWithOutcomes(pca_norm_filename)

    def reorder(in_cells):
        return pca.reorderMatrix(in_cells, sort_order)

    csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder)
    csv.writeCsv(corr_index_filename, corr_index)
    return {'pca': pca_filename, 'norm': pca_norm_filename, 'corr': pca_norm_corr_filename, 'index': corr_index_filename}
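# --- Illustrative sketch, not part of the original pipeline ---
# A minimal example of the "sort attributes by correlation with the output" step, assuming a
# numeric matrix whose last column is the outcome. The project's
# pca.rankByCorrelationWithOutcomes() works on its own CSV format; this only shows the idea.
import numpy as NP

def rank_columns_by_outcome_correlation(matrix):
    """matrix: 2-D numeric array with the outcome in the last column.
       Returns attribute column indices sorted by |correlation with outcome|, descending."""
    data = NP.asarray(matrix, dtype=float)
    outcome = data[:, -1]
    corrs = [abs(NP.corrcoef(data[:, j], outcome)[0, 1]) for j in range(data.shape[1] - 1)]
    return sorted(range(len(corrs)), key=lambda j: -corrs[j])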
def timeSeriesToMatrixCsv(regression_matrix_csv, time_series, max_lag):
    """ Convert a 2 row time series into a regression matrix """
    regression_matrix = timeSeriesToMatrixArray(time_series, max_lag)
    header_x = ['x[%0d]' % i for i in range(-max_lag, 0)]
    header_y = ['y[%0d]' % i for i in range(-max_lag, 1)]
    header = header_x + header_y
    csv.writeCsv(regression_matrix_csv, list(regression_matrix), header)
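# --- Illustrative sketch, not part of the original pipeline ---
# A minimal example of turning a 2 row time series (x = downloads, y = purchases) into a lag
# regression matrix: each instance holds x[t-max_lag..t-1], y[t-max_lag..t-1] and the target
# y[t], matching the header layout above. The project's timeSeriesToMatrixArray() may differ.
import numpy as NP

def simple_lag_matrix(time_series, max_lag):
    """time_series: 2 x N array. Returns one row per predictable time step t."""
    x, y = NP.asarray(time_series, dtype=float)
    rows = []
    for t in range(max_lag, len(y)):
        rows.append(list(x[t - max_lag:t]) + list(y[t - max_lag:t]) + [y[t]])
    return NP.array(rows)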
def makeTimeSeriesCsv(filename, purchase_max_lag, number_days, mean_downloads_per_day,
                      mean_purchases_per_download, mean_other_purchases):
    (downloads, purchases) = makeTimeSeries(purchase_max_lag, number_days, mean_downloads_per_day,
                                            mean_purchases_per_download, mean_other_purchases)
    data = zip(downloads, purchases)
    csv.writeCsv(filename, data, ['downloads', 'purchases'])
def findBestAttributes(output_dir, base_filename, algo_key, data, attributes, is_inclusive):
    num_attrs = len(attributes) - 1
    # Track results for each round
    series_results = []
    # Track best results in each round for back-tracking. This makes series_results redundant
    all_results = []

    def getBestPreviousSubsets(subset_size):
        """ Construct best subset of size <subset_size> from top performers of each
            smaller subset size """
        best_previous_subsets = []
        for results in all_results:
            subset = set()
            for r in results:
                part_subset = [i for i in r['subset'] if i != class_index]
                for i in part_subset:
                    subset.add(i)
                    if len(subset) >= subset_size:
                        break
                if len(subset) >= subset_size:
                    break
            if len(subset) == subset_size:
                subset.add(class_index)
                best_previous_subsets.append(sorted(subset))
        print 'best_previous_subsets =', best_previous_subsets
        return best_previous_subsets

    num_attrs_start = 1 if is_inclusive else 0
    # Loop through all sizes of subsets of attributes
    # Only go about a quarter of the way through the possible subset sizes !@#$
    for subset_size in range(num_attrs_start, (len(attributes) + 3)//4):
        if subset_size == num_attrs_start:
            if is_inclusive:
                results = [getSubsetResultDict(output_dir, algo_key, data, attributes, [class_index, i], True)
                           for i in range(num_attrs) if i != class_index]
            else:
                results = [getSubsetResultDict(output_dir, algo_key, data, attributes, [], is_inclusive)]
        else:
            best_previous_subsets = getBestPreviousSubsets(subset_size) if is_inclusive else []
            results = findBestAttributesForSubsetSize(output_dir, base_filename, algo_key, data, attributes,
                                                      results, subset_size, is_inclusive, best_previous_subsets)
        results.sort(key = lambda x: -x['score'])
        results[0]['num_attrs'] = num_attrs - subset_size
        series_results.append(results[0])
        all_results.append(results[:100])

        # Write out the results
        out_filename = makeFileName(output_dir, base_filename, algo_key, -1, 'csv')
        header = getCsvResultHeader()
        results_matrix = [getCsvResultRow(r, attributes, is_inclusive) for r in series_results]
        csv.writeCsv(out_filename, results_matrix, header)

        out_num_attrs_filename = makeFileName(output_dir, base_filename, algo_key + '.%02d' % subset_size, -1, 'csv')
        header = getCsvResultHeader()
        results_matrix = [getCsvResultRow(r, attributes, is_inclusive) for r in results[:100]]
        csv.writeCsv(out_num_attrs_filename, results_matrix, header)

    return series_results
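# --- Illustrative sketch, not part of the original pipeline ---
# The search above grows attribute subsets one size at a time and keeps the top-scoring
# subsets of each size as seeds for the next size. A stripped-down, generic version of that
# greedy loop, with score_fn standing in for the Weka evaluation, might look like this.
def greedy_subset_search(num_attrs, max_size, score_fn, keep=100):
    """score_fn(subset) -> score. Returns the best subset found at each size, with its score."""
    best_per_size = []
    frontier = [[]]                                     # subsets kept from the previous round
    for size in range(1, max_size + 1):
        candidates = []
        for base in frontier:
            for i in range(num_attrs):
                if i not in base:
                    candidates.append(sorted(base + [i]))
        # de-duplicate and rank by score, best first
        candidates = [list(c) for c in set(tuple(c) for c in candidates)]
        candidates.sort(key=lambda s: -score_fn(s))
        best_per_size.append((candidates[0], score_fn(candidates[0])))
        frontier = candidates[:keep]
    return best_per_size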
def wekaToPredict(weka_filename):
    """ Convert a Weka formatted .csv file to a Google Predict .csv file
        by moving the class from the last column to the first """
    parts = os.path.splitext(weka_filename)
    predict_filename = parts[0] + '.gp' + parts[1]
    print 'wekaToPredict:', weka_filename, '=>', predict_filename
    weka = csv.readCsvRaw(weka_filename)
    predict = [[w[-1]] + w[:-1] for w in weka]
    csv.writeCsv(predict_filename, predict)
def testMatrixMLP(matrix, columns, opts = mlp_opts):
    "Run MLP on attributes with index in columns"
    c_x = columns + [-1]   # include outcome
    sub_matrix = [[row[i] for i in c_x] for row in matrix]
    temp_base = csv.makeTempPath('subset' + ('%03d' % len(columns)) + '_')
    temp_csv = temp_base + '.csv'
    temp_results = temp_base + '.results'
    if is_testing:
        num_attributes = len(matrix[0]) - 1
        accuracy, dt = 1.0/float(sum([abs(x - num_attributes/2) for x in columns])), 0.1
    else:
        csv.writeCsv(temp_csv, sub_matrix)
        accuracy, dt = runMLPTrain(temp_csv, temp_results, opts)
    return (accuracy, temp_csv, temp_results, dt)
def timeSeriesToMatrixCsv(regression_matrix_csv, time_series, masks, max_lag):
    """ Convert a 2 row time series into a regression matrix """
    regression_matrix, regression_mask, means, stddevs = timeSeriesToMatrixArray(time_series, masks, max_lag)
    header_x = ['x[%0d]' % i for i in range(-max_lag, 0)]
    header_y = ['y[%0d]' % i for i in range(-max_lag, 1)]
    header = header_x + header_y
    regression_data = [[str(regression_matrix[i,j]) if regression_mask[i,j] else '?'
                        for j in range(regression_matrix.shape[1])]
                       for i in range(regression_matrix.shape[0])]
    # Eliminate rows with no output
    regression_data = [x for x in regression_data if not x[len(x)-1] == '?']
    print regression_data[0]
    csv.writeCsv(regression_matrix_csv, regression_data, header)
    return (means, stddevs)
def normalizeData(in_fn, out_fn):
    """ Normalize ad data to equal std dev
        in_fn : read input data from this csv file
        out_fn : write output data to this csv file """
    print 'normalizeData:', in_fn, '=>', out_fn
    in_cells = csv.readCsvRaw(in_fn)
    csv.validateMatrix2(in_cells)

    # Remove header row on top and category column on right
    in_data = [[float(e.strip()) for e in row[:-1]] for row in in_cells[1:3280]]
    print 'data', len(in_data), len(in_data[0])
    out_data = normalizeMatrix(in_data)
    print 'out_data', len(out_data), len(out_data[0])
    out_cells = [in_cells[0]] + [out_data[i-1] + [in_cells[i][-1]] for i in range(1, len(in_cells))]
    csv.writeCsv(out_fn, out_cells)
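# --- Illustrative sketch, not part of the original pipeline ---
# A minimal version of the normalization step: scale each column to standard deviation 1.
# The project's normalizeMatrix() may treat means or zero-variance columns differently.
import numpy as NP

def normalize_columns_to_unit_stddev(matrix):
    """Return a copy of the numeric matrix with every column scaled to std dev 1."""
    data = NP.asarray(matrix, dtype=float)
    stddevs = data.std(axis=0)
    stddevs[stddevs == 0.0] = 1.0        # leave constant columns unchanged
    return data / stddevs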
def pcaAdData(threshold_variance, in_filename, out_filename):
    """ Run PCA on the Kushmerick ad data
        Stop when there are sufficient PCA components to explain threshold_variance
        Project input data onto these PCA components
        - in_filename : input data read from this CSV file
        - out_filename : output data written to this CSV file """
    h2data = csv.readCsvRaw(in_filename)
    csv.validateMatrix(h2data)

    # Boolean data are columns 3 to second last
    bool_data = [[float(e) for e in v[3:-1]] for v in h2data[1:]]
    print 'bool_data', len(bool_data), len(bool_data[0])
    x = array(bool_data)

    # Find the output dimension (#basis vectors) required to explain threshold_variance
    print 'output_dim, explained_variance, time(sec)'
    for odim in range(50, len(x[0]), 50):
        start_time = time.clock()
        pcanode = mdp.nodes.PCANode(svd=True, output_dim = odim, dtype='float64')
        pcanode.train(x)
        p = pcanode.get_projmatrix()
        d = pcanode.output_dim
        print '%10d' % d, ',',
        v = pcanode.explained_variance
        print '%15.03f' % v, ',',
        print '%6.1f' % (time.clock() - start_time)
        if v >= threshold_variance:
            break

    #print '-----------------------------1'
    print 'p', len(p), len(p[0])
    #print '-----------------------------2'

    # Project the data onto the PCA components
    xfd = dot(x, p)
    pca = [[x for x in row] for row in xfd]
    print 'pca', len(pca), len(pca[0])

    pca_header = ['pca_%03d' % i for i in range(len(pca[0]))]
    header = h2data[0][:3] + pca_header + [h2data[0][-1]]
    num_data = [h2data[i+1][:3] + pca[i] + [h2data[i+1][-1]] for i in range(len(h2data)-1)]
    data = [header] + num_data
    csv.writeCsv(out_filename, data)
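# --- Illustrative sketch, not part of the original pipeline ---
# The loop above asks MDP for progressively more PCA components until the explained variance
# reaches the threshold. The same count can be computed directly from the data's singular
# values; a minimal numpy version (which, unlike the code above, centers the data itself) is:
import numpy as NP

def components_for_explained_variance(x, threshold):
    """Return the smallest number of principal components explaining >= threshold of the variance."""
    x = NP.asarray(x, dtype=float)
    x = x - x.mean(axis=0)                                       # PCA assumes centered data
    s = NP.linalg.svd(x, full_matrices=False, compute_uv=False)  # singular values
    cumulative = NP.cumsum(s ** 2) / NP.sum(s ** 2)              # cumulative explained variance
    return int(NP.searchsorted(cumulative, threshold) + 1)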
def preprocess(raw_name, headered_name, headered_name_pp):
    """ Add headers and pre-process the raw Kushmerick data. This needs to be done once.
        - raw_name is the Kushmerick data that is input
        - headered_name is the name of the CSV file with headers that is created
        - headered_name_pp is the name of the CSV file created by pre-processing the headered file """
    print 'preprocess', raw_name, '=>', headered_name, '=>', headered_name_pp
    header = csv.makeHeader()
    data = csv.readCsvRaw(raw_name)
    hdata = [header] + data
    assert(len(hdata) == len(data) + 1)
    csv.validateMatrix(hdata)
    #swapMatrixColumn(data, 3, -1)
    csv.writeCsv(headered_name, hdata)
    h2data = csv.readCsvRaw(headered_name)
    csv.replaceMissingValues(hdata)
    csv.writeCsv(headered_name_pp, hdata)
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print 'Usage: jython find_duplicate_attributes.py <arff-file>'
        sys.exit()
    base_filename = sys.argv[1]
    print base_filename

    relation, comments, attributes, data = arff.readArff(base_filename)
    sorted_data = sorted(data, key = lambda x: x[1:] + [x[0]])
    csv.writeCsv('temp.csv', sorted_data, [a['name'] for a in attributes])

    duplicates = []
    for i in range(1, len(sorted_data)):
        if sorted_data[i] == sorted_data[i-1]:
            duplicates.append(i)
    print 'duplicates', len(duplicates), duplicates

    num_attrs = len(attributes)

    def getHamming(d1, d2):
        hamming = 0
        for i in range(1, num_attrs):
            if d1[i] != d2[i]:
                hamming += 1
        return hamming
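    # --- Illustrative sketch, not part of the original script ---
    # getHamming() counts the attribute positions (class column excluded) where two instances
    # differ. One possible use, finding near-duplicate pairs among the sorted instances, could
    # look like this; find_near_duplicates() is a hypothetical helper, not defined elsewhere.
    def find_near_duplicates(sorted_instances, max_hamming=1):
        """Return (i-1, i, distance) for adjacent sorted instances differing in at most max_hamming attributes."""
        pairs = []
        for i in range(1, len(sorted_instances)):
            h = getHamming(sorted_instances[i-1], sorted_instances[i])
            if h <= max_hamming:
                pairs.append((i-1, i, h))
        return pairs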
if len(listGpu) > 1:
    choose = chooseItem(item, listGpu)
    if len(choose) > 0:
        listGpu = choose
if len(listGpu) > 1:
    choose = chooseItemPairs(item, listGpu)
    if len(choose) > 0:
        listGpu = choose
if len(listGpu) == 1:
    (gpuName, gpuNameList, gpuPerf, gpuLink) = listGpu[0]
    listRecognized.append([name, gpuName, price, gpuPerf, link, gpuLink])
else:
    print(link)
    unknown = []
    for gpu in listGpu:
        (gpuName, gpuNameList, gpuPerf, gpuLink) = gpu
        unknown += [gpuName, gpuPerf]
    listUnknown.append([link, name, price] + unknown)

print('Recognized: {0}\nUnknown: {1}'.format(len(listRecognized), len(listUnknown)))
#print(listRecognized)

# save merged list
csv.writeCsv(listRecognized, 'recognized.csv')
# save unrecognized list
csv.writeCsv(listUnknown, 'undefined.csv')
#print(listUnknown[-1])
#print(listSearch[-1])

timeDuration = -timeStart + time.time()
print('Duration: {0} seconds'.format(int(ceil(timeDuration))))
def analyzeTimeSeries(filename, max_lag, fraction_training):
    """ Main function. Analyze time series in 'filename' (assumed to be a CSV for now)
        Create model with up to max_lag lags
        Use the first fraction_training of data for training and the remainder for testing """
    base_name = os.path.splitext(filename)[0]
    regression_matrix_csv = base_name + '.regression.csv'
    results_filename = base_name + '.results'
    model_filename = base_name + '.model'
    prediction_matrix_csv = base_name + '.prediction.csv'

    """ Assume input file is a CSV with a header row """
    time_series_data, header = csv.readCsvFloat2(filename, True)

    """ Assume a weekly pattern """
    number_training = (int(float(len(time_series_data))*fraction_training)//7)*7
    print 'number_training', number_training, 'fraction_training', fraction_training, 'len(time_series_data)', len(time_series_data)
    assert(number_training > max_lag)

    time_series = NP.transpose(NP.array(time_series_data))
    describeNPArray('time_series', time_series)
    training_time_series = NP.transpose(NP.array(time_series_data[:number_training]))
    print 'training_time_series.shape', training_time_series.shape

    t = NP.arange(time_series.shape[1])
    training_t = NP.arange(training_time_series.shape[1])
    num_series = training_time_series.shape[0]
    num_rows = training_time_series.shape[1]

    days_to_keep = [getDaysOfWeekToKeep(training_time_series[i,:]) for i in range(num_series)]
    masks = [getDaysOfWeekMask(days_to_keep[i], time_series.shape[1]) for i in range(num_series)]
    training_masks = [getDaysOfWeekMask(days_to_keep[i], num_rows) for i in range(num_series)]
    trends = [getTrend(training_t, training_time_series[i,:], training_masks[i]) for i in range(num_series)]
    x = [removeTrend1D(trends[i], training_t, training_time_series[i], training_masks[i]) for i in range(num_series)]
    for i in range(num_series):
        describeNPVector('x[%0d]' % i, x[i])

    detrended_training_time_series = NP.zeros([num_series, x[0].shape[0]])
    print 'detrended_training_time_series.shape', detrended_training_time_series.shape
    for i in range(num_series):
        print 'x[%0d].shape' % i, x[i].shape
        detrended_training_time_series[i,:] = x[i]
    print 'detrended_training_time_series.shape', detrended_training_time_series.shape

    # filtered_time_series = NP.vstack([filterDaysOfWeek(training_time_series[i,:], days_to_keep[i]) for i in range(num_series)])
    # print 'filtered_time_series.shape', filtered_time_series.shape
    for i in range(num_series):
        describeNPVector('detrended_training_time_series[%0d]' % i, detrended_training_time_series[i])

    means, stddevs = timeSeriesToMatrixCsv(regression_matrix_csv, detrended_training_time_series, training_masks, max_lag)
    print 'means', means
    print 'stddevs', stddevs

    run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True, '-H 4')
    coefficients = run_weka.getCoefficients(results_filename)
    print '--------------------------------------------'
    print 'coefficients', len(coefficients)
    print coefficients
    print '--------------------------------------------'
    print 'means', len(means)
    print means
    print '--------------------------------------------'
    print 'stddevs', len(stddevs)
    print stddevs
    print '--------------------------------------------'
    #exit()

    detrended_full_x = [removeTrend1D(trends[i], t, time_series[i], masks[i]) for i in range(num_series)]
    detrended_time_series = NP.zeros([num_series, detrended_full_x[0].shape[0]])
    print 'detrended_time_series.shape', detrended_time_series.shape
    for i in range(num_series):
        print 'full_x[%0d].shape' % i, detrended_full_x[i].shape

    detrended_predictions = predictTimeSeries(coefficients, means, stddevs, t, detrended_full_x[0],
                                              detrended_full_x[1], number_training, max_lag, masks)
    predictions = addTrend1D(trends[1], t, detrended_predictions, masks[1])
    print '--------------------------------------------'
    print 'predictions =', predictions.shape
    # print predictions

    full_x = [NP.array(time_series[i]) for i in range(num_series)]
    print 't.shape', t.shape
    print 'full_x[0].shape', full_x[0].shape
    print 'full_x[1].shape', full_x[1].shape
    print 'predictions.shape', predictions.shape
    predicted_time_series = NP.vstack([t, full_x[0], full_x[1], predictions])
    print 'predicted_time_series.shape', predicted_time_series.shape

    # retrend !@#$
    prediction_header = ['t', 'x', 'y', 'y_pred']
    predicted_time_series_data = [[str(predicted_time_series[i,j]) for i in range(predicted_time_series.shape[0])]
                                  for j in range(predicted_time_series.shape[1])]
    csv.writeCsv(prediction_matrix_csv, predicted_time_series_data, prediction_header)
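# --- Illustrative sketch, not part of the original pipeline ---
# removeTrend1D()/addTrend1D() above detrend each series before the regression and put the
# trend back on the predictions. A minimal version of that round trip, assuming a simple
# least-squares straight-line trend (the project's getTrend() may fit something else), is:
import numpy as NP

def fit_linear_trend(t, y):
    """Return (slope, intercept) of the least-squares straight line through (t, y)."""
    slope, intercept = NP.polyfit(t, y, 1)
    return slope, intercept

def remove_linear_trend(trend, t, y):
    slope, intercept = trend
    return y - (slope * t + intercept)

def add_linear_trend(trend, t, y_detrended):
    slope, intercept = trend
    return y_detrended + (slope * t + intercept)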
def getGradeCounts():
    """ Read the data file. This is a set of grades """
    data, header = csv.readCsvRaw2(os.path.join(dir, data_file), True)

    """ Category is in column 1 """
    categories = set([instance[1] for instance in data])
    print 'categories -------------------------------'
    for cat in sorted(categories):
        print cat

    """ Grades end in _g """
    subject_grades_columns = [i for i, h in enumerate(header) if '_g' in h]
    num_subjects = len(subject_grades_columns)
    print 'grades columns names ---------------------'
    for i in range(num_subjects):
        print '%5d, %6d,' % (i, subject_grades_columns[i]), header[subject_grades_columns[i]]

    possible_grades = frozenset(['HD', 'D', 'C', 'P', 'N'])
    print 'grades -----------------------------------'
    for i in range(num_subjects):
        print '%-10s' % header[subject_grades_columns[i]], possible_grades

    """ counts = categories : subjects : grades """
    """ First create all the counters """
    counts = {}
    for cat in categories:
        counts[cat] = [{}.fromkeys(possible_grades, 0) for i in range(num_subjects)]

    """ Count all the category:subject:grade bins """
    for instance in data:
        cat = instance[1]
        cnt = counts[cat]
        for i in range(num_subjects):
            col = subject_grades_columns[i]
            v = instance[col]
            cnt[i][v] = cnt[i][v] + 1

    """ Calculate totals """
    totals = {}
    for cat in categories:
        totals[cat] = {}
        cnt = counts[cat]
        for i in range(num_subjects):
            for k in cnt[i].keys():
                totals[cat][k] = totals[cat].get(k, 0) + cnt[i][k]
        print 'totals[%s]' % cat, totals[cat]
        counts[cat] = cnt + [totals[cat]]

    header.append('total')
    subject_grades_columns.append(len(header)-1)
    num_subjects = num_subjects + 1
    print header

    """ Display the data as a .csv """
    count_header = ['subject']
    for i in range(num_subjects):
        count_header.append(header[subject_grades_columns[i]])
        for j in range(len(possible_grades)):
            count_header.append('')
    count_header2 = ['grade']
    for i in range(num_subjects):
        count_header2.append('')
        for k in possible_grades:
            count_header2.append(k)
    count_data = [count_header, count_header2]
    for cat in sorted(counts.keys()):
        row = ['']
        for i in range(num_subjects):
            row.append('cat_%s' % cat)
            for k in possible_grades:
                row.append(counts[cat][i][k])
        count_data.append(row)
    csv.writeCsv(os.path.join(dir, counts_file), transpose(count_data))
def runWekaOnTimeSeries(time_series_csv, max_lag, fraction_training):
    """ Run Weka training on a 2 column time series by converting it into a regression
        with max_lag x and y lags per instance.
        fraction_training is the fraction of the sample used for training. """
    base_name = os.path.splitext(time_series_csv)[0]
    regression_matrix_csv = base_name + '.regression.csv'
    results_filename = base_name + '.results'
    model_filename = base_name + '.model'
    predictions_filename = base_name + '.predict'
    test_filename = base_name + '.test.csv'
    evaluation_filename = base_name + '.evaluation.csv'

    time_series_data, _ = csv.readCsvFloat2(time_series_csv, True)
    number_training = (int(float(len(time_series_data))*fraction_training)//7)*7
    print 'number_training', number_training, 'fraction_training', fraction_training, 'len(time_series_data)', len(time_series_data)
    assert(number_training > max_lag)

    training_time_series = NP.transpose(NP.array(time_series_data[:number_training]))
    print '1: training_time_series.shape', training_time_series.shape
    if True:
        days_downloads = getDaysOfWeekToKeep(training_time_series[0,:])
        days_purchases = getDaysOfWeekToKeep(training_time_series[1,:])
        training_time_series = NP.vstack([filterDaysOfWeek(training_time_series[0,:], days_downloads),
                                          filterDaysOfWeek(training_time_series[1,:], days_purchases)])
        print '2: training_time_series.shape', training_time_series.shape
    if True:
        timeSeriesToMatrixCsv(regression_matrix_csv, training_time_series, max_lag)
        run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True)

    print 'number_training, training_time_series.shape[1]', number_training, training_time_series.shape[1]
    number_training_x = number_training  #- 5
    prediction_data = CP.deepcopy(time_series_data)
    prediction_data_downloads = [[row[0], 0] for row in prediction_data]

    for i in range(number_training_x, len(prediction_data)):
        if i % 7 in days_purchases:
            prediction_array = NP.transpose(NP.array(prediction_data[i-max_lag:i+1]))
            timeSeriesToMatrixCsv(test_filename, prediction_array, max_lag)
            run_weka.runMLPPredict(test_filename, model_filename, predictions_filename)
            prediction_list = run_weka.getPredictionsRegression(predictions_filename)
            print 'predictions', prediction_list
            prediction = prediction_list[0]['predicted']
            if False:
                prediction_array_downloads = NP.transpose(NP.array(prediction_data_downloads[i-max_lag:i+1]))
                timeSeriesToMatrixCsv(test_filename, prediction_array_downloads, max_lag)
                run_weka.runMLPPredict(test_filename, model_filename, predictions_filename)
                prediction_list_downloads = run_weka.getPredictionsRegression(predictions_filename)
                print 'predictions_downloads', prediction_list_downloads
                prediction_downloads = prediction_list_downloads[0]['predicted']
        else:
            prediction = -1
            prediction_downloads = -1
        prediction_data[i][1] = prediction
        #prediction_data[i] = [prediction_data[i][0], prediction, prediction_downloads]

    evaluation_data = []
    for i in range(len(prediction_data) - number_training_x):
        if i % 7 in days_purchases:
            row = [0]*5
            for j in [0, 1]:
                row[j] = time_series_data[number_training_x+i][j]
            row[2] = prediction_data[number_training_x+i][1]
            row[3] = abs(row[2] - row[1])
            row[4] = row[3]/abs(row[2] + row[1]) if abs(row[2] + row[1]) else row[3]
            evaluation_data.append([number_training_x+i] + row)
    evaluation_header = ['i', 'x', 'y_actual', 'y_predicted', 'abs_error', 'normalized_error']
    csv.writeCsv(evaluation_filename, evaluation_data, evaluation_header)
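# --- Illustrative sketch, not part of the original pipeline ---
# The prediction loop above is a walk-forward scheme: each test day is predicted from the
# previous max_lag values, and the prediction is written back into the series so later days
# see it. A stripped-down version with a generic predict_fn standing in for the Weka model:
def walk_forward_predict(values, number_training, max_lag, predict_fn):
    """values: list of floats. Returns a copy with indices >= number_training replaced by predictions."""
    data = list(values)
    for i in range(number_training, len(data)):
        window = data[i - max_lag:i]     # the last max_lag values, including earlier predictions
        data[i] = predict_fn(window)
    return data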
    'RFCD.Percentage.1', 'Number.of.Unsuccessful.Grant', 'SEO.Percentage.2',
    'Number.of.Successful.Grant', 'Start.date']

def makeAttrs(data_dict, numeric_keys):
    header = sorted(data_dict.keys(), key = lambda x: ' ' if x == 'Grant.Status' else x)
    attrs = {}
    for k, v in data_dict.items():
        if k in numeric_keys:
            attrs[k] = 'numeric'
        else:
            attrs[k] = sorted(set(x for x in v if x not in NO_VALUES))
    return header, attrs

header, attrs = makeAttrs(data_dict_many, numeric_keys)
columns_many = [data_dict_many[k] for k in header]
data_many = misc.transpose(columns_many)
data_many.sort(key = lambda x: -getNumElements(x))
print header

arff.writeArff2(outname + '.arff', None, 'relation', header, attrs, data_many[:10000])
csv.writeCsv(outname + '.csv', data_many, header)

if False:
    name = 'SEO.Code.4'
    keys, histo = getFreqHisto(data_dict[name])
    print name, ['%s:%d' % (k, histo[k]) for k in keys]
def preprocessSoybeanData():
    """ Pre-process the Soybean data set downloaded from
        http://archive.ics.uci.edu/ml/machine-learning-databases/soybean/ """

    """ Read the data files """
    training_data = csv.readCsvRaw(os.path.join(dir, training_file))
    test_data = csv.readCsvRaw(os.path.join(dir, test_file))

    """ Combined data file """
    combined_data = test_data + training_data
    print 'combined data', len(combined_data), len(combined_data[0])

    """ Random data file where the percentage of each class and attribute matches the combined data """
    random_data = getRandomData(combined_data)

    """ Find the duplicate instances in each data set.
        The number of duplicates in random_data provides an estimate of the number of duplicates
        that would occur in the real data sets by pure chance """
    training_duplicates = getDuplicates(training_data)
    print 'training_duplicates =', len(training_duplicates)
    test_duplicates = getDuplicates(test_data)
    print 'test_duplicates =', len(test_duplicates)
    combined_duplicates = getDuplicates(combined_data)
    print 'combined_duplicates =', len(combined_duplicates)
    random_duplicates = getDuplicates(random_data)
    duplicates_warning = '*** Data files should not contain duplicates!' if len(random_duplicates) == 0 else ''
    print 'random_duplicates =', len(random_duplicates), duplicates_warning

    """ Remove duplicate instances within each data set.
        We know removing duplicates is valid if len(random_duplicates) is zero """
    filtered_training_data = removeDuplicates(training_data, training_duplicates, False)
    filtered_test_data = removeDuplicates(test_data, test_duplicates, False)
    filtered_combined_data = removeDuplicates(combined_data, combined_duplicates, False)
    filtered_random_data = removeDuplicates(random_data, random_duplicates, False)

    """ Remove the instances in duplicate-free test data that duplicate instances in duplicate-free training data """
    all_duplicates = getDuplicates(filtered_training_data + filtered_test_data)
    filtered_test_data = removeDuplicates(filtered_test_data, all_duplicates, True)

    """ Sanity check """
    assert(len(filtered_test_data) + len(filtered_training_data) + len(combined_duplicates) == len(combined_data))

    """ Write out the intermediate .csv files with duplicates marked for debugging """
    csv.writeCsv(appendDescription(dir, training_file, 'sorted'), markDuplicates(training_data))
    csv.writeCsv(appendDescription(dir, test_file, 'sorted'), markDuplicates(test_data))
    csv.writeCsv(appendDescription(dir, combined_file, 'sorted'), markDuplicates(combined_data))
    csv.writeCsv(appendDescription(dir, random_file, 'sorted'), markDuplicates(random_data))

    """ Read the names of the classes and attributes from downloaded files """
    classes = parseClasses(os.path.join(dir, classes_file))
    attrs = parseAttrs(os.path.join(dir, attrs_file))

    """ Add class and attribute names to original data, for comparison with filtered data """
    original_named_training_data = applyAttrs(training_data, attrs)
    original_named_test_data = applyAttrs(test_data, attrs)
    original_named_combined_data = applyAttrs(combined_data, attrs)

    """ Add class and attribute names to filtered data """
    named_training_data = applyAttrs(filtered_training_data, attrs)
    named_test_data = applyAttrs(filtered_test_data, attrs)
    named_combined_data = applyAttrs(filtered_combined_data, attrs)
    named_random_data = applyAttrs(filtered_random_data, attrs)

    """ Get the class distribution """
    class_distribution_training = getClassDistribution(named_training_data)
    class_distribution_test = getClassDistribution(named_test_data)
    class_distribution_combined = getClassDistribution(named_combined_data)

    named_training_data = removeClassesWithFewerInstances(named_training_data, class_distribution_training, 2)

    """ Create a header row for the .csv file """
    header = makeHeaderRow(attrs)

    """ Write out the .csv files """
    csv.writeCsv(appendDescription(dir, training_file, 'distribution'),
                 dictToMatrix(class_distribution_training), ['Class', 'Number'])
    csv.writeCsv(appendDescription(dir, training_file, 'orig'), named_training_data, header)
    csv.writeCsv(appendDescription(dir, test_file, 'orig'), named_test_data, header)
    csv.writeCsv(appendDescription(dir, combined_file, 'orig'), named_combined_data, header)
    csv.writeCsv(appendDescription(dir, training_file, 'named'), original_named_training_data, header)
    csv.writeCsv(appendDescription(dir, test_file, 'named'), original_named_test_data, header)
    csv.writeCsv(appendDescription(dir, combined_file, 'named'), original_named_combined_data, header)

    """ Write out the .arff files """
    writeArff(buildPath(dir, training_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_training_data)
    writeArff(buildPath(dir, test_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_test_data)
    writeArff(buildPath(dir, combined_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_combined_data)
    writeArff(buildPath(dir, training_file, '.arff'), 'soybean', classes, attrs, named_training_data)
    writeArff(buildPath(dir, test_file, '.arff'), 'soybean', classes, attrs, named_test_data)
    writeArff(buildPath(dir, combined_file, '.arff'), 'soybean', classes, attrs, named_combined_data)
    writeArff(buildPath(dir, random_file, '.arff'), 'soybean', classes, attrs, named_random_data)
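# --- Illustrative sketch, not part of the original pipeline ---
# getDuplicates()/removeDuplicates() above work on exact duplicate instances. A minimal,
# self-contained way to find the indices of rows that repeat an earlier row (the project's
# versions may report duplicates differently) is:
def simple_get_duplicates(data):
    """Return the indices of rows that exactly repeat an earlier row."""
    seen = set()
    duplicates = []
    for i, row in enumerate(data):
        key = tuple(row)
        if key in seen:
            duplicates.append(i)
        else:
            seen.add(key)
    return duplicates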
import csv

CSV_FILE_NAME = 'search.csv'

# start timer and go
timeStart = time.time()
search = SearchDownload()

# download first search result page
html = search.getData()

# count pages of search results
bsSearch = BeautifulSoup(html, "html.parser")
pagesCount = parse_search.pagesCount(bsSearch)

# get items from search results
itemList = parse_search.itemList(bsSearch)
#print(itemList)
csv.writeCsv(itemList, CSV_FILE_NAME)

# download all pages of search results
for page in range(2, pagesCount + 1):
    time.sleep(1)
    html = search.getData(page)
    bsSearch = BeautifulSoup(html, "html.parser")
    itemList = parse_search.itemList(bsSearch)
    csv.writeCsv(itemList, CSV_FILE_NAME, append=True)

timeDuration = -timeStart + time.time()
print('Duration: {0} seconds'.format(int(ceil(timeDuration))))