Example n. 1
def selectAttibutesGA():
	matrix = csv.readCsvRaw(csv.headered_name_pca_corr)
	num_attributes = len(matrix[0])-1
	if False:
		num_subset = 5
		num_trials = max(100, num_attributes*2)
		results = findBestOfSize(matrix, num_subset, num_trials)
		order = orderByResults(results,num_attributes)
	if True:
		sort_order = [i for i in range(num_attributes)]
		for num_subset in range(5, num_attributes, 5):
			num_trials = max(100, num_attributes*2)
			csv_matrix_name  = csv.makeCsvPath('subset.matrix' +('%03d'%num_subset))
			csv_results_name = csv.makePath('subset.results'+('%03d'%num_subset))
			csv_best_name    = csv.makeCsvPath('subset.best'   +('%03d'%num_subset))
			csv_summary_name  = csv.makeCsvPath('subset.summary'+('%03d'%num_subset))
		
			ordered_matrix = pca.reorderMatrix(matrix, sort_order)
			csv.writeCsv(csv_matrix_name, ordered_matrix)
			
			results = findBestOfSize(ordered_matrix, num_subset, num_trials, csv_summary_name)
			
			sort_order = orderByResults(results,num_attributes)
			#c_x = results[0].columns + [-1]      # include outcome
			#sub_matrix = [[row[i] for i in c_x] for row in ordered_matrix]
			#csv.writeCsv(csv_best_name,sub_matrix, )
			if not is_testing:
				shutil.copyfile(results[0]['csv'],csv_best_name)
				shutil.copyfile(results[0]['results'],csv_results_name)
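The driver above reorders the matrix columns between rounds with pca.reorderMatrix, which is not shown. A minimal sketch of such a column-reordering helper, assuming the outcome sits in the last column and should stay there (the helper name and layout here are assumptions):
def reorder_columns(matrix, sort_order):
    """ Return a copy of matrix with its attribute columns permuted by sort_order.
        The last column is assumed to be the outcome and is kept in place. """
    return [[row[i] for i in sort_order] + [row[-1]] for row in matrix]

# Example: put the third attribute first in a 4-attribute matrix
example = [[1, 2, 3, 4, 'yes'], [5, 6, 7, 8, 'no']]
print(reorder_columns(example, [2, 0, 1, 3]))  # [[3, 1, 2, 4, 'yes'], [7, 5, 6, 8, 'no']]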
Example n. 2
def findAutoCorrelations(time_series_csv, max_lag, fraction_training):
    """ Run auto-correlations independently 2 column time series 
        by converting into a regression with max_lag x and y lags
        per instance 
        fraction_training is the fraction of sample used for training
    """  
    base_name = os.path.splitext(time_series_csv)[0]
    auto_correlation_matrix_csv = base_name + '.autocorrelation.csv'
    time_series_data,header = csv.readCsvFloat2(time_series_csv, True)
    number_training = int(float(len(time_series_data))*fraction_training)
    print 'number_training', number_training, 'fraction_training', fraction_training,'len(time_series_data)',len(time_series_data)
    assert(number_training > max_lag)
    time_series = NP.transpose(NP.array(time_series_data))
    days_downloads = getDaysOfWeekToKeep(time_series[0,:number_training])
    days_purchases = getDaysOfWeekToKeep(time_series[1,:number_training])
    print days_downloads
    print days_purchases
    exit()
    removeOutlierDaysOfWeek(time_series[1,:number_training])
    removeOutiers(time_series[1,:number_training], 0.8)
    downloads = time_series[0,:number_training]
    purchases = time_series[1,:number_training]
    #auto_correlations = [getAutoCorrelation(time_series[i,:number_training], max_lag) for i in range(time_series.shape[2])]
    #return (getAutoCorrelation(downloads, max_lag),getAutoCorrelation(purchases, max_lag))
    auto_correlation_data = NP.hstack([getAutoCorrelation(downloads, max_lag),getAutoCorrelation(purchases, max_lag)])
    csv.writeCsv(auto_correlation_matrix_csv, list(auto_correlation_data), header)
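getAutoCorrelation is defined elsewhere; a standalone NumPy sketch of one common way to compute the lag-1..max_lag autocorrelation coefficients (the exact definition used by the real helper is an assumption):
import numpy as NP

def auto_correlation(series, max_lag):
    """ Correlation of a series with itself shifted by 1..max_lag samples. """
    x = NP.asarray(series, dtype=float)
    x = x - x.mean()
    denom = NP.dot(x, x)
    return NP.array([NP.dot(x[:-lag], x[lag:]) / denom for lag in range(1, max_lag + 1)])

print(auto_correlation([1, 2, 3, 4, 5, 4, 3, 2, 1, 2], 3))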
Example n. 3
def getPCAProjections(input_filename):
	""" Run PCA on the Kushmerick ad data
        Stop when there are sufficient PCA components to explain threshold_variance
        Project input data onto the top PCA components that explain threshold_variance
        Normalize this data
        Sort attributes by their correlation with output
        - input_filename : preprocessed data
        - Output 'pca': data projected onto PCA components
        		 'norm': pca data normalized to std deviation 1
        		 'corr': normalized data sorted by correlation with output
        		 'index': correlation index used for the sort
  	"""
  	print 'getPCAProjections:', input_filename
 	explained_variance = 0.99
 	root_name = 'pca%03d' % round(explained_variance * 100.0)
 	pca_filename = csv.makeCsvPath(root_name)
 	pca_norm_filename = csv.makeCsvPath(root_name + '.norm')
 	pca_norm_corr_filename = csv.makeCsvPath(root_name + '.norm.corr')
 	corr_index_filename = csv.makeCsvPath(root_name + '.corr.idx')
    
	pca.pcaAdData(explained_variance, input_filename, pca_filename)
	pca.normalizeData(pca_filename, pca_norm_filename)    
	
	sort_order, corr_index = pca.rankByCorrelationWithOutcomes(pca_norm_filename)
	def reorder(in_cells):
	    return pca.reorderMatrix(in_cells, sort_order)
	csv.modifyCsvRaw(pca_norm_filename, pca_norm_corr_filename, reorder)
	csv.writeCsv(corr_index_filename, corr_index)
	
	return {'pca': pca_filename, 'norm':pca_norm_filename, 'corr':pca_norm_corr_filename, 'index':corr_index_filename}
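pcaAdData and rankByCorrelationWithOutcomes live in the pca module, which is not shown. As a standalone sketch of the central step, here is one way to pick the fewest principal components that explain a variance threshold and project the data onto them, using NumPy's SVD (an illustration, not the pca module's actual implementation):
import numpy as NP

def project_onto_pca(data, threshold_variance=0.99):
    """ Project the rows of data onto the fewest principal components
        whose cumulative explained variance reaches threshold_variance. """
    x = NP.asarray(data, dtype=float)
    x = x - x.mean(axis=0)                        # centre each column
    _, s, vt = NP.linalg.svd(x, full_matrices=False)
    explained = NP.cumsum(s ** 2) / NP.sum(s ** 2)
    k = int(NP.searchsorted(explained, threshold_variance)) + 1
    return x.dot(vt[:k].T)                        # one row per instance, k columns

projected = project_onto_pca(NP.random.rand(20, 6), 0.95)
print(projected.shape)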
Example n. 4
def timeSeriesToMatrixCsv(regression_matrix_csv, time_series, max_lag):
    """ Convert a 2 row time series into a 
    regression matrix """
    regression_matrix = timeSeriesToMatrixArray(time_series, max_lag)
    header_x = ['x[%0d]' % i for i in range(-max_lag,0)]
    header_y = ['y[%0d]' % i for i in range(-max_lag,1)]
    header = header_x + header_y
    csv.writeCsv(regression_matrix_csv, list(regression_matrix), header)
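timeSeriesToMatrixArray is not shown; a small sketch of the conversion the header describes, where each instance holds the previous max_lag x values, the previous max_lag y values, and the current y value as the regression target (an assumption based on the header layout above):
def time_series_to_rows(x, y, max_lag):
    """ One row per time step t >= max_lag:
        x[t-max_lag .. t-1], y[t-max_lag .. t-1], and y[t] as the target. """
    rows = []
    for t in range(max_lag, len(y)):
        rows.append(list(x[t - max_lag:t]) + list(y[t - max_lag:t + 1]))
    return rows

x = [10, 12, 11, 13, 14, 15]
y = [1, 2, 2, 3, 3, 4]
for row in time_series_to_rows(x, y, 2):
    print(row)   # e.g. [10, 12, 1, 2, 2] for t = 2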
Example n. 5
def makeTimeSeriesCsv(filename, purchase_max_lag, number_days,
                      mean_downloads_per_day, mean_purchases_per_download,
                      mean_other_purchases):
    (downloads, purchases) = makeTimeSeries(purchase_max_lag, number_days,
                                            mean_downloads_per_day,
                                            mean_purchases_per_download,
                                            mean_other_purchases)
    data = zip(downloads, purchases)
    csv.writeCsv(filename, data, ['downloads', 'purchases'])
Example n. 6
def findBestAttributes(output_dir, base_filename, algo_key, data, attributes, is_inclusive):
    num_attrs = len(attributes) - 1

    # Track results for each round
    series_results = []
    # Track best results in each round for back-tracking. This makes series_results redundant
    all_results = []
    

    def getBestPreviousSubsets(subset_size):
        """ Construct best subset of size <subset_size> from top performers of each smaller subset size """
        best_previous_subsets = []
        for results in all_results:
            subset = set()
            for r in results:
                part_subset = [i for i in r['subset'] if i != class_index]
                for i in part_subset:
                    subset.add(i)
                    if len(subset) >= subset_size: break
                if len(subset) >= subset_size: break
            if len(subset) == subset_size:
                subset.add(class_index)
                best_previous_subsets.append(sorted(subset))
        print 'best_previous_subsets =', best_previous_subsets
        return best_previous_subsets
                    
        
    num_attrs_start = 1 if is_inclusive else 0

    # Loop through subset sizes of the attributes, smallest first
    # TODO: go the whole way through all subset sizes
    for subset_size in range(num_attrs_start, (len(attributes) + 3)//4):
        if subset_size == num_attrs_start:
            if is_inclusive:
                results = [getSubsetResultDict(output_dir, algo_key, data, attributes, [class_index, i], True) for i in range(num_attrs) if i != class_index]
            else:
                results = [getSubsetResultDict(output_dir, algo_key, data, attributes, [], is_inclusive)]
        else:
            best_previous_subsets = getBestPreviousSubsets(subset_size) if is_inclusive else []
            results = findBestAttributesForSubsetSize(output_dir, base_filename, algo_key, data, attributes, results, subset_size, is_inclusive, best_previous_subsets)
        results.sort(key = lambda x: -x['score'])
        results[0]['num_attrs'] = num_attrs-subset_size
        series_results.append(results[0])
        all_results.append(results[:100])

        # Write out the results
        out_filename = makeFileName(output_dir, base_filename, algo_key, -1, 'csv')
        header = getCsvResultHeader()
        results_matrix = [getCsvResultRow(r, attributes, is_inclusive) for r in series_results]
        csv.writeCsv(out_filename, results_matrix, header)
        
        out_num_attrs_filename = makeFileName(output_dir, base_filename, algo_key + '.%02d'%subset_size, -1, 'csv')
        header = getCsvResultHeader()
        results_matrix = [getCsvResultRow(r, attributes, is_inclusive) for r in results[:100]]
        csv.writeCsv(out_num_attrs_filename, results_matrix, header)
        
    return series_results
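As a compact illustration of the same idea, here is a sketch of a greedy, beam-limited forward selection with a pluggable scoring function; score_fn stands in for the Weka runs performed by getSubsetResultDict and findBestAttributesForSubsetSize (all names below are illustrative):
def greedy_forward_selection(num_attrs, max_size, score_fn, beam_width=100):
    """ Grow attribute subsets one attribute at a time, keeping the
        best beam_width subsets of each size. score_fn(subset) -> float. """
    frontier = [()]                              # subsets of the current size
    best_per_size = []
    for size in range(1, max_size + 1):
        candidates = set()
        for subset in frontier:
            for a in range(num_attrs):
                if a not in subset:
                    candidates.add(tuple(sorted(subset + (a,))))
        scored = sorted(candidates, key=score_fn, reverse=True)
        frontier = scored[:beam_width]
        best_per_size.append((list(frontier[0]), score_fn(frontier[0])))
    return best_per_size

# Toy score that favours attributes 2 and 5
toy_score = lambda s: sum(1.0 for a in s if a in (2, 5)) + 0.01 * sum(s)
print(greedy_forward_selection(8, 3, toy_score))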
Example n. 7
def wekaToPredict(weka_filename):
    """ Convert a Weka formatted .cvs file to a Google
        Predict .csv file by moving class from last 
        column to first
    """
    parts = os.path.splitext(weka_filename)
    predict_filename = parts[0] + '.gp' + parts[1]
    print 'wekaToPredict:', weka_filename,'=>', predict_filename
    weka = csv.readCsvRaw(weka_filename)
    predict = [[w[-1]] + w[:-1] for w in weka]
    csv.writeCsv(predict_filename, predict) 
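The transformation itself is a one-line rotation of each row; a tiny worked example, assuming the class label is in the last column:
rows = [['3.2', '1.1', 'spam'], ['0.4', '2.2', 'ham']]
predict_rows = [[r[-1]] + r[:-1] for r in rows]   # class moves from last to first
print(predict_rows)   # [['spam', '3.2', '1.1'], ['ham', '0.4', '2.2']]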
Example n. 8
def testMatrixMLP(matrix, columns, opts = mlp_opts):
	"Run MLP on attributes with index in columns"
	c_x = columns + [-1]      # include outcome
	sub_matrix = [[row[i] for i in c_x] for row in matrix]
	temp_base = csv.makeTempPath('subset'+('%03d'%len(columns))+'_')
	temp_csv = temp_base + '.csv'
	temp_results = temp_base + '.results'
	if is_testing:
		num_attributes = len(matrix[0]) - 1
		accuracy,dt = 1.0/float(sum([abs(x-num_attributes/2) for x in columns])), 0.1
	else:
		csv.writeCsv(temp_csv, sub_matrix)
		accuracy,dt = runMLPTrain(temp_csv, temp_results, opts)
	return (accuracy, temp_csv, temp_results, dt)
Example n. 9
def timeSeriesToMatrixCsv(regression_matrix_csv, time_series, masks, max_lag):
    """ Convert a 2 row time series into a 
    regression matrix """
    regression_matrix,regression_mask, means, stddevs = timeSeriesToMatrixArray(time_series, masks, max_lag)
    header_x = ['x[%0d]' % i for i in range(-max_lag,0)]
    header_y = ['y[%0d]' % i for i in range(-max_lag,1)]
    header = header_x + header_y
    regression_data = [[str(regression_matrix[i,j]) if regression_mask[i,j] else '?' 
                        for j in range(regression_matrix.shape[1])]
                            for i in range(regression_matrix.shape[0])]
    # Eliminate rows with no output 
    regression_data = [x for x in regression_data if not x[len(x)-1] == '?']
    print regression_data[0]
        
    csv.writeCsv(regression_matrix_csv, regression_data, header)
    return (means, stddevs) 
Example n. 10
def normalizeData(in_fn, out_fn):
    """ Normalize ad data to equal std dev
        in_fn : read input data from this csv file
        out_fn : write output data to this csv file
    """
    print 'normalizeData:', in_fn, '=>', out_fn
    in_cells = csv.readCsvRaw(in_fn)
    csv.validateMatrix2(in_cells)
  
    # Remove the header row on top and the category column on the right
    in_data = [[float(e.strip()) for e in row[:-1]] for row in in_cells[1:3280]]
    print 'data', len(in_data), len(in_data[0])
        
    out_data = normalizeMatrix(in_data)
    print 'out_data', len(out_data), len(out_data[0])  
    
    out_cells = [in_cells[0]] + [out_data[i-1] + [in_cells[i][-1]] for i in range(1,len(in_cells))]  
    csv.writeCsv(out_fn, out_cells)
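normalizeMatrix is assumed to scale every attribute column to standard deviation 1; a minimal NumPy sketch of that operation (whether the real helper also centres the data is unknown, so this version does not):
import numpy as NP

def normalize_columns(matrix):
    """ Scale each column to standard deviation 1; constant columns are left unchanged. """
    x = NP.asarray(matrix, dtype=float)
    std = x.std(axis=0)
    std[std == 0.0] = 1.0                 # avoid dividing a constant column by zero
    return (x / std).tolist()

print(normalize_columns([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]]))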
Example n. 11
def pcaAdData(threshold_variance, in_filename, out_filename):   
    """ Run PCA on the Kushmerick ad data
        Stop when there are sufficient PCA components to explain threshold_variance
        Project input data onto these PCA components
        - in_filename : input data read from this CSV file
        - out_filename : output data written to this CSV file
    """
    h2data = csv.readCsvRaw(in_filename)
    csv.validateMatrix(h2data)
    
    # Boolean data are columns 3 to second last
    bool_data = [[float(e) for e in v[3:-1]] for v in h2data[1:]]
    print 'bool_data', len(bool_data), len(bool_data[0])
    x = array(bool_data)
    
    # Find the output dimension (#basis vectors) required to explain
    # threshold_variance
    print 'output_dim, explained_variance, time(sec)' 
    for odim in range(50, len(x[0]), 50):
        start_time = time.clock()
        pcanode = mdp.nodes.PCANode(svd=True, output_dim = odim, dtype='float64')
        pcanode.train(x)
        p = pcanode.get_projmatrix()
        d = pcanode.output_dim
        print '%10d' % d, ',',
        v = pcanode.explained_variance
        print '%15.03f' % v, ',',
        print '%6.1f' % (time.clock() - start_time)
        if v >= threshold_variance:
            break
    #print '-----------------------------1'
    print 'p', len(p), len(p[0]) 
    #print '-----------------------------2'
    # Project the data onto the PCA components    
    xfd = dot(x, p)    
    pca = [[x for x in row] for row in xfd]
    print 'pca', len(pca), len(pca[0])    
    pca_header = ['pca_%03d' % i for i in range(len(pca[0]))]
    header = h2data[0][:3] + pca_header + [h2data[0][-1]]
    num_data = [h2data[i+1][:3] + pca[i] + [h2data[i+1][-1]] for i in range(len(h2data)-1)] 
    data = [header] + num_data   
    csv.writeCsv(out_filename, data)
Example n. 12
def preprocess(raw_name, headered_name, headered_name_pp):
	"""	Add headers and pre-process the raw Kushmerick  data. 
		This needs to be done once.
		- raw_name is the Kushmerick data that is input
		- headered_name is the name of CSV file with headers that is created
		- headered_name_pp is the named a file created by preprocessing header name that is created
	"""
	print 'preprocess', raw_name, '=>', headered_name, '=>', headered_name_pp
	header = csv.makeHeader()
	data = csv.readCsvRaw(raw_name)
    
	hdata = [header] + data
	assert(len(hdata)==len(data)+1)
	csv.validateMatrix(hdata)

	#swapMatrixColumn(data, 3, -1)
	csv.writeCsv(headered_name, hdata)
	h2data = csv.readCsvRaw(headered_name)
    
	csv.replaceMissingValues(hdata)
	csv.writeCsv(headered_name_pp, hdata)
Example n. 13
if __name__ == '__main__':

	if len(sys.argv) < 2:
		print 'Usage: jython find_duplicate_attributes.py  <arff-file>'
		sys.exit()

	base_filename = sys.argv[1]
	
	print base_filename

	relation, comments, attributes, data = arff.readArff(base_filename)

	sorted_data = sorted(data, key = lambda x: x[1:] + [x[0]])

	csv.writeCsv('temp.csv', sorted_data, [a['name'] for a in attributes])

	duplicates = []
	for i in range(1, len(sorted_data)):
		if sorted_data[i] == sorted_data[i-1]:
			duplicates.append(i)

	print 'duplicates', len(duplicates), duplicates
	
	num_attrs = len(attributes)
	def getHamming(d1, d2):
		hamming = 0
		for i in range(1, num_attrs):
			if d1[i] != d2[i]:
				hamming += 1
		return hamming
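getHamming counts attribute disagreements between two instances (the class in column 0 is excluded). A standalone sketch that reuses the same distance to flag exact or near duplicates (helper names are illustrative):
def hamming(d1, d2):
    """ Number of attribute positions (excluding column 0, the class) where two rows differ. """
    return sum(1 for a, b in zip(d1[1:], d2[1:]) if a != b)

def near_duplicates(rows, max_distance=0):
    """ Pairs of row indexes whose attributes differ in at most max_distance places. """
    pairs = []
    for i in range(len(rows)):
        for j in range(i + 1, len(rows)):
            if hamming(rows[i], rows[j]) <= max_distance:
                pairs.append((i, j))
    return pairs

rows = [['a', 1, 2, 3], ['b', 1, 2, 3], ['c', 9, 2, 3]]
print(near_duplicates(rows))                   # [(0, 1)]  -- exact attribute duplicates
print(near_duplicates(rows, max_distance=1))   # [(0, 1), (0, 2), (1, 2)]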
Example n. 14
def makeTimeSeriesCsv(filename, purchase_max_lag, number_days, mean_downloads_per_day, mean_purchases_per_download, mean_other_purchases):
    (downloads, purchases) = makeTimeSeries(purchase_max_lag, number_days, mean_downloads_per_day, mean_purchases_per_download, mean_other_purchases) 
    data = zip(downloads, purchases)
    csv.writeCsv(filename, data, ['downloads', 'purchases'])           
Example n. 15
    if len(listGpu) > 1:
        choose = chooseItem(item, listGpu)
        if len(choose) > 0: listGpu = choose
    if len(listGpu) > 1:
        choose = chooseItemPairs(item, listGpu)
        if len(choose) > 0: listGpu = choose
    if len(listGpu) == 1:
        (gpuName, gpuNameList, gpuPerf, gpuLink) = listGpu[0]
        listRecognized.append([name, gpuName, price, gpuPerf, link, gpuLink])
    else:
        print(link)
        unknown = []
        for gpu in listGpu:
            (gpuName, gpuNameList, gpuPerf, gpuLink) = gpu
            unknown += [gpuName, gpuPerf]
        listUnknown.append([link, name, price] + unknown)
print('Recognized: {0}\nUnknown: {1}'.format(len(listRecognized),
                                             len(listUnknown)))
#print(listRecognized)

# save merged list
csv.writeCsv(listRecognized, 'recognized.csv')

# save unrecognized list
csv.writeCsv(listUnknown, 'undefined.csv')
#print(listUnknown[-1])
#print(listSearch[-1])

timeDuration = -timeStart + time.time()
print('Duration: {0} seconds'.format(int(ceil(timeDuration))))
Example n. 16
def analyzeTimeSeries(filename, max_lag, fraction_training):
    """ Main function. 
        Analyze time series in 'filename' (assumed to be a CSV for now)
        Create a model with up to max_lag lags
        Use the first fraction_training of data for training and the 
        remainder for testing
    """
    
    base_name = os.path.splitext(filename)[0]
    regression_matrix_csv = base_name + '.regression.csv'
    results_filename = base_name + '.results' 
    model_filename = base_name + '.model' 
    prediction_matrix_csv = base_name + '.prediction.csv'
    
    """ Assume input file is a CSV with a header row """
    time_series_data, header = csv.readCsvFloat2(filename, True)
    
    """ Assume a weekly pattern """
    number_training = (int(float(len(time_series_data))*fraction_training)//7)*7
    
    print 'number_training', number_training, 'fraction_training', fraction_training,'len(time_series_data)',len(time_series_data)
    assert(number_training > max_lag)
    
    time_series = NP.transpose(NP.array(time_series_data))
    describeNPArray('time_series', time_series)
        
    training_time_series = NP.transpose(NP.array(time_series_data[:number_training]))
    print 'training_time_series.shape', training_time_series.shape
    
    t = NP.arange(time_series.shape[1])
    training_t = NP.arange(training_time_series.shape[1])
    
    num_series = training_time_series.shape[0]
    num_rows = training_time_series.shape[1]
    
    days_to_keep = [getDaysOfWeekToKeep(training_time_series[i,:]) for i in range(num_series)]
    
    masks = [getDaysOfWeekMask(days_to_keep[i], time_series.shape[1]) for i in range(num_series)]
    training_masks = [getDaysOfWeekMask(days_to_keep[i], num_rows) for i in range(num_series)]
    
    trends = [getTrend(training_t, training_time_series[i,:], training_masks[i]) for i in range(num_series)]
    
    x = [removeTrend1D(trends[i], training_t, training_time_series[i], training_masks[i]) for i in range(num_series)]
    for i in range(num_series):
        describeNPVector('x[%0d]'%i, x[i])
    detrended_training_time_series = NP.zeros([num_series, x[0].shape[0]])
    print 'detrended_training_time_series.shape', detrended_training_time_series.shape
    for i in range(num_series):
        print 'x[%0d].shape'%i, x[i].shape
        detrended_training_time_series[i,:] = x[i]
    print 'detrended_training_time_series.shape', detrended_training_time_series.shape
    # filtered_time_series = NP.vstack([filterDaysOfWeek(training_time_series[i,:], days_to_keep[i]) for i in range(num_series)])
    # print 'filtered_time_series.shape', filtered_time_series.shape
   
    for i in range(num_series):
        describeNPVector('detrended_training_time_series[%0d]'%i, detrended_training_time_series[i])
        
    means, stddevs = timeSeriesToMatrixCsv(regression_matrix_csv, detrended_training_time_series, training_masks, max_lag)
    print 'means', means
    print 'stddevs', stddevs
    run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True, '-H 4')
    coefficients = run_weka.getCoefficients(results_filename)
   
    print '--------------------------------------------'
    print 'coefficients', len(coefficients)
    print coefficients
    print '--------------------------------------------'
    print 'means', len(means)
    print means
    print '--------------------------------------------'
    print 'stddevs', len(stddevs)
    print stddevs
    print '--------------------------------------------'
    #exit()
    detrended_full_x = [removeTrend1D(trends[i], t, time_series[i], masks[i]) for i in range(num_series)]
    detrended_time_series = NP.zeros([num_series, detrended_full_x[0].shape[0]])
    print 'detrended_time_series.shape', detrended_time_series.shape
    for i in range(num_series):
        print 'full_x[%0d].shape'%i, detrended_full_x[i].shape
    detrended_predictions = predictTimeSeries(coefficients, means, stddevs, t, detrended_full_x[0], detrended_full_x[1], number_training, max_lag, masks)
    predictions = addTrend1D(trends[1], t, detrended_predictions, masks[1]) 
    print '--------------------------------------------'
    print 'predictions =', predictions.shape
    # print predictions
    full_x = [NP.array(time_series[i]) for i in range(num_series)]
    
    print 't.shape', t.shape
    print 'full_x[0].shape', full_x[0].shape
    print 'full_x[1].shape', full_x[1].shape
    print 'predictions.shape', predictions.shape
    
    predicted_time_series = NP.vstack([t, full_x[0], full_x[1], predictions])
    
    print 'predicted_time_series.shape', predicted_time_series.shape
    # TODO: retrend
    prediction_header = ['t', 'x', 'y', 'y_pred']
    predicted_time_series_data = [[str(predicted_time_series[i,j]) 
                                    for i in range(predicted_time_series.shape[0])]
                                        for j in range(predicted_time_series.shape[1])]
                            
    csv.writeCsv(prediction_matrix_csv, predicted_time_series_data, prediction_header)
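getTrend, removeTrend1D and addTrend1D are not shown; a minimal sketch of one plausible reading of them, fitting a linear trend on the masked-in samples and subtracting or adding it back (this is an assumption about those helpers, not their actual code):
import numpy as NP

def fit_linear_trend(t, x, mask):
    """ Least-squares slope and intercept fitted only on the masked-in samples. """
    t, x, mask = NP.asarray(t, float), NP.asarray(x, float), NP.asarray(mask, bool)
    slope, intercept = NP.polyfit(t[mask], x[mask], 1)
    return slope, intercept

def remove_trend(trend, t, x):
    slope, intercept = trend
    return NP.asarray(x, float) - (slope * NP.asarray(t, float) + intercept)

def add_trend(trend, t, x):
    slope, intercept = trend
    return NP.asarray(x, float) + (slope * NP.asarray(t, float) + intercept)

t = NP.arange(14)
x = 2.0 * t + 5.0 + NP.sin(t)                # linear trend plus a small wiggle
trend = fit_linear_trend(t, x, NP.ones(14, bool))
print(remove_trend(trend, t, x))             # roughly just the wiggle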
Example n. 17
def getGradeCounts():
	""" Read the data file
		This is a set of grades """
	data,header = csv.readCsvRaw2(os.path.join(dir, data_file), True)

	""" Category is in column 1 """
	categories = set([instance[1] for instance in data])

	print 'categories -------------------------------'
	for cat in sorted(categories):
		print cat

	""" Grades end in _g """
	subject_grades_columns = [i for i, h in enumerate(header) if '_g' in h]
	num_subjects = len(subject_grades_columns)

	print 'grades columns names ---------------------'
	for i in range(num_subjects):
		print '%5d, %6d,' % (i, subject_grades_columns[i]), header[subject_grades_columns[i]]

	possible_grades = frozenset(['HD','D','C','P','N'])
	print 'grades -----------------------------------'
	
	for i in range(num_subjects):
		print '%-10s' % header[subject_grades_columns[i]], possible_grades

	""" counts = categories : subjects : grades """
	"""	First create all the counters """
	counts = {} 
	for cat in categories:
		counts[cat] = [{}.fromkeys(possible_grades, 0) for i in range(num_subjects)] 

	""" Count all the category:subject:grade bins """
	for instance in data:
		cat = instance[1]
		cnt = counts[cat]
		for i in range(num_subjects):
			col = subject_grades_columns[i]
			v = instance[col]
			cnt[i][v] = cnt[i][v] + 1

	""" Calculate totals """
	totals = {}

	for cat in categories:
		totals[cat] = {}
		cnt = counts[cat]
		for i in range(num_subjects):
			for k in cnt[i].keys():
				totals[cat][k] = totals[cat].get(k,0) + cnt[i][k]
		print 'totals[%s]' % cat, totals[cat]
		counts[cat] = cnt + [totals[cat]]

	header.append('total')
	subject_grades_columns.append(len(header)-1)
	num_subjects = num_subjects + 1 
	print header

	""" Display the data as a .csv """
	count_header = ['subject']
	for i in range(num_subjects):
		count_header.append(header[subject_grades_columns[i]])
		for j in range(len(possible_grades)):
			count_header.append('')

	count_header2 = ['grade']
	for i in range(num_subjects):
		count_header2.append('')
		for k in possible_grades:
			count_header2.append(k)

	count_data = [count_header, count_header2]
	for cat in sorted(counts.keys()):
		row = ['']
		for i in range(num_subjects):
			row.append('cat_%s' % cat)
			for k in possible_grades:
				row.append(counts[cat][i][k])
		count_data.append(row)

	csv.writeCsv(os.path.join(dir,counts_file), transpose(count_data))	
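The same category : subject : grade tally can be written more compactly with collections.Counter; a small standalone sketch (column positions follow the assumptions made in the function above):
from collections import Counter

def count_grades(data, category_col, grade_cols):
    """ counts[category][subject_index] is a Counter of grades for that subject. """
    counts = {}
    for instance in data:
        cat = instance[category_col]
        per_subject = counts.setdefault(cat, [Counter() for _ in grade_cols])
        for i, col in enumerate(grade_cols):
            per_subject[i][instance[col]] += 1
    return counts

data = [['id1', 'A', 'HD', 'P'], ['id2', 'A', 'D', 'P'], ['id3', 'B', 'HD', 'N']]
print(count_grades(data, category_col=1, grade_cols=[2, 3]))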
Example n. 18
def runWekaOnTimeSeries(time_series_csv, max_lag, fraction_training):
    """ Run Weka training a 2 column time series 
        by converting into a regression with max_lag x and y lags
        per instance 
        fraction_training is the fraction of sample used for training
    """  
    base_name = os.path.splitext(time_series_csv)[0]
    regression_matrix_csv = base_name + '.regression.csv'
    results_filename = base_name + '.results' 
    model_filename = base_name + '.model' 
    predictions_filename =  base_name + '.predict'
    test_filename = base_name + '.test.csv'
    evaluation_filename = base_name + '.evaluation.csv'
    
    time_series_data,_ = csv.readCsvFloat2(time_series_csv, True)
    number_training = (int(float(len(time_series_data))*fraction_training)//7)*7

    print 'number_training', number_training, 'fraction_training', fraction_training,'len(time_series_data)',len(time_series_data)
    assert(number_training > max_lag)
    
    training_time_series = NP.transpose(NP.array(time_series_data[:number_training]))
    print '1: training_time_series.shape', training_time_series.shape
    
    if True:
        days_downloads = getDaysOfWeekToKeep(training_time_series[0,:])
        days_purchases = getDaysOfWeekToKeep(training_time_series[1,:])
        training_time_series = NP.vstack([filterDaysOfWeek(training_time_series[0,:], days_downloads),
         filterDaysOfWeek(training_time_series[1,:], days_purchases)])
        print '2: training_time_series.shape', training_time_series.shape
    
    if True:
    
        timeSeriesToMatrixCsv(regression_matrix_csv, training_time_series, max_lag)
        run_weka.runMLPTrain(regression_matrix_csv, results_filename, model_filename, True)
    
    print 'number_training, training_time_series.shape[1]', number_training, training_time_series.shape[1]
    number_training_x = number_training #- 5
    
    prediction_data = CP.deepcopy(time_series_data)
    prediction_data_downloads = [[row[0],0] for row in prediction_data]

    for i in range(number_training_x, len(prediction_data)):
        if i%7 in days_purchases:
            prediction_array = NP.transpose(NP.array(prediction_data[i-max_lag:i+1]))
            timeSeriesToMatrixCsv(test_filename, prediction_array, max_lag)
            run_weka.runMLPPredict(test_filename, model_filename, predictions_filename)
            prediction_list = run_weka.getPredictionsRegression(predictions_filename)
            print 'predictions', prediction_list
            prediction = prediction_list[0]['predicted']
            if False:
                prediction_array_downloads = NP.transpose(NP.array(prediction_data_downloads[i-max_lag:i+1]))
                timeSeriesToMatrixCsv(test_filename, prediction_array_downloads, max_lag)
                run_weka.runMLPPredict(test_filename, model_filename, predictions_filename)
                prediction_list_downloads = run_weka.getPredictionsRegression(predictions_filename)
                print 'predictions_downloads', prediction_list_downloads
                prediction_downloads = prediction_list_downloads[0]['predicted']
        else:
            prediction = -1
            prediction_downloads = -1
        prediction_data[i][1] = prediction
        #prediction_data[i] = [prediction_data[i][0], prediction, prediction_downloads]
       
          
    evaluation_data = []
    for i in range(len(prediction_data)-number_training_x):
        if i%7 in days_purchases:
            row = [0]*5
            for j in [0,1]:
                row[j] = time_series_data[number_training_x+i][j]
            row[2] = prediction_data[number_training_x+i][1] 
            row[3] = abs(row[2]-row[1])
            row[4] = row[3]/abs(row[2]+row[1]) if abs(row[2]+row[1]) else row[3]
            evaluation_data.append([number_training_x+i]+row)
     
    evaluation_header = ['i', 'x', 'y_actual', 'y_predicted', 'abs_error', 'normalized_error']
    
    csv.writeCsv(evaluation_filename, evaluation_data, evaluation_header)
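The evaluation rows above hold the actual value, the prediction, the absolute error and a symmetric normalized error. A small worked sketch of that metric (a simplified variant of the row layout used above):
def evaluation_row(actual, predicted):
    abs_error = abs(predicted - actual)
    denom = abs(predicted + actual)
    normalized_error = abs_error / denom if denom else abs_error
    return [actual, predicted, abs_error, normalized_error]

print(evaluation_row(10.0, 12.0))   # [10.0, 12.0, 2.0, 0.0909...]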
Example n. 19
                    'RFCD.Percentage.1',
                    'Number.of.Unsuccessful.Grant',
                    'SEO.Percentage.2',
                    'Number.of.Successful.Grant',
                    'Start.date']

    def makeAttrs(data_dict, numeric_keys):
        header = sorted(data_dict.keys(), key = lambda x: ' ' if x == 'Grant.Status' else x )
        attrs = {}
        for k,v in data_dict.items():
            if k in numeric_keys:
                attrs[k] = 'numeric'
            else:
                attrs[k] = sorted(set(x for x in v if x not in NO_VALUES))
        return header, attrs

    header, attrs = makeAttrs(data_dict_many, numeric_keys)
    columns_many = [data_dict_many[k] for k in header]
    data_many = misc.transpose(columns_many)
    data_many.sort(key = lambda x: -getNumElements(x))

    print header

    arff.writeArff2(outname + '.arff', None, 'relation', header, attrs, data_many[:10000])
    csv.writeCsv(outname + '.csv', data_many, header)

    if False:
        name = 'SEO.Code.4'
        keys, histo = getFreqHisto(data_dict[name])
        print name, ['%s:%d' % (k, histo[k]) for k in keys]
Example n. 20
def preprocessSoybeanData():
	""" Pre-process the Soybean data set downloaded from http://archive.ics.uci.edu/ml/machine-learning-databases/soybean/
	"""

	""" Read the data files """
	training_data = csv.readCsvRaw(os.path.join(dir, training_file))
	test_data = csv.readCsvRaw(os.path.join(dir, test_file))
	
	""" Combined data file """
	combined_data = test_data + training_data
	print 'combined data', len(combined_data), len(combined_data[0])

	""" Random data file where the percentage of each class and attribute
		matches the combined data """
	random_data = getRandomData(combined_data)

	""" Find the duplicate instances in each data set
		The number of duplicates in random_data provides an estimate
		of the number of duplicates that would occur in the real
		data sets by pure chance """
	training_duplicates = getDuplicates(training_data)
	print 'training_duplicates =', len(training_duplicates)
	test_duplicates = getDuplicates(test_data)
	print 'test_duplicates =', len(test_duplicates)
	combined_duplicates = getDuplicates(combined_data)
	print 'combined_duplicates =', len(combined_duplicates)
	random_duplicates = getDuplicates(random_data)
	duplicates_warning = '*** Data files should not contain duplicates!' if len(random_duplicates) == 0 else ''
	print 'random_duplicates =', len(random_duplicates), duplicates_warning
	
	""" Remove duplicate instances within each data set 
		We know removing duplicates is valid if len(random_duplicates) is zero """
	filtered_training_data = removeDuplicates(training_data, training_duplicates, False)
	filtered_test_data = removeDuplicates(test_data, test_duplicates, False)
	filtered_combined_data = removeDuplicates(combined_data, combined_duplicates, False)
	filtered_random_data = removeDuplicates(random_data, random_duplicates, False)

	""" Remove the instances in duplicate-free test data that duplicate instances 
		in duplicate-free training data """
	all_duplicates = getDuplicates(filtered_training_data + filtered_test_data)
	filtered_test_data = removeDuplicates(filtered_test_data, all_duplicates, True)

	""" Sanity check """
	assert(len(filtered_test_data) + len(filtered_training_data) + len(combined_duplicates) == len(combined_data))

	""" Write out the intermediate .csv files with duplicates marked for debugging """
	csv.writeCsv(appendDescription(dir, training_file, 'sorted'), markDuplicates(training_data))
	csv.writeCsv(appendDescription(dir, test_file, 'sorted'), markDuplicates(test_data))
	csv.writeCsv(appendDescription(dir, combined_file, 'sorted'), markDuplicates(combined_data))
	csv.writeCsv(appendDescription(dir, random_file, 'sorted'), markDuplicates(random_data))

	""" Read the names of the classes and attributes from downloaded files """
	classes = parseClasses(os.path.join(dir, classes_file))
	attrs = parseAttrs(os.path.join(dir, attrs_file))

	""" Add class and attribute names to original data, for comparison with filter data """
	original_named_training_data = applyAttrs(training_data, attrs)
	original_named_test_data = applyAttrs(test_data, attrs)
	original_named_combined_data = applyAttrs(combined_data, attrs)
	
	""" Add class and attribute names to filtered data """
	named_training_data = applyAttrs(filtered_training_data, attrs)
	named_test_data = applyAttrs(filtered_test_data, attrs)
	named_combined_data = applyAttrs(filtered_combined_data, attrs)
	named_random_data = applyAttrs(filtered_random_data, attrs)
	
	""" Get the class distribution """
	class_distribution_training = getClassDistribution(named_training_data)
	class_distribution_test = getClassDistribution(named_test_data)
	class_distribution_combined = getClassDistribution(named_combined_data)

	named_training_data = removeClassesWithFewerInstances(named_training_data, class_distribution_training,2)

	""" Create a header row for the .csv file """
	header = makeHeaderRow(attrs)

	""" Write out the .csv files """
	
	csv.writeCsv(appendDescription(dir, training_file, 'distribution'), dictToMatrix(class_distribution_training), ['Class', 'Number'])
	
	csv.writeCsv(appendDescription(dir, training_file, 'orig'), named_training_data, header)
	csv.writeCsv(appendDescription(dir, test_file, 'orig'), named_test_data, header)
	csv.writeCsv(appendDescription(dir, combined_file, 'orig'), named_combined_data, header)

	csv.writeCsv(appendDescription(dir, training_file, 'named'), original_named_training_data, header)
	csv.writeCsv(appendDescription(dir, test_file, 'named'), original_named_test_data, header)
	csv.writeCsv(appendDescription(dir, combined_file, 'named'), original_named_combined_data, header)

	""" Write out the .arff files """
	writeArff(buildPath(dir, training_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_training_data)
	writeArff(buildPath(dir, test_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_test_data)
	writeArff(buildPath(dir, combined_file, '.arff', 'orig'), 'soybean', classes, attrs, original_named_combined_data)
	
	writeArff(buildPath(dir, training_file, '.arff'), 'soybean', classes, attrs, named_training_data)
	writeArff(buildPath(dir, test_file, '.arff'), 'soybean', classes, attrs, named_test_data)
	writeArff(buildPath(dir, combined_file, '.arff'), 'soybean', classes, attrs, named_combined_data)
	writeArff(buildPath(dir, random_file, '.arff'), 'soybean', classes, attrs, named_random_data)
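getDuplicates and removeDuplicates are defined elsewhere; a minimal sketch of one way to find exact duplicate instances and drop them while keeping the first occurrence (an illustration, not the module's actual code):
def find_duplicates(data):
    """ Indexes of rows that exactly repeat an earlier row. """
    seen, duplicates = set(), []
    for i, row in enumerate(data):
        key = tuple(row)
        if key in seen:
            duplicates.append(i)
        seen.add(key)
    return duplicates

def drop_rows(data, indexes):
    skip = set(indexes)
    return [row for i, row in enumerate(data) if i not in skip]

data = [[1, 2], [3, 4], [1, 2]]
dups = find_duplicates(data)
print(dups, drop_rows(data, dups))   # [2] [[1, 2], [3, 4]]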
Example n. 21
import csv

CSV_FILE_NAME = 'search.csv'

# start timer and go
timeStart = time.time()
search = SearchDownload()

# download first search result page
html = search.getData()

# count pages of search results
bsSearch = BeautifulSoup(html, "html.parser")
pagesCount = parse_search.pagesCount(bsSearch)

# get items from search results
itemList = parse_search.itemList(bsSearch)
#print(itemList)
csv.writeCsv(itemList, CSV_FILE_NAME)

# download all pages of search results
for page in range(2, pagesCount + 1):
    time.sleep(1)
    html = search.getData(page)
    bsSearch = BeautifulSoup(html, "html.parser")
    itemList = parse_search.itemList(bsSearch)
    csv.writeCsv(itemList, CSV_FILE_NAME, append=True)

timeDuration = -timeStart + time.time()
print('Duration: {0} seconds'.format(int(ceil(timeDuration))))
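Note that csv here is a project-local module (the standard library csv has no writeCsv). A minimal sketch of a writeCsv helper with the (rows, filename, append) signature used in this script, built on the standard library under the alias stdcsv (the real module may differ):
import csv as stdcsv

def writeCsv(rows, filename, append=False):
    """ Write a list of rows to a CSV file, optionally appending to an existing file. """
    mode = 'a' if append else 'w'
    with open(filename, mode, newline='') as f:
        stdcsv.writer(f).writerows(rows)

writeCsv([['name', 'price'], ['GTX 1060', '199']], 'search.csv')
writeCsv([['RTX 3060', '329']], 'search.csv', append=True)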