Example #1
    def calcWaypoints(self):
        ''' Given the waypoints, find their xy-coordinates. '''

        startLoc = dm.haversine(*self.waypoints['initial'])
        endLoc = dm.haversine(*self.waypoints['final'])

        return [endLoc[0] - startLoc[0], endLoc[1] - startLoc[1]]
Example #2
def hyper_parameters_testing():
    data = flr.read_from_file("train_x.txt")
    data = dm.add_bias(data)
    data = dm.convert_sex_to_number(data, 1, 1, 0)
    data = np.array(data)

    lables = flr.read_from_file("train_y.txt")
    lables = np.array(lables)
    lables = dm.convert_to_float(lables)

    # Normalize the data
    min_range = np.ones(data.shape[1])
    min_range = np.multiply(-1, min_range)
    max_range = np.ones(data.shape[1])
    data = dm.min_max_normalization(data, min_range, max_range)

    # Set the properties we want to use
    ignore = np.ones(len(data[0]))

    # PA algorithm
    alg_pa = pa.Pa(data, 0.1, 100, ignore, 3)
    alg_pa.train(data, lables, 0.01, 26, 5)
    alg_pa.print_details()

    eta_list = []
    epocnum_list = []
    accuracy_list = []
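    # Hypothetical continuation of the sweep above (not part of the original
    # example): it assumes Pa.train returns a validation accuracy, which the
    # code above does not show.
    for eta in (0.001, 0.01, 0.1):
        for epochs in (10, 25, 50):
            alg = pa.Pa(data, 0.1, 100, ignore, 3)
            accuracy = alg.train(data, lables, eta, epochs, 5)
            eta_list.append(eta)
            epocnum_list.append(epochs)
            accuracy_list.append(accuracy)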
Example #3
 def bprintPlaylist(self):
     self.listWidget.clear()
     li = DataManipulation.test(
         DataManipulation.truncatePlusArtist(self.so), self.ar,
         DataManipulation.truncatePlusArtist(self.al))
     for i in li:
         self.listWidget.addItem(i)
Example #4
def create_scatter_with_stats():
    """
    Creates a scatter plot which tries to visualize correlation between capture rate, base egg steps, and base total
    of Pokémon. To use 3 variables in a 2-dimensional plot, the colour channel is utilized to signify base total.
    """
    all_data = dm.load_data()
    data = dm.group_data_mean(all_data, "type")
    x_var = "capture_rate"
    y_var = "base_egg_steps"
    colour_by = "base_total"

    fig = go.Figure(
        data=go.Scatter(x=data[x_var].astype(int),
                        y=data[y_var].astype(int),
                        mode='markers',
                        marker=dict(size=16,
                                    color=data[colour_by],
                                    colorscale="Viridis",
                                    colorbar=dict(
                                        title="{}".format(colour_by)),
                                    showscale=True)))

    fig.update_layout(title="Correlation between {}, {}, and {}".format(
        x_var, y_var, colour_by),
                      xaxis_title="{}".format(x_var),
                      yaxis_title="{}".format(y_var))

    fig.update_xaxes(range=(-5, 260))
    fig.update_yaxes(range=(-1000, 35000))

    save_plot(fig, "CorrelationPlot")
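# save_plot is not defined in the example above; this is a minimal, purely
# illustrative stand-in that assumes plotly's HTML export is acceptable.
def save_plot(fig, name):
    # Write the figure to an HTML file named after the plot.
    fig.write_html("{}.html".format(name))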
Example #5
 def __init__(self):
     super(self.__class__, self).__init__()
     self.setupUi(self)
     self.msg = QMessageBox()
     self.btnSo = QtGui.QPushButton('Songs')
     self.btnAr = QtGui.QPushButton('Artists')
     self.btnAl = QtGui.QPushButton('Albums')
     self.msg.setText("Choose Field to get length of")
     self.msg.setWindowTitle("Get Length")
     self.msg.addButton(self.btnAl, QtGui.QMessageBox.YesRole)
     self.msg.addButton(self.btnSo, QtGui.QMessageBox.AcceptRole)
     self.msg.addButton(self.btnAr, QtGui.QMessageBox.NoRole)
     self.al = DataManipulation.getAlbums()
     self.ar = DataManipulation.getArtists()
     self.so = DataManipulation.getSongs()
     self.di = DataManipulation.getSongCountPerArtist(self.ar)
     self.btnPrint.clicked.connect(self.bprintPlaylist)
     self.btnGetSongs.clicked.connect(self.bgetSongs)
     self.btnGetArtists.clicked.connect(self.bgetArtists)
     self.btnGetAlbums.clicked.connect(self.bgetAlbums)
     self.btnFindDups.clicked.connect(self.bfindDups)
     self.btnNumSongs.clicked.connect(self.bnumSongs)
     self.btnTopA.clicked.connect(self.btopArtists)
     self.btnSingles.clicked.connect(self.bsingles)
     self.btnCreateSS.clicked.connect(self.bcreateSS)
     self.btnLen.clicked.connect(self.bgetLength)
Example #6
def main():
    trainData = pd.read_csv('CrimeClassification/Dataset/train-2.csv')
    classesMap = dm.mapClasses(trainData)
    print(trainData.info())
    print(classesMap)
    cleanedTrainData, normalizationValues = dm.cleanTrainData(trainData, classesMap)
    print(cleanedTrainData.info())
    # Cast once, then shuffle in place (shuffling a temporary astype() copy has no effect)
    data = cleanedTrainData.values.astype(np.float64)
    np.random.shuffle(data)
    Ytrain = binarizeLabels(data[0:,0])
    Xtrain = data[0:,1:]
    model = trainModel(Xtrain,Ytrain)
    output = testProbaModel(model,Xtrain)


    testData = pd.read_csv('CrimeClassification/Dataset/test-2.csv')
    cleanedTestData = dm.cleanTestData(testData,normalizationValues)
    print(cleanedTestData.info())
    output = testProbaModel(model,cleanedTestData.values[:,1:])

    result = np.c_[cleanedTestData.values[:,0].astype(int),output]
    outputVec = sorted(classesMap, key=classesMap.__getitem__)
    outputVec.insert(0,'Id')
    dataFrameResults = pd.DataFrame(result,columns=outputVec)
    dataFrameResults['Id']=dataFrameResults['Id'].astype(int)
    dm.saveResults(dataFrameResults)
Example #7
 def bcreateSS(self):
     self.listWidget.clear()
     li = DataManipulation.prettyPrint(
         DataManipulation.truncatePlusArtist(self.so),
         DataManipulation.truncateArtists(self.ar, sortedSet=False),
         DataManipulation.truncatePlusArtist(self.al))
     for i in li:
         self.listWidget.addItem(i)
Example #8
def load_specific_kinase(kinase, start_center):
    fin_extract = read_kinase('Kinase_Substrate_Dataset')

    # Select the specified kinase and organism
    fg_read = [x[2] for x in fin_extract if x[0] == kinase and x[1] == 'human']
    data_A = DataManipulation.list_to_formA(fg_read)
    data_A = DataManipulation.subset_formA(data_A, 0, start_center)

    return dict([('seq', fg_read),
                 ('formA', data_A)])
Example #9
def main():
    trainData = pd.read_csv("CrimeClassification/Dataset/train-2.csv")
    trainData.info()
    classesMap = dm.mapClasses(trainData)
    cleanedTrainData = dm.cleanData(trainData, classesMap)
    cleanedTrainData.describe()
    heatMapXY(cleanedTrainData, "Global heatMap of crimes")
    heatMapPerCategory(cleanedTrainData, classesMap)
    histogramOfCategories(cleanedTrainData, classesMap)
    textHistogram(cleanedTrainData, classesMap)
Example #10
def main():

    Data = dm.extractData()
    Vectors = dm.dataToVectors(Data[:2000])  # Read First 2000 lines

    TrainingSet = Vectors[:1500]  # First 1500 Training Data
    TestingSet = Vectors[1500:]  # Last 500 Testing Data

    #testRBF(TrainingSet, TestingSet) # Test RBF Network
    kNearestNeighbors(TrainingSet, TestingSet, 50)  # Test kNearestNeighbors
Example #11
def exportProjections(newFile, projectionsFile='projections.csv', model_dir='edited'):
    projections = pd.read_csv(projectionsFile)
    predictions = runBatchPredict(projectionsFile, model_dir)
    newDF = pd.DataFrame()
    
    for i in predictions:
        projections['Sal'] = i[1]
        newRow = projections.loc[projections['Player'] == i[0]]
        newDF = newDF.append(newRow)
        
    DM.export(newDF, newFile)
    print('Exported!')
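# Hedged alternative to the accumulation loop in exportProjections above:
# DataFrame.append was removed in pandas 2.0, so pd.concat does the same job.
# Assumes each prediction is a (player, salary) pair, as the indexing suggests.
rows = []
for player, salary in predictions:
    projections['Sal'] = salary
    rows.append(projections.loc[projections['Player'] == player])
newDF = pd.concat(rows, ignore_index=True)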
Example #12
def main():
    trainData = pd.read_csv('CrimeClassification/Dataset/train-2.csv')
    classesMap = dm.mapClasses(trainData)
    print(trainData.info())
    print(classesMap)
    cleanedTrainData, normalizationValues = dm.cleanTrainData(trainData, classesMap)
    print(cleanedTrainData.info())
    [Xtrain, Ytrain, Xtest,Ytest]=splitData(cleanedTrainData.values)
    model = trainModel(Xtrain,Ytrain)
    Ypred = testModel(model,Xtest)
    confMatrix = da.confusionMatrix(Ypred,Ytest)
    titleCM = da.orderClassesMapKeys(classesMap)
    da.plotConfusionMatrix(confMatrix,titleCM)
    print (da.f1Score(Ypred,Ytest))
Example #13
def main():
    trainData = pd.read_csv("CrimeClassification/Dataset/train-2.csv")
    classesMap = dm.mapClasses(trainData)
    print(trainData.info())
    print(classesMap)
    cleanedTrainData = dm.cleanData(trainData, classesMap)
    testData = pd.read_csv("CrimeClassification/Dataset/test-2.csv")
    cleanedTestData = dm.cleanTestData(testData)
    print(cleanedTrainData.info())
    model = trainModel(cleanedTrainData.values)
    # Predict class probabilities for the test set
    output = testProbaModel(model, cleanedTestData.values[:, 1:])
    result = np.c_[cleanedTestData.values[:, 0].astype(int), output]
    outputVec = sorted(classesMap, key=classesMap.__getitem__)
    outputVec.insert(0, "Id")
    dataFrameResults = pd.DataFrame(result, columns=outputVec)
    dataFrameResults["Id"] = dataFrameResults["Id"].astype(int)
    dm.saveResults(dataFrameResults)
Example #14
    def inputGPS(self):
        ''' End user must manually specify starting and stopping GPS. '''

        #        textInputApp = tk.Tk()
        #        textInput = tk.simpledialog.askfloat('Input GPS', \
        #            'Enter Lat in Deg: ', parent = textInputApp, minvalue = -180, \
        #            maxvalue = 180)

        #        self.halt(textInputApp)

        self.waypoints = {}

        #        startLat = float(input('Enter Starting Latitude in Deg> '))
        #        startLon = float(input('Enter Starting Longitude in Deg> '))
        startLat = 0
        startLon = 0
        self.waypoints['initial'] = [startLat, startLon]

        #        endLat = float(input('Enter Ending Latitude in Deg> '))
        #        endLon = float(input('Enter Ending Longitude in Deg> '))
        endLat = 0.0005
        endLon = 0.0005
        self.waypoints['final'] = [endLat, endLon]

        # Before we do anything, convert to radians
        for key in self.waypoints.keys():
            #map(dm.degToRad, self.waypoints[key])
            for i in range(len(self.waypoints[key])):
                self.waypoints[key][i] = dm.degToRad(self.waypoints[key][i])

        return None
Example #15
    def updateCompass(self):
        ''' Get new bearing data and update plot. '''

        # Clear the old compass point
        self.pole.cla()

        radii = [0, 1]
        # After many days of debugging, it was found that pyplot expects
        # theta coordinates in radians even though the default theta
        # coordinate axes display degrees. Good to know!
        thisAngle = dm.degToRad(self.bearing)
        angles = [thisAngle for i in range(2)]

        # Plot data
        self.pole.plot(angles, radii, color='red')

        # Formatting
        #self.pole.set_title('Bearing')
        self.pole.set_yticklabels([])
        self.pole.set_xticklabels(['E', 'NE', 'N', 'NW', 'W', 'SW', 'S', 'SE'])

        # Add data and flush
        self.polar.canvas.draw()
        self.polar.canvas.flush_events()

        return None
Example #16
    def updateTextBoxes(self):
        ''' Update any text information in the gui. '''

        # Create text box for bearing and xy-coordinate
        msg = ''
        thisPos = dm.getLatestPositionData(self.updateCount)

        msg += 'X-pos: ' + str(round(thisPos[0], self.figs)) + '\n'
        msg += 'Y-pos: ' + str(round(thisPos[1], self.figs)) + '\n'
        msg += 'Bearing: ' + str(self.bearing) + '\n'

        # And embed this text box in the application
        self.bearingText = tk.Label(self.master, text=msg)
        self.bearingText.config(width=20)
        self.bearingText.config(font=('Consolas', 12))

        textColumn = 2
        textRow = 0
        self.bearingText.grid(column=textColumn, row=textRow, sticky='NW')

        #        gps = ''
        #        gps += 'Initial' + '\n'
        #        gps += 'Lat: ' + str(round(self.waypoints['start'][0], self.figs))
        #        gps += 'Lon: ' + str(round(self.waypoints['start'][1], self.figs))
        #
        #        self.gpsText = tk.Label(self.master, text = gps)

        return None
Example #17
def loadTestingData(date):
    """
    Loads the testing data from disk. Note, because of the potential
    for very large amounts of testing data, the testing data is returned
    as a numpy memmap.

    :param date: (string) Date in which the data was collected (YYYY_MMDD)

    :return: tuple containing the testing data (row ordered feature vectors),
             as well as the target labels (X, y). X has type np.memmap, and 
             y has type np.array
    """

    dataDirectory = DATA_DIRECTORIES[date + "_ML"]
    testingDataPath = os.path.join(dataDirectory, TESTING_DATA_PATH)

    sampleCounts = DataManipulation.loadSampleCounts(date)
    testingSamples = sampleCounts["testing"]

    # Open the file as a read-only memmap
    testingData = np.memmap(testingDataPath,
                            mode='r',
                            dtype=np.float32,
                            shape=(testingSamples, NUM_WAVELENGTHS + 1))

    X = testingData[:, 1:]
    y = testingData[:, 0]

    return X, y
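# Hypothetical usage of loadTestingData: the memmap-backed X behaves like a
# read-only ndarray, so it can be fed to an estimator without loading the whole
# file into memory. The date string and the classifier clf are illustrative.
X_test, y_test = loadTestingData("2020_0101")
print("Test accuracy:", (clf.predict(X_test) == y_test).mean())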
Example #18
def regression_linear_regression(data_array, cont_dis, cls_rmv, sig, y_col,
                                 split_array):
    print('-----------------------------------')
    print('Using Linear Regression Imputation')
    print('-----------------------------------')

    imp = 'Linear Regression Imputation:'

    # use linear regression for imputation
    d_a, stat_a, x, y, x_n, y_n, xr, rt, y_ar = DataManipulation.linear_regression_imputation(
        list(data_array), cont_dis, cls_rmv, sig, y_col)

    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(x), list(y), list(x_n), list(y_n), split_array)

    print('Unnormalized Data:')
    show_results(imp, cod_r, lse_r, mse_r)
    print('Normalized Data:')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)

    figure(1)
    title('Weight vs. Horse Power')
    plot(xr, rt, 'r--', label='weight vs regression')
    plot(xr, y_ar, 'o', label='c label')
    xlabel('Car Weight')
    ylabel('Horse Power')
    legend(['regression data', 'raw data'])
    show()

    return
Example #19
def loadTestingData(date):
    """
    Loads the testing data from disk. Note, because of the potential
    for very large amounts of testing data, the testing data is returned
    as a numpy memmap.

    :param date: (string) Date in which the data was collected (YYYY_MMDD)

    :return: tuple containing the testing data (row ordered feature vectors),
             as well as the target labels (X, y). X has type np.memmap, and 
             y has type np.array
    """

    dataDirectory = DATA_DIRECTORIES[date+"_ML"]
    testingDataPath = os.path.join(dataDirectory, TESTING_DATA_PATH)

    sampleCounts = DataManipulation.loadSampleCounts(date)
    testingSamples = sampleCounts["testing"]

    # Open the file as a read-only memmap
    testingData = np.memmap(testingDataPath, 
                            mode='r',
                            dtype=np.float32,
                            shape=(testingSamples, NUM_WAVELENGTHS+1))
    
    X = testingData[:, 1:]
    y = testingData[:, 0]

    return X, y
Example #20
 def getLength(self, str):
     self.listWidget.clear()
     if str == "s": self.listWidget.addItem("%s" % len(self.so))
     elif str == "ar":
         self.listWidget.addItem(
             "%s" % len(DataManipulation.truncateArtists(self.ar)))
     elif str == "al":
         self.listWidget.addItem("%s" % len(sortedset(self.al)))
Example #21
def main(argv):
    (train_x, train_y), (test_x, test_y) = DM.offLoad()
    train(train_x, train_y, test_x, test_y)
    #losses = kfoldCrossValidate(8)
    print("-" * 30)
    #print("Average loss for kfold validation: ", sum(losses)/len(losses))
    #Average loss for kfold validation:  3.50362616777  -- with 8 folds
    print("-" * 30)
Example #22
 def btopArtists(self):
     maxsize = 50
     numS = QtGui.QInputDialog.getInt(self, "Choose Number of Artists",
                                      "Display this number of Artists:",
                                      maxsize)
     self.listWidget.clear()
     li = DataManipulation.topArtists(numS[0], self.di)
     for i in li:
         self.listWidget.addItem(i)
Example #23
def main():
    pd.set_option("display.precision", 3)
    trainData = pd.read_csv('CrimeClassification/Dataset/train01-tsc.csv')
    classesMap = dm.mapClasses(trainData)
    print(classesMap)
    cleanedTrainData = dm.cleanData(trainData,classesMap)
    testData = pd.read_csv('CrimeClassification/Dataset/test-tsc.csv')
    cleanedTestData = dm.cleanTestData(testData)
    print(cleanedTrainData.info())
    model = trainModel(cleanedTrainData.values)
    print(cleanedTestData.info())
    output = testProbaModel(model,cleanedTestData.values)
    result = np.c_[cleanedTestData.values[:,0].astype(int),output]
    outputVec = sorted(classesMap, key=classesMap.__getitem__)
    outputVec.insert(0,'Id')
    
    dataFrameResults = pd.DataFrame(result,columns=outputVec)
    dataFrameResults['Id']=dataFrameResults['Id'].astype(int)
Example #24
 def bnumSongs(self):
     maxsize = 20
     numS = QtGui.QInputDialog.getInt(
         self, "Choose Song Limit",
         "Artists with more than this number of songs will be displayed:",
         maxsize)
     li = DataManipulation.numSongs(numS[0], self.di)
     self.listWidget.clear()
     for i in li:
         self.listWidget.addItem(i)
Example #25
def saveTrainingData(date, X, y):
    """
    Saves a given matrix of training data as a np.memmap
    in the proper location for later use in training machine
    learning models.

    :param date: (string) Date in which the data was collected (YYYY_MMDD)
    :param X: (np.array) Array of training features (n_samples x n_features)
    :param y: (np.array) Array of labels for the training data (n_samples)

    :return: (None)
    """

    sampleCounts = DataManipulation.loadSampleCounts(date)
    dataDirectory = DATA_DIRECTORIES[date + "_ML"]
    trainingDataPath = os.path.join(dataDirectory, TRAINING_DATA_PATH)

    if not os.path.exists(trainingDataPath):
        # Open the file for the first time to write
        samples, features = X.shape
        trainingData = np.memmap(trainingDataPath,
                                 mode='w+',
                                 dtype=np.float32,
                                 shape=(samples, features + 1))

        trainingData[:, 0] = y
        trainingData[:, 1:] = X

        # Flush the data to disk and close the memmap
        del trainingData

    else:
        DataManipulation.updateTrainingData(date, X, y)

    # Update the sample counts file
    for index in y:
        labelString = INDEX_TO_LABEL[index]
        sampleCounts[labelString + "_training"] += 1

    sampleCounts["training"] += len(y)
    DataManipulation.updateSampleCounts(date, sampleCounts)
Example #26
def saveTrainingData(date, X, y):
    """
    Saves a given matrix of training data as a np.memmap
    in the proper location for later use in training machine
    learning models.

    :param date: (string) Date in which the data was collected (YYYY_MMDD)
    :param X: (np.array) Array of training features (n_samples x n_features)
    :param y: (np.array) Array of labels for the training data (n_samples)

    :return: (None)
    """

    sampleCounts = DataManipulation.loadSampleCounts(date)
    dataDirectory = DATA_DIRECTORIES[date+"_ML"]
    trainingDataPath = os.path.join(dataDirectory, TRAINING_DATA_PATH)

    if not os.path.exists(trainingDataPath):
        # Open the file for the first time to write
        samples, features = X.shape
        trainingData = np.memmap(trainingDataPath, 
                                 mode='w+',
                                 dtype=np.float32, 
                                 shape=(samples, features+1))

        trainingData[:, 0] = y
        trainingData[:, 1:] = X

        # Flush the data to disk and close the memmap
        del trainingData

    else:
        DataManipulation.updateTrainingData(date, X, y)
    
    # Update the sample counts file
    for index in y:
        labelString = INDEX_TO_LABEL[index]
        sampleCounts[labelString+"_training"] += 1
    
    sampleCounts["training"] += len(y)
    DataManipulation.updateSampleCounts(date, sampleCounts)
Example #27
def load_fasta_background(filename, center='.'):
    fasta_list = FileIO.read_fasta(filename)
    result = fasta_to_chunks(fasta_list)

    if (center != '.'):
        result = filter_chunks(result, center)

    result = result[0:300000]
    # Truncate because I don't have enough RAM
    formA = DataManipulation.list_to_formA(result)

    return dict([('seq', result), ('formA', formA)])
Example #28
def regression_linear_regression_fs(data_array, cont_dis, cls_rmv, sig, y_col,
                                    split_array):
    print('-----------------------------------')
    print(
        'Using Linear Regression Imputation with Forward Selection Dimension Reduction'
    )
    print('-----------------------------------')

    attribute_labels = [
        'mpg',  # 0
        'Cylinders',  # 1
        'Displacement',  # 2
        'Horse Power',  # 3
        'Weight',  # 4
        'Acceleration',  # 5
        'Model Year',  # 6
        'Origin',  # 7
        'Car Type'
    ]  # 8

    imp = 'Linear Regression Imputation with Forward Selection:'

    d_a, stat_a, x, y, x_n, y_n, xr, rt, y_ar = DataManipulation.linear_regression_imputation(
        list(data_array), cont_dis, cls_rmv, sig, y_col)

    F, min_mse, cols_f = forward_selector_test(list(x), list(y),
                                               split_array[0])
    f_n, min_mse_n, cols_f_n = forward_selector_test(list(x_n), list(y_n),
                                                     split_array[0])

    print('F is using ' + str(len(F[0]) - 1) + ' attributes')
    for i in range(len(cols_f)):
        print(attribute_labels[cols_f[i]])

    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(F), list(y), list(f_n), list(y_n), split_array)

    print('Unnormalized Data:')
    show_results(imp, cod_r, lse_r, mse_r)
    print('')
    print('Normalized Data:')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)

    figure(1)
    title('Weight vs. Horse Power')
    plot(xr, rt, 'r--', label='weight vs regression')
    plot(xr, y_ar, 'o', label='c label')
    xlabel('Car Weight')
    ylabel('Horse Power')
    legend(['regression data', 'raw data'])
    show()

    return
Example #29
def load_fasta_background(filename, center='.'):
    fasta_list = FileIO.read_fasta(filename)
    result = fasta_to_chunks(fasta_list)

    if (center != '.'):
        result = filter_chunks(result, center)

    result = result[0:300000]  # Truncate because I don't have enough RAM
    formA = DataManipulation.list_to_formA(result)

    return dict([('seq', result),
                 ('formA', formA)])
Example #30
def final_format(data_file, lables_file, test_file):
    # Get the data and convert it into a form usable by the algorithms
    data = flr.read_from_file(data_file)
    data = dm.add_bias(data)
    data = dm.convert_sex_to_number(data, 1, 1, 0)
    data = np.array(data)

    # Normalize the data
    min_range = np.ones(data.shape[1])
    min_range = np.multiply(-1, min_range)
    max_range = np.ones(data.shape[1])
    data = dm.min_max_normalization(data, min_range, max_range)

    # Get the labels and convert them into a form usable by the algorithms
    lables = flr.read_from_file(lables_file)
    lables = np.array(lables)
    lables = dm.convert_to_float(lables)

    # Get the test data and convert it into a form usable by the algorithms
    test = flr.read_from_file(test_file)
    test = dm.add_bias(test)
    test = dm.convert_sex_to_number(test, 1, 1, 0)
    test = np.array(test)

    # Normalize the test data
    min_range = np.ones(data.shape[1])
    min_range = np.multiply(-1, min_range)
    max_range = np.ones(data.shape[1])
    test = dm.min_max_normalization(test, min_range, max_range)

    # Set the properties we want the algorithms to use
    ignore = np.ones(len(data[0]))

    # Perceptron algorithm
    alg_peceptron = prtn.Perceptron(data, 0.1, 100, ignore, 3)
    alg_peceptron.train(data, lables, 0.01, 20)

    # SVM algorithm
    alg_svm = svm.Svm(data, 0.01, 100, ignore, 0.001, 3)
    alg_svm.lamda = 0.1
    alg_svm.train(data, lables, 0.1, 20)

    # PA algorithm
    alg_pa = pa.Pa(data, 0.1, 100, ignore, 3)
    alg_pa.train(data, lables, 0.01, 25)

    # Compare the algorithms on the test data
    for test_data in test:
        line_to_print = "perceptron: " + str(
            alg_peceptron.predict(test_data)) + ", "
        line_to_print += "svm: " + str(alg_svm.predict(test_data)) + ", "
        line_to_print += "pa: " + str(alg_pa.predict(test_data))
        print(line_to_print)
Example #31
def find_first(x_data, y_data, split):
    col_size = len(x_data[0])
    min_mse = [10000]
    min_col = [10000]
    best_col = [0]
    for col in range(1, col_size):
        x_column = list(DataManipulation.column_getter(x_data, col))

        m, b, x, y, yg, mse = reg_lin_regression_msr(x_column, y_data, split)

        if mse < min_mse[0]:
            min_col[0] = col
            min_mse[0] = mse
            best_col[0] = list(x_column)

    return list(min_col), list(min_mse), list(best_col)
Example #32
def regression_discard(data_array, cont_dis, cols_rmv, sig, y_col,
                       split_array):
    print('-----------------------------------')
    print('Using Discard Imputation')
    print('-----------------------------------')
    imp = 'Discard Imputation:'
    d_array, stat_a, x, y, x_n, y_n = DataManipulation.discard_imputation(
        list(data_array), cont_dis, cols_rmv, sig, y_col)

    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(x), list(y), list(x_n), list(y_n), split_array)

    print('Unnormalized Result: ')
    show_results(imp, cod_r, lse_r, mse_r)
    print('Normalized Result: ')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)
    return
Example #33
 def timeOut(self):
     passive = PassiveAccelerometer.arduino(self.xAvg, self.yAvg, self.zAvg)
     while (self.sameCount < 50):
         time.sleep(0.080)
         vals = passive.readValues()

         self.data.append(vals)

         try:
             valAvg = ((vals[0] + vals[1] + vals[2]) / 3)
         except Exception:
             # Bad read: take another sample and skip this iteration
             passive.readValues()
             continue

         if (valAvg <= (abs(self.valPrev + 300)) or valAvg >= (abs(self.valPrev - 300))):
             self.sameCount += 1
         self.valPrev = valAvg

     manip = DataManipulation.simpleFunctions(self.data, self.weight, self.workout)
Example #34
def regression_average_fs(data_array, cont_dis, cols_rmv, sig, y_col,
                          split_array):
    print('-----------------------------------')
    print(
        'Using Average Imputation with Forward Selection Dimension Reduction')
    print('-----------------------------------')

    attribute_labels = [
        'mpg',  # 0
        'Cylinders',  # 1
        'Displacement',  # 2
        'Horse Power',  # 3
        'Weight',  # 4
        'Acceleration',  # 5
        'Model Year',  # 6
        'Origin',  # 7
        'Car Type'
    ]  # 8

    imp = 'Average Imputation:'

    d_array, stat_a, x, y, x_n, y_n = DataManipulation.average_imputation(
        list(data_array), cont_dis, cols_rmv, sig, y_col)

    F, min_mse, cols_f = forward_selector_test(list(x), list(y),
                                               split_array[0])
    f_n, min_mse_n, cols_f_n = forward_selector_test(list(x_n), list(y_n),
                                                     split_array[0])

    print('F is using ' + str(len(F[0]) - 1) + ' attributes')
    for i in range(len(cols_f)):
        print(attribute_labels[cols_f[i]])

    cod_r, n_cod_r, lse_r, n_lse_r, mse_r, n_mse_r = er_t(
        list(F), list(y), list(f_n), list(y_n), split_array)

    # show_results(imp, error, cod_result, lse_result, mse_result)
    print('Unnormalized Data:')
    show_results(imp, cod_r, lse_r, mse_r)
    print('')
    print('Normalized Data:')
    show_results(imp, n_cod_r, n_lse_r, n_mse_r)

    return
Example #35
def kfoldCrossValidate(k):
    '''
    cross validate the model
    '''
    df = shuffle(DM.clean(DM.offData))
    loss = []
    rows = len(df.index)
    kf = KFold(rows, n_folds=k)
    for train_index, test_index in kf:
        train_set = df.iloc[train_index]
        test_set = df.iloc[test_index]
        train_x = train_set.drop(['R$'], axis=1)
        train_y = train_set['R$']
        test_x = test_set.drop(['R$'], axis=1)
        test_y = test_set['R$']
        l = train(train_x, train_y, test_x, test_y)
        loss.append(l)

    return loss
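# Sketch of the same split with the current scikit-learn API (>= 0.18), where
# KFold moved to sklearn.model_selection; df, k and the fold body are assumed
# to be the same as in kfoldCrossValidate above.
from sklearn.model_selection import KFold

kf = KFold(n_splits=k)
for train_index, test_index in kf.split(df):
    train_set = df.iloc[train_index]
    test_set = df.iloc[test_index]
    ...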
Example #36
def collect_parameters2(x_d, y_d, split_a):
    w_list = list()
    tr_l = list()
    y_tr_l = list()
    val_l = list()
    y_val_l = list()

    for x in range(len(split_a)):
        tr, val, y_tr, y_val, rand = DataManipulation.dos_data_splitter(
            x_d, y_d, split_a[x])

        # get w from training data
        w_list.append(multi_linear_regressor(tr, y_tr))
        tr_l.append(tr)
        y_tr_l.append(y_tr)
        val_l.append(val)
        y_val_l.append(y_val)

    return w_list, tr_l, y_tr_l, val_l, y_val_l
Example #37
def createQueriesDictionary(Data):

    InvertedIndex, Queries = dm.readInvertedIndex(), {}

    N = 537933  # Total number of queries
    
    for data in Data:
        
        for i in range(1, 3):
            qid, query = data[i], [data[i+2]]
            if qid not in Queries:
                Queries.update({qid:query})
        
    for qid in Queries:

        Words = TextBlob(Queries[qid][0]).lower().words  # Lower-cased list of the query's terms
        
        Hashes, Weights = [], []
        
        try:
            maxf = max(Words.count(w) for w in Words) # Max Frequency of a term in the query
        except:
            continue # Corrupted Data
            
        for w in Words:
            
            Hashes.append(hashFunction(w, 64))
            
            f, n = Words.count(w)/maxf, len(InvertedIndex[w]) # f(t), n(t)
            
            idf = math.log(N/n)/math.log(N) # IDF(t)
            
            Weights.append(f*idf)
        
        
        queryHash = HashQuery(Hashes, Weights)
        Queries[qid].append(queryHash)
        
    
    with open('Queries.txt', 'wb') as file:
        pickle.dump(Queries, file)
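# hashFunction and HashQuery are external helpers that the example above does
# not show. A purely illustrative sketch, assuming a 64-bit term hash and a
# SimHash-style weighted combination:
import hashlib

def hashFunction(term, bits=64):
    # Derive a fixed-width integer hash from the term (illustrative only).
    digest = hashlib.md5(term.encode("utf-8")).hexdigest()
    return int(digest, 16) & ((1 << bits) - 1)

def HashQuery(hashes, weights, bits=64):
    # Combine weighted term hashes SimHash-style into a single fingerprint.
    totals = [0.0] * bits
    for h, w in zip(hashes, weights):
        for i in range(bits):
            totals[i] += w if (h >> i) & 1 else -w
    return sum(1 << i for i in range(bits) if totals[i] > 0)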
Example #38
    def updatePlot(self):
        '''
        Get the new position data from file and add that point to the
        scatter plot.
        '''

        # Fetch and plot data
        newPos = dm.getLatestPositionData(self.updateCount)
        self.xData.append(newPos[0])
        self.yData.append(newPos[1])
        self.axes.plot(self.xData, self.yData, color='black')

        # Formatting
        self.axes.set_title('X and Y Position Relative to Starting Point')
        self.axes.set_xlabel('meters')
        self.axes.set_ylabel('meters')
        self.axes.legend(loc='lower right')

        # Add data to plot and flush
        self.fig.canvas.draw()
        self.fig.canvas.flush_events()

        return None
Example #39
def modelPredict(predict_x, path='saved', expected=[0]):
    '''This function rebuilds a NN from a directory where it was saved in a training job.
    It then runs a prediction job based on the given inputs.'''
    
    #build the feature columns
    ageCol = tf.feature_column.numeric_column(key='Age')
    atbatCol = tf.feature_column.numeric_column(key='AB')
    hitCol = tf.feature_column.numeric_column(key='H')
    runCol = tf.feature_column.numeric_column(key='R')
    rbiCol = tf.feature_column.numeric_column(key='RBI')
    hrCol = tf.feature_column.numeric_column(key='HR')
    sbCol = tf.feature_column.numeric_column(key='SB')
    
    #define the feature columns in a list
    feature_columns = [
       #ageCol,
       atbatCol, 
       hitCol,
       runCol,
       rbiCol,
       hrCol,
       sbCol,      
       tf.feature_column.indicator_column(tf.feature_column.crossed_column(['H', 'AB'], hash_bucket_size=int(1e4))),
      # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['HR', 'RBI', 'R'], hash_bucket_size=int(1e4))),
      # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['H', 'AB', 'SB'], hash_bucket_size=int(1e4))),                                                                    
      # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['H', 'AB', 'HR', 'RBI', 'R', 'SB'], hash_bucket_size=int(1e4))),
      # tf.feature_column.indicator_column(tf.feature_column.crossed_column(['H', 'AB', 'HR', 'RBI', 'R'], hash_bucket_size=int(1e4))),   
    ]
    
    #configure checkpoints:
    my_checkpointing_config = tf.estimator.RunConfig(
                        save_checkpoints_secs = 20,  # Save checkpoints every 20 secs.
                        keep_checkpoint_max = 10,       # Retain the 10 most recent checkpoints.
    )
   
    # Build the Estimator.
    #model = tf.estimator.LinearRegressor(
    model = tf.estimator.DNNRegressor(
                        hidden_units=[31, 22, 15, 12],
                        feature_columns=feature_columns,
                        config=my_checkpointing_config,
                        model_dir=path
    )
    

    predictions = model.predict(
          input_fn=lambda:DM.eval_input_fn(predict_x,
                                                  labels=None,
                                                  batch_size=100))

    template = ('\nPrediction is "{}" , expected "{}"')
    ret = []
    
    
    
    for pred_dict,expec in zip(predictions, expected):
        #ret.append(pred_dict["predictions"])
        
        print(template.format(pred_dict["predictions"][0], expec))
        ret.append(pred_dict["predictions"][0])

    
    return ret
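# Hypothetical call to modelPredict: the feature values, expected salary and
# saved-model path are illustrative; the keys mirror the feature columns above.
predict_x = {'AB': [550], 'H': [160], 'R': [85], 'RBI': [70], 'HR': [25], 'SB': [10]}
predicted = modelPredict(predict_x, path='saved', expected=[4.2])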
Example #40
import os
import time
import pdb

import numpy as np

#################################
##### DataManipulation.py
#################################
import DataManipulation
from importlib import reload  # Python 3: reload lives in importlib
reload(DataManipulation)

checksum = []

function = 'remove_x'
argument = np.array([[0., 1.], [1., 3.], [2., 1.]])
ideal_output = np.array([1., 3., 1.])
real_output = DataManipulation.remove_x(argument)
a = (ideal_output == real_output).all()
checksum.append(a)

print(function, a)

function = 'add_x'
argument = np.array([1., 3., 1.])
ideal_output = np.array([[0, 1.], [1, 3.], [2, 1.]])
real_output = DataManipulation.add_x(argument)
a = (ideal_output == real_output).all()
checksum.append(a)

print(function, a)

function = 'normalize'