コード例 #1
0
def localWords(feed1, feed0, numTest=20):
    """Train a naive Bayes classifier on two feeds and report its hold-out error.

    The 'summary' text of the first ``minLen`` entries of each feed is parsed
    with ``textParse``; ``feed1`` documents are labelled 1 and ``feed0``
    documents 0.  The words returned by ``calcMostFreq`` are removed from the
    vocabulary, ``numTest`` randomly chosen documents are held out, the
    classifier is trained on the rest and the error rate on the held-out
    documents is printed.

    Args:
        feed1: mapping with an 'entries' sequence whose items have a
            'summary' key (class 1).
        feed0: same structure as ``feed1`` (class 0).
        numTest: number of documents to hold out for testing (default 20,
            the value that used to be hard-coded).

    Returns:
        Tuple ``(vocabList, p0V, p1V)``: the pruned vocabulary and the two
        vectors returned by ``NaiveBayesianModel.trainNB0``.

    Raises:
        ValueError: if holding out ``numTest`` documents would leave no test
            or no training document.
    """
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    numDocs = 2 * minLen
    # Fail fast with a clear message instead of an IndexError half-way through
    # the sampling loop (or an empty training matrix later on).
    if not 0 < numTest < numDocs:
        raise ValueError(
            'cannot hold out %d of %d documents: need at least one test and '
            'one training document' % (numTest, numDocs))

    docList = []
    classList = []
    fullText = []
    for i in range(minLen):
        # Documents are interleaved: class 1 first, then class 0.
        for feed, label in ((feed1, 1), (feed0, 0)):
            wordList = textParse(feed['entries'][i]['summary'])
            docList.append(wordList)
            fullText.extend(wordList)
            classList.append(label)

    vocabList = PrepareData.createVocabList(docList)  # create vocabulary
    # Drop the most frequent words from the vocabulary.
    for pairW in calcMostFreq(vocabList, fullText):
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])

    # list() so that items can be removed on Python 3 as well.
    trainingSet = list(range(numDocs))
    testSet = []
    for _ in range(numTest):
        # Clamp: uniform() may return its upper bound because of rounding.
        randIndex = min(int(random.uniform(0, len(trainingSet))),
                        len(trainingSet) - 1)
        testSet.append(trainingSet.pop(randIndex))

    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:  # train the classifier (get probs) trainNB0
        trainMat.append(
            PrepareData.bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = NaiveBayesianModel.trainNB0(array(trainMat),
                                                  array(trainClasses))

    errorCount = 0
    for docIndex in testSet:  # classify the held-out items
        wordVector = PrepareData.bagOfWords2VecMN(vocabList, docList[docIndex])
        predicted = NaiveBayesianModel.classifyNB(array(wordVector),
                                                  p0V, p1V, pSpam)
        if predicted != classList[docIndex]:
            errorCount += 1
    # Single-argument print() gives the same output on Python 2 and 3.
    print('the error rate is:  %s' % (float(errorCount) / len(testSet)))
    return vocabList, p0V, p1V
コード例 #2
0
def main():
    """Run 5-fold cross-validation of the NC model and save its predictions.

    N-gram sizes are read from the command line (default ``[1]``).  For every
    fold a fresh ``NC`` model is fitted on the other folds and evaluated on
    the training and test split; each result is printed and logged to
    'train_acc.txt' / 'test_acc.txt' together with the fold index, followed
    by the average over all folds.  The class predictions of the last fold's
    model on ``x_validate`` are written to 'ytest.txt'.

    Relies on the module-level names ``x_one_hot``, ``y`` and ``text_length``.
    """
    if len(sys.argv) > 1:
        n_gram_list = [int(arg) for arg in sys.argv[1:]]
    else:
        n_gram_list = [1]

    x = PrepareData.feat_extraction(n_gram_list, x_one_hot)
    # NOTE(review): the validation features are built from the same
    # x_one_hot as the training features - confirm this is intended.
    x_validate = PrepareData.feat_extraction(n_gram_list, x_one_hot)
    n_feat = x.shape[1]
    raw_data_size = (n_feat, text_length, 1)

    n_classes = y.shape[1]
    k = 5
    k_fold_sequence = data_set_k_fold_separation(x.shape[0], k)

    train_results = []
    test_results = []
    y_validate_k = None
    # Separate log files for the two splits, closed even if a fold fails.
    with open('train_acc.txt', 'w') as output_train, \
            open('test_acc.txt', 'w') as output_test:
        for i in range(k):
            test_seq = k_fold_sequence[i]
            train_seq = []
            for j in range(k):
                if i != j:
                    train_seq.extend(k_fold_sequence[j])

            nc = NC(input_size=raw_data_size,
                    n_classes=n_classes,
                    raw_feature_dim=n_feat)
            xtrain = x[train_seq]
            ytrain = y[train_seq]
            nc.fit([xtrain, xtrain], ytrain)

            eval_train_result = nc.evaluation(xtrain, ytrain)
            print(eval_train_result)
            # Log the fold index (not the fold count) next to each result.
            output_train.write('%s\n' % ([i, eval_train_result],))
            train_results.append(eval_train_result)

            eval_test_result = nc.evaluation(x[test_seq], y[test_seq])
            print(eval_test_result)
            output_test.write('%s\n' % ([i, eval_test_result],))
            test_results.append(eval_test_result)

            y_validate_k = nc.predict(x_validate)
            y_validate_k = y_validate_k.argmax(axis=1)

        # Average over all folds rather than over the last fold only.
        output_train.write(
            '%s\n' % ({'average': np.mean(train_results, axis=0)},))
        output_test.write(
            '%s\n' % ({'average': np.mean(test_results, axis=0)},))

    # Only the predictions of the last fold's model are saved.
    y_validate_file_path = 'ytest.txt'
    np.savetxt(fname=y_validate_file_path,
               X=np.asarray(y_validate_k),
               fmt='%1.2f')
コード例 #3
0
    def __init__(self):
        """Create the data pipeline, hyper-parameters, models and optimizers.

        Builds a ``PrepareData`` instance and a shuffled single-sample
        ``DataLoader`` over ``Seq2SeqDataset``, copies the language and
        character lookup attributes from the prepared data, instantiates the
        encoder, decoder and attention decoder, moves them to the GPU when
        one is available, and sets up Adam optimizers for the encoder and the
        decoder plus an NLL loss.
        """
        self.data = PrepareData()
        self.dataset = Seq2SeqDataset()
        self.data_loader = DataLoader(dataset=self.dataset,
                                      batch_size=1,
                                      shuffle=True)
        # Read the lookups from the instance created above; the bare name
        # `data` is not defined inside this method.
        self.lang_1 = self.data.lang_1
        self.lang_2 = self.data.lang_2
        self.char2index = self.data.char2index
        self.index2char = self.data.index2char

        self.input_size = 100
        self.hidden_size = 64
        self.output_size = 100
        self.learning_rate = 0.01
        self.num_epoch = 500
        self.teacher_forcing = True
        self.use_cuda = torch.cuda.is_available()
        self.device = 'cuda:0' if self.use_cuda else 'cpu'

        self.encoder = EncoderRNN(input_size=self.input_size, hidden_size=self.hidden_size)
        self.decoder = DecoderRNN(output_size=self.output_size, hidden_size=self.hidden_size)
        self.attn_decoder = AttnDecoder(self.hidden_size, self.output_size)

        if self.use_cuda:
            # Keep all three modules on the same device.
            self.encoder = self.encoder.to(self.device)
            self.decoder = self.decoder.to(self.device)
            self.attn_decoder = self.attn_decoder.to(self.device)

        # Only the encoder and the plain decoder get an optimizer here.
        self.encoder_optimizer = torch.optim.Adam(self.encoder.parameters(), lr=self.learning_rate)
        self.decoder_optimizer = torch.optim.Adam(self.decoder.parameters(), lr=self.learning_rate)

        self.loss_function = nn.NLLLoss()
コード例 #4
0
    def testFutureData(self):
        """getFutureFiltered keeps the frame length and adds 'FutureFilter'.

        Uses prices 1..15 with the reversed filter [0, 0.5, 0.5] and checks
        the frame length, the number of finite filter values, that price and
        volume are untouched, and two expected filter values.
        """
        fakePrice = np.array(range(1, 16))  # 1-15
        fakeVolume = np.array(range(15))    # 0-14
        fakeDate = np.array(range(15))      # 0-14, used as the index

        dataFrame = pd.DataFrame({'price': fakePrice}, index=fakeDate)
        dataFrame['volume'] = pd.Series(fakeVolume, index=fakeDate)

        myfilter = np.array([0, 0.5, 0.5])
        myfilter = myfilter[::-1]

        retFr = PrepareData.getFutureFiltered(dataFrame, myfilter)

        # The length of the data frame must be unchanged.
        self.assertEqual(len(retFr), 15)
        filteredFrame = retFr[np.isfinite(retFr['FutureFilter'])]
        # Checking length and content of returned vectors
        self.assertEqual(len(filteredFrame['FutureFilter']), 13)
        self.assertEqual(retFr['price'][12], 13)
        self.assertEqual(retFr['volume'][12], 12)

        # Checking resulting vector; floats are compared approximately so the
        # test does not depend on the exact order of the arithmetic.
        # Row 0:  (2 + 3) / 2 = 2.5     => 2.5 / 1 - 1.0 = 1.5
        self.assertAlmostEqual(retFr['FutureFilter'][0], 1.5)
        # Row 12: (14 + 15) / 2 = 14.5  => 14.5 / 13 - 1.0
        self.assertAlmostEqual(retFr['FutureFilter'][12], 14.5 / 13 - 1.0)
コード例 #5
0
ファイル: TestNB.py プロジェクト: Guhaifudeng/MLInAction
def testingNB():
    """Train naive Bayes on the sample posts and classify two test entries.

    Loads the posts and their class labels with ``LoadData.loadDataSet``,
    turns each post into a set-of-words vector over the vocabulary, trains
    with ``NaiveBayesianModel.trainNB0`` and prints the predicted class of
    two hard-coded word lists.
    """
    listOPosts, listClasses = LoadData.loadDataSet()
    myVocabList = PrepareData.createVocabList(listOPosts)
    trainMat = [PrepareData.setOfWords2Vec(myVocabList, postinDoc)
                for postinDoc in listOPosts]
    p0V, p1V, pAb = NaiveBayesianModel.trainNB0(array(trainMat),
                                                array(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = array(PrepareData.setOfWords2Vec(myVocabList, testEntry))
        result = NaiveBayesianModel.classifyNB(thisDoc, p0V, p1V, pAb)
        # Single-argument print() produces the same line on Python 2 and 3
        # as the former `print a, 'classified as: ', b` statement.
        print('%s classified as:  %s' % (testEntry, result))
コード例 #6
0
def get_link_index(usecase, sourcecode):
    """Return the index of the matching link in the test meta data, or -1.

    An entry matches when the first two items of its "link name" equal
    ``usecase`` and ``sourcecode`` respectively; the first match wins.
    """
    for position, entry in enumerate(PrepareData.get_test_meta_data()):
        candidate_usecase = entry["link name"][0]
        candidate_sourcecode = entry["link name"][1]
        if usecase == candidate_usecase and sourcecode == candidate_sourcecode:
            return position
    # No entry links this use case to this source file.
    return -1
コード例 #7
0
    def testUseHistFilter(self):
        """getHistoryFiltered with lags [5, 4, 2] yields the expected change.

        For every row from position 5 on, the 'HistoryFilter' value must be
        the mean of the prices 5, 4 and 2 rows back, divided by the current
        price, minus 1.
        """
        price = list(range(1, 20))
        volume = list(range(1, 20))
        timestamp = list(range(1, 20))
        dataFrame = pd.DataFrame({'price': price, 'volume': volume},
                                 index=timestamp)

        myfilter = PrepareData.getHistoryFilter([5, 4, 2])
        retFrame = PrepareData.getHistoryFiltered(dataFrame, myfilter)

        # Start at 5: earlier rows do not have a price 5 rows back.
        for i in range(5, len(retFrame)):
            meanP = (price[i - 5] + price[i - 4] + price[i - 2]) / 3.0
            change = meanP / price[i] - 1.0
            self.assertAlmostEqual(change, retFrame['HistoryFilter'].iloc[i])
コード例 #8
0
 def testFutureFilter(self):
     """getFutureFilter([2, 4, 6, 8]) has 9 entries, 0.25 at each offset, sum 1."""
     offsets = [2, 4, 6, 8]
     weights = PrepareData.getFutureFilter(offsets)
     self.assertEqual(len(weights), 9)
     # Each requested offset, counted from the end, carries an equal weight.
     for offset in offsets:
         self.assertEqual(weights[-offset], 0.25)
     self.assertEqual(np.sum(weights), 1.0)
コード例 #9
0
    def select_images(self):
        """Collect the checked boxes per condition and count the matches.

        Returns:
            dict mapping each condition name ("DENSITY", "SUBTLETY",
            "ASSESSMENT", "LESION_TYPE") to the result of
            ``get_checked_boxes`` for that condition.
        """
        # (condition name, attribute holding that condition's categories)
        condition_sources = (
            ("DENSITY", "den_var_categories"),
            ("SUBTLETY", "subt_var_categories"),
            ("ASSESSMENT", "assesm_var_categories"),
            ("LESION_TYPE", "lesion_var_categories"),
        )

        all_conditions_list = {}
        for condition, attribute in condition_sources:
            categories = getattr(self, attribute)
            all_conditions_list[condition] = self.get_checked_boxes(
                categories, condition)

        PrepareData(all_conditions_list).count_values()

        return all_conditions_list
コード例 #10
0
def CompareInitData(OldTmsIP, NewTmsIP, OldPort, NewPort):
    """Compare the init tables of the old and the new TMS and print a verdict.

    Both data sets are fetched with ``ParseInit`` and normalised with
    ``PrepareData.PrepareData``.  For every pair of tables with the same id
    (item 0) the table data (item 1) is compared; on a mismatch both
    versions are printed.
    """
    old_raw = ParseInit(OldTmsIP, OldPort)
    new_raw = ParseInit(NewTmsIP, NewPort)
    old_tables = PrepareData.PrepareData(old_raw)
    new_tables = PrepareData.PrepareData(new_raw)

    for old_table in old_tables:
        old_id, old_data = old_table[0], old_table[1]
        for new_table in new_tables:
            new_id, new_data = new_table[0], new_table[1]
            if old_id == new_id:
                identical = old_data == new_data
                verdict = "\t was Successful" if identical else "\t was Failed!!!"
                print("TableID: " + str(old_id) + verdict)
                if not identical:
                    # Show both versions so the difference can be inspected.
                    print(old_data + "\n" + new_data)
コード例 #11
0
    def testUseFutureFilter(self):
        """getFutureFiltered with offsets [1, 3, 5] yields the expected change.

        For every checked row the 'FutureFilter' value must be the mean of
        the prices 1, 3 and 5 rows ahead, divided by the current price,
        minus 1.
        """
        price = list(range(1, 20))
        volume = list(range(1, 20))
        timestamp = list(range(0, 19))

        dataFrame = pd.DataFrame({'price': price}, index=timestamp)
        dataFrame['volume'] = pd.Series(volume, index=timestamp)

        myfilter = PrepareData.getFutureFilter([1, 3, 5])
        newFrame = PrepareData.getFutureFiltered(dataFrame, myfilter)

        # Only rows with a complete look-ahead window are checked.
        length = len(newFrame['price']) - len(myfilter)

        for i in range(length):
            meanP = (price[i + 1] + price[i + 3] + price[i + 5]) / 3.0
            change = meanP / price[i] - 1.0
            self.assertAlmostEqual(change, newFrame['FutureFilter'][i])
コード例 #12
0
def Run(args):
    """Aggregate the cleaned data, dump train/test sets and run the models.

    Args:
        args: mapping passed on to ``ReadData.Run``; its 'scale' entry must
            be 'Hourly' or 'Daily'.

    Side effects: registers ``closePlot`` as key-press handler on the current
    figure, prints the correlation of the aggregated data, creates the dump
    directory if needed and reads the model parameters from
    'Params/<site>.json'.
    """
    plt.gcf().canvas.mpl_connect('key_press_event', closePlot)

    # get the combined data - cleaned
    Data = ReadData.Run(args)
    measure_cols = ['DNI', 'GHI', 'DHI']

    # sort out the scale info
    scale_map = {'Hourly': 'Hour', 'Daily': 'Date'}
    scale = scale_map[args['scale']]
    window = 30

    # take sum on a given time scale; only the first letter of the scale
    # name is passed to aggregateDf
    Data_sum = PrepareData.aggregateDf(Data, scale[0], 'sum')

    # get correlation between the measure columns
    # (single-argument print() gives identical output on Python 2 and 3)
    print('Correlation in {}'.format(measure_cols))
    print('%s \n' % (Data_sum.corr(),))

    # make the dataset & dump
    dump_dir = 'Dumped Dataset/Suny/{} {}'.format('Cont', window)
    site = dump_dir.split('/')[1].lower()
    if not os.path.exists(dump_dir):
        os.makedirs(dump_dir)

    PrepareData.createDataSets(Data_sum,
                               'Cont',
                               input_measure_cols=['DNI'],
                               output_measure_cols=['DNI'],
                               split=True,
                               window=window,
                               dump_dir=dump_dir)
    train_in, train_out, test_in, test_out = PrepareData.loadDumpedData(
        dump_dir)

    # Close the params file before the models start running.
    with open(os.path.join('Params', '{}.json'.format(site))) as paramsF:
        paramsFactory = json.load(paramsF)
    runModels(train_in, train_out, test_in, test_out, scale, paramsFactory)
コード例 #13
0
def capture_more_link():
    """Extend the classifier's predicted links and report both phases.

    Phase 1 is the prediction list returned by ``Phase3Classifier.train``.
    Phase 2 starts as a copy of it and is handed to the implements/extends
    recovery helpers for every link that phase 1 predicted as 1.  The result
    and the number of predicted links are printed for both phases.
    """
    test_x, test_y = [], []
    test_meta_data = PrepareData.get_test_meta_data()
    PrepareData.get_ml_data(test_meta_data, test_x, test_y)

    predict_y_phase1 = list(Phase3Classifier.train())
    predict_y_phase2 = predict_y_phase1.copy()

    sourcecode_dict = ProcessSourceCode.get_sourcecode_infomation()

    # Positions that phase 1 predicted as a link.
    predicted_links = [
        position for position, label in enumerate(predict_y_phase1)
        if label == 1
    ]
    for position in predicted_links:
        link_name = test_meta_data[position]["link name"]
        usecase, sourcecode = link_name[0], link_name[1]
        for recover in (Implements_Extends_Recovery.capture_implements_link,
                        Implements_Extends_Recovery.capture_extends_link):
            recover(usecase, sourcecode, predict_y_phase2, test_meta_data,
                    sourcecode_dict)

    reports = (("phase 1 : ", predict_y_phase1),
               ("phase 2 : ", predict_y_phase2))
    for title, predictions in reports:
        print(title)
        Result.get_result(test_y, predictions)
        print("capture link =", predictions.count(1))
コード例 #14
0
ファイル: main.py プロジェクト: zardoss/invoice-generator
    def selectFile(self):
        """Let the user pick a CSV/Excel file and pass the CSV to ``o.main``.

        An '.xlsx' file is first converted to a CSV with the same base name,
        written to the current working directory.  After ``o.main`` succeeds,
        ``self.filename`` is prefixed with "reformatted_".  The final value
        of ``self.filename`` is copied to ``self.dFileName``.
        """
        print("Selecting CSV/Excel file")
        # The first item of the dialog result is used as the chosen path.
        self.filename = QFileDialog.getOpenFileName()
        path = self.filename[0]
        # NOTE(review): when no file is chosen, self.filename (and therefore
        # self.dFileName) keeps the raw dialog result - confirm callers
        # expect that.

        # If file is selected
        if len(path) > 0:
            # Store file name to filename
            self.filename = os.path.basename(path)
            if self.filename.endswith(".xlsx"):
                print("[*]\tConverting excel spreadsheet to csv")
                try:
                    # Read from the full selected path; the bare file name
                    # only resolves when the file is in the working directory.
                    self.df = pd.read_excel(path)
                    # Swap only the trailing extension for .csv.
                    self.filename = self.filename[:-len(".xlsx")] + ".csv"
                    # index=False prevents pandas from writing the row index
                    self.df.to_csv(self.filename, index=False)
                except Exception as e:
                    # Best effort: report the problem and carry on.
                    print(e)
            # Check if it's a csv file (including one just converted above)
            if self.filename.endswith(".csv"):
                print("[*]\tCSV File selected")
                try:
                    o.main(self.filename)
                    self.filename = "reformatted_" + self.filename
                except Exception as e:
                    print(f"[*]\tFail\t{e}")
            # Neither a CSV nor a successfully converted Excel file
            else:
                print(f"Not sure what format this is...\"{self.filename}\"")
        self.dFileName = self.filename
コード例 #15
0
ファイル: LoadData.py プロジェクト: Guhaifudeng/MLInAction
def loadImages(dirName):
    """Load every file in ``dirName`` as a 1024-value row plus a +/-1 label.

    The label comes from the number before the first '_' in the file name:
    9 maps to -1, every other number to 1.

    Returns:
        Tuple ``(trainingMat, hwLabels)``: an (m, 1024) matrix with one row
        per file and the list of m labels.
    """
    from os import listdir
    fileNames = listdir(dirName)  # load the training set
    trainingMat = zeros((len(fileNames), 1024))
    hwLabels = []
    for row, fileName in enumerate(fileNames):
        stem = fileName.split('.')[0]  # take off .txt
        classNum = int(stem.split('_')[0])
        hwLabels.append(-1 if classNum == 9 else 1)
        trainingMat[row, :] = pred.img2vector('%s/%s' % (dirName, fileName))
    return trainingMat, hwLabels
コード例 #16
0
def plot_avalanche_activity_index():
    '''
    :def: Draws a box plot labelled "Avalanche Activity Index (AAI)" per
    avalanche danger level, followed by a stacked bar plot of the danger
    level against the season. Both plots read the module-level ``df``.

    :return: void
    '''

    danger_column = 'max.danger.corr'

    # One array of value counts of column 'x' per danger level, in
    # ascending order of the level
    danger_levels = sorted(df[danger_column].unique())
    data = [
        df[df[danger_column] == level]['x'].value_counts().values
        for level in danger_levels
    ]

    # Box plot
    utils.create_box_plot(df=data,
                          xlabel='Avalanche Danger Level',
                          ylabel='Avalanche Activity Index (AAI)')

    # Plot the avalanche danger level with respect to the seasons
    utils.create_stacked_bar_plot(df, 'max.danger.corr', 'season')
コード例 #17
0
    def testParseOrderdepth(self):
        """getOrderDepthRatio on the configured file returns a 613 x 1 result.

        File name and folder are read from section 'Section1' of
        'config.cfg'; the result is also written to 'test.csv'.
        """
        config = ConfigParser.ConfigParser()
        config.read('config.cfg')
        filename = config.get('Section1', 'orderdepthfilename')
        directory = config.get('Section1', 'bitcoinHistDataFolder')

        ratios = PrepareData.getOrderDepthRatio(directory + filename, 60)
        shape = np.shape(ratios)
        self.assertEqual(shape[1], 1)
        self.assertEqual(shape[0], 613)

        startTime = ratios.index[0]
        values = np.array(ratios)
        # Computed but not checked any further.
        relative = values[:, 0] - startTime

        pd.DataFrame(values).to_csv('test.csv')
コード例 #18
0
def performPredictions():
    """Evaluate the saved model with its latest weights on the test samples.

    Loads the architecture from '<model_path>model.json', picks the weight
    file with the highest epoch number in ``Parameters.weights_path``,
    compiles the model, evaluates and predicts on the testing samples,
    prints the predicted label counts and the ROC AUC, and saves the
    predictions with ``writeResults``.
    """
    model_path = Parameters.model_path + 'model.json'
    weights_path = Parameters.weights_path
    prepare = PrepareData.prepareData()
    test, test_label, test_names = prepare.generateTestingSamples()

    # load model file; `with` closes it even if reading or parsing fails
    with open(model_path, 'r') as json_file:
        model = model_from_json(json_file.read())

    # load weights file: the part of the file name before the first '.' is
    # the epoch number. Names without a purely numeric stem are skipped
    # instead of crashing int().
    stems = [f.split('.')[0] for f in listdir(weights_path)]
    epochs = [int(stem) for stem in stems if stem.isdigit()]
    # Falls back to epoch 0 when no numbered file is found, as before.
    lastWeights = max([0] + epochs)
    model.load_weights(weights_path + str(lastWeights) + '.h5')

    optimizer = optimizers.rmsprop(lr=Parameters.learning_rate)
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    # perform predictions (the loss value is not used)
    _, acc = model.evaluate(test, test_label)
    score = model.predict(test)
    # Print number of 0s and 1s predicted
    predicted_label = model.predict_classes(test)
    unique, counts = np.unique(predicted_label, return_counts=True)
    print("\tPredicted labels:", dict(zip(unique, counts)))
    # ROC
    roc = roc_auc_score(test_label, score)

    print('Run:', Parameters.run)
    print('Performance of ProDec-BLSTM: roc:', roc)
    # save predictions to disk
    writeResults(test_names, test_label, score, predicted_label,
                 Parameters.run, roc, acc, True)
コード例 #19
0
    def testHistData(self):
        """getHistoryFiltered adds a 'HistoryFilter' column with expected values.

        Uses prices 1..15 with the reversed filter [0.5, 0.5, 0] and checks
        the column length, that the first two values are NaN, and two
        expected filter values.
        """
        fakePrice = np.array(range(1, 16))  # 1-15
        fakeVolume = np.array(range(15))    # 0-14
        fakeDate = np.array(range(15))      # 0-14
        myfilter = np.array([0.5, 0.5, 0])
        myfilter = myfilter[::-1]

        dataframe = pd.DataFrame(columns=['price', 'volume', 'date'])
        dataframe['price'] = fakePrice
        dataframe['volume'] = fakeVolume
        dataframe['date'] = fakeDate

        df = PrepareData.getHistoryFiltered(dataframe, myfilter)

        # Checking length and content of returned vectors
        self.assertEqual(len(df['HistoryFilter']), 15)
        self.assertTrue(math.isnan(df['HistoryFilter'][0]))
        self.assertTrue(math.isnan(df['HistoryFilter'][1]))

        # Checking resulting vector; floats are compared approximately so the
        # test does not depend on the exact order of the arithmetic.
        # Row 2:  (1 + 2) / 2 = 1.5     => 1.5 / 3 - 1.0 = -0.5
        self.assertAlmostEqual(df['HistoryFilter'][2], -0.5)
        # Row 14: (13 + 14) / 2 = 13.5  => 13.5 / 15 - 1.0 = -0.1
        self.assertAlmostEqual(df['HistoryFilter'][14], 13.5 / 15 - 1.0)
コード例 #20
0
ファイル: Classifier_Model.py プロジェクト: Shniya3/TLR-KRL
def test_classifier_model():
    """Load the train/test link data and print the link counts.

    The feature and label lists are filled in place by
    ``PrepareData.get_ml_data`` and then converted to numpy arrays.
    """
    train_x, train_y = [], []
    test_x, test_y = [], []

    train_meta_data = PrepareData.get_train_meta_data()
    test_metadata = PrepareData.get_test_meta_data()
    PrepareData.get_ml_data(train_meta_data, train_x, train_y)
    PrepareData.get_ml_data(test_metadata, test_x, test_y)

    for caption, labels in (("train correct link =", train_y),
                            ("test correct link =", test_y)):
        print(caption, labels.count(1))

    train_x, train_y, test_x, test_y = (
        np.array(values) for values in (train_x, train_y, test_x, test_y))

    # NOTE(review): predict_y is not assigned anywhere in this function; the
    # step that trains a classifier and predicts on test_x appears to be
    # missing - confirm against the original file.
    print("capture link =", list(predict_y).count(1))
コード例 #21
0
def PlotResults(setupClient, model, X_train, X_test, y_train, y_test, w_train,
                w_test, ix_train, ix_test):

    print(Fore.BLUE + "--------------------------")
    print(Back.BLUE + "         RESULTS          ")
    print(Fore.BLUE + "--------------------------")

    if setupClient.runMode == 'binary' or setupClient.runMode == 'param' or setupClient.runMode == 'SimpleRNN':
        print('Evaluating model on X_test, y_test')
        score = model.evaluate(X_test,
                               y_test,
                               batch_size=setupClient.Params['BatchSize'])
        # testLoss = 'Test loss:%0.3f' % score[0]
        # testAccuracy = 'Test accuracy:%0.3f' % score[1]
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test loss', score[0]))
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test accuracy',
                                        score[1]))

    # get the architecture as a json string
    arch = model.to_json()
    with open(os.path.join(setupClient.ModelSavePath, 'architecture.json'),
              'w') as arch_file:
        print('Saving model as json',
              os.path.join(setupClient.ModelSavePath, 'architecture.json'))
        arch_file.write(arch)
    # now save the weights as an HDF5 file
    model.save_weights(os.path.join(setupClient.ModelSavePath,
                                    'ModelWeights.h5'),
                       overwrite=True)

    if not os.path.isfile(setupClient.TrainedModelPath + '/DNN_Setup'):
        print("Pickle file not found!")
        quit()
    foo = open(setupClient.TrainedModelPath + 'DNN_Setup', "rb")
    bla = pickle.load(foo)
    minusMean = np.multiply(-1, bla.Scaler.mean_)
    OneOverStd = np.divide(1, np.sqrt(bla.Scaler.var_))

    with open(os.path.join(setupClient.ModelSavePath, 'Scaling.txt'),
              'w') as scaleFileOut:
        scaleFileOut.write(
            str(setupClient.InputDNNVariables[setupClient.VarSet]) + '\n')
        scaleFileOut.write('Mean\n' + str(bla.Scaler.mean_) + '\n')
        scaleFileOut.write('minusMean\n' + str(minusMean) + '\n')
        scaleFileOut.write('Var\n' + str(bla.Scaler.var_) + '\n')
        scaleFileOut.write('sqrtVar\n' + str(np.sqrt(bla.Scaler.var_)) + '\n')
        scaleFileOut.write('OneOverStd\n' + str(OneOverStd) + '\n')

    theClasses = []
    print('\nRunning model prediction on X train/test samples')
    yResult_test_cls = []
    yResult_train_cls = []

    yResult_test = model.predict(X_test,
                                 verbose=True,
                                 batch_size=setupClient.Params['BatchSize'])
    yResult_train = model.predict(X_train,
                                  verbose=True,
                                  batch_size=setupClient.Params['BatchSize'])

    #insert the score result back into the original file
    # ix_test['DNN_Score'] = yResult_test
    # ix_train['DNN_Score'] = yResult_train

    # ix_test.to_pickle(setupClient.ModelSavePath+'/ResultsTestPD.pkl',protocol=2)
    # ix_train.to_pickle(setupClient.ModelSavePath+'/ResultsTrainPD.pkl',protocol=2)

    # np.save( os.path.join(setupClient.ModelSavePath, "ResultsTestPD.npy") , ix_test ) # antonio
    # np.save( os.path.join(setupClient.ModelSavePath, "ResultsTrainPD.npy") , ix_train ) # antonio
    # np.save( os.path.join(setupClient.ModelSavePath, "rootBranchSubSample.npy") , ix_test.columns.values) # antonio

    if setupClient.runMode == 'multi':
        yResult_test_cls = np.argmax(
            yResult_test, axis=1)  #stores the element with max score
        yResult_train_cls = np.argmax(
            yResult_train, axis=1)  #stores the element with max score
        theClasses = ['Zjets', 'Signal', 'Diboson', 'Top']
    else:
        yResult_test_cls = np.array([int(round(x[0])) for x in yResult_test])
        yResult_train_cls = np.array([int(round(x[0])) for x in yResult_train])
        theClasses = ['Background', 'Signal']

    # print(X_test[:20])
    # print ('')
    #
    # print(ix_test[:20])
    # print ('')
    # print(yResult_test)
    # quit()
    #
    # print(yResult_test_cls)
    # print ('')
    # print(yResult_train)
    # print ('')
    # print(yResult_train_cls)

    if setupClient.doConfusionMatrix:
        # Plot the confusion matrix
        plt.clf()
        # The class method is:  sklearn.metrics.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None)
        cnf_matrix = confusion_matrix(y_test,
                                      yResult_test_cls,
                                      sample_weight=w_test)
        np.set_printoptions(precision=2)
        plot_confusion_matrix(setupClient,
                              cnf_matrix,
                              classes=theClasses,
                              normalize=True,
                              title='Normalized confusion matrix')

    if setupClient.doEfficiency:
        print('Calculating Efficiencies on Test sample')
        if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN':
            s_eff = w_test[(y_test == 1) & (
                yResult_test_cls > 0.5)].sum() / w_test[y_test == 1].sum()
            b_eff = w_test[(y_test != 1) & (
                yResult_test_cls > 0.5)].sum() / w_test[y_test != 1].sum()
            print(" ")
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Signal efficiency',
                                            s_eff))
            print('{:<35} {:<25.3f}'.format(
                Fore.GREEN + 'Background efficiency:', b_eff))
            print('{:<35} {:<25.3f}'.format(
                Fore.GREEN + 'Background rejection:', 1.0 / b_eff))
        if setupClient.runMode == 'multi':
            channelEffi = channelDic.copy()
            for channel, i in channelDic.items():
                channelEffi[channel] = w_test[(y_test == i) & (
                    yResult_test_cls == 1)].sum() / w_test[y_test == i].sum()
            for channel, eff in channelEffi.items():
                print('{:<35} {:<25.3f}'.format(
                    Fore.GREEN + channel + ' efficiency', eff))

            b_eff = w_test[(y_test != 1) & (
                yResult_test_cls == 1)].sum() / w_test[y_test != 1].sum()
            print('{:<30} {:<20.3f}'.format('Background efficiency', b_eff))
            print('{:<30} {:<20.3f}'.format('Background rejection',
                                            1.0 / b_eff))
        print(" ")

    if setupClient.doScore:
        if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN' or setupClient.runMode == 'param':
            # First create one sample of X_train only from signal and one only from background events
            Xtrain_signal = X_train[y_train == 1]
            Xtrain_background = X_train[y_train != 1]

            # Then do the same for Xtest
            Xtest_signal = X_test[y_test == 1]
            Xtest_background = X_test[y_test != 1]

            # Get predictions of the model on these -train- samples
            print('Running model prediction on Xtrain_signal')
            yhat_train_signal = model.predict(
                Xtrain_signal, batch_size=setupClient.Params['BatchSize'])
            print('Running model prediction on Xtrain_background')
            yhat_train_background = model.predict(
                Xtrain_background, batch_size=setupClient.Params['BatchSize'])

            # Get predictions of the model on these -test- samples
            print('Running model prediction on Xtest_signal')
            yhat_test_signal = model.predict(
                Xtest_signal, batch_size=setupClient.Params['BatchSize'])
            print('Running model prediction on Xtest_background')
            yhat_test_background = model.predict(
                Xtest_background, batch_size=setupClient.Params['BatchSize'])

        hasData = False
        if setupClient.runMode == 'binary' and setupClient.unblind == True:
            # Get the data PD file
            dataFileName = setupClient.PDPath + setupClient.MixPD_TrainTestTag + '_Data.pkl'
            if os.path.isfile(dataFileName):
                hasData = True
                print('Reading Data file:', dataFileName)
                data_full = pd.read_pickle(dataFileName)
                data_full_matrix = data_full[setupClient.InputDNNVariables[
                    setupClient.VarSet]].as_matrix()

                print('{:<45} {:<15}'.format(
                    'Getting Scaler of Training sample from file',
                    Fore.GREEN + setupClient.TrainedModelPath + 'DNN_Setup'))
                if not os.path.isfile(setupClient.TrainedModelPath +
                                      '/DNN_Setup'):
                    print("Pickle file not found!")
                    quit()
                f = open(setupClient.TrainedModelPath + 'DNN_Setup', "rb")
                savedSetupClient = pickle.load(f)
                data_full_matrix = savedSetupClient.Scaler.transform(
                    data_full_matrix)

                # Get predictions on data
                print('Running model prediction on data')
                yhat_data = model.predict(
                    data_full_matrix,
                    verbose=True,
                    batch_size=setupClient.Params['BatchSize'])
                yhat_data_rounded = np.array([round(x[0]) for x in yhat_data])
                # Save as numpy array
                # np.save( os.path.join(setupClient.ModelSavePath,"yhat_data.npy") , yhat_data)
            else:
                print('Data file:', dataFileName,
                      ' not found. Will proceed to MC only')

        if setupClient.runMode == 'SimpleRNN':  # antonio
            for ifile in setupClient.InputFilesSB['Data']:
                dataFileName = setupClient.PDPath + ifile + '_FullNoRandom.pkl'
                if os.path.isfile(dataFileName):
                    hasData = False
                    print('Reading Data file:', dataFileName)
                    data_full = pd.read_pickle(dataFileName)

                    VariablesSet = setupClient.InputDNNVariables[
                        setupClient.VarSet]
                    data_full_matrix = data_full[VariablesSet].copy()
                    var_names = data_full_matrix.keys()
                    new_data_full_matrix = np.zeros(
                        (data_full_matrix.shape[0], 6, 4))

                    for i in range(0, data_full_matrix.shape[0]):
                        for j in range(0, data_full_matrix.shape[1]):
                            new_data_full_matrix[i, int(j / 4), j %
                                                 4] = data_full_matrix.iloc[i,
                                                                            j]
                    data_full_matrix = new_data_full_matrix

                    PrepareData.scale(data_full_matrix,
                                      ['pt', 'eta', 'phi', 'E'], False,
                                      setupClient)  # apply scaling to test set

                    # Get predictions on data
                    print('Running model prediction on data')
                    yhat_data = model.predict(
                        data_full_matrix,
                        verbose=True,
                        batch_size=setupClient.Params['BatchSize'])

                    data_full['RNN_Score'] = yhat_data
                    print(data_full.shape)
                    np.save(
                        os.path.join(setupClient.ModelSavePath,
                                     "ResultsDataMLPD_" + ifile + ".npy"),
                        data_full)  # antonio
                    np.save(
                        os.path.join(
                            setupClient.ModelSavePath,
                            "rootBranchSubSampleForDataML_" + ifile + ".npy"),
                        data_full.columns.values)  # antonio

                else:
                    print('Data file:', dataFileName,
                          ' not found. Will proceed to MC only')

        sns.set_palette("coolwarm", 4)
        # Plot scores
        bins = np.linspace(0, 1, 50)
        plt.hist(yhat_train_signal,
                 bins=bins,
                 histtype='step',
                 lw=2,
                 alpha=0.5,
                 label=[r'Signal Train'],
                 normed=True)
        plt.hist(yhat_test_signal,
                 bins=bins,
                 histtype='stepfilled',
                 lw=2,
                 alpha=0.5,
                 label=[r'Signal Test'],
                 normed=True)
        plt.hist(yhat_test_background,
                 bins=bins,
                 histtype='stepfilled',
                 lw=2,
                 alpha=0.5,
                 label=[r'Background Test'],
                 normed=True)
        plt.hist(yhat_train_background,
                 bins=bins,
                 histtype='step',
                 lw=2,
                 alpha=0.5,
                 label=[r'Background Train'],
                 normed=True)
        if hasData and setupClient.unblind == True:
            # Plot the data as well. Using skh_plt because matplotlib does not come with markers for hist class
            skh_plt.hist(yhat_data,
                         bins=bins,
                         errorbars=True,
                         histtype='marker',
                         label='Data',
                         color='black',
                         normed=True)
        plt.ylabel('Norm. Entries')
        plt.xlabel('DNN score')
        plt.legend(loc="upper center")
        plt.savefig(setupClient.ModelSavePath + "/MC_Data_TrainTest_Score.png")
        plt.yscale('log')
        plt.savefig(setupClient.ModelSavePath +
                    "/MC_Data_TrainTest_Score_log.png")
        plt.clf()

    if setupClient.doROC:
        if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN' or setupClient.runMode == 'param':
            # Get 'Receiver operating characteristic' (ROC)
            fpr, tpr, thresholds = roc_curve(y_test, yResult_test)

            # Compute Area Under the Curve (AUC) from prediction scores
            roc_auc = auc(fpr, tpr)
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC', roc_auc))

            # print "ROC AUC: %0.3f" % roc_auc
            plt.plot(fpr,
                     tpr,
                     color='darkorange',
                     lw=2,
                     label='Full curve (area = %0.2f)' % roc_auc)
            plt.plot([0, 0], [1, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([-0.05, 1.0])
            plt.ylim([0.0, 1.05])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
            plt.title('ROC curves for Signal vs Background')
            plt.legend(loc="lower right")
            # plt.plot([0.038], [0.45], marker='*', color='red',markersize=5, label="Cut-based",linestyle="None")
            # plt.plot([0.038, 0.038], [0,1], color='red', lw=1, linestyle='--') # same background rejection point
            plt.savefig(setupClient.ModelSavePath + "/ROC.png")
            plt.clf()

            ### NOW try the weighted ROC curve
            fpr_w, tpr_w, thresholds_w = roc_curve(y_test,
                                                   yResult_test,
                                                   sample_weight=w_test)
            roc_auc_w = auc(fpr_w, tpr_w, reorder=True)
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC weighted',
                                            roc_auc_w))
            plt.plot(fpr_w,
                     tpr_w,
                     color='darkorange',
                     lw=2,
                     label='Full curve (area = %0.2f)' % roc_auc_w)
            plt.plot([0, 0], [1, 1], color='navy', lw=2, linestyle='--')
            plt.xlim([-0.05, 1.0])
            plt.ylim([0.0, 1.05])
            plt.ylabel('True Positive Rate (weighted)')
            plt.xlabel('False Positive Rate (weighted)')
            plt.title('ROC curve for Signal vs Background')
            plt.legend(loc="lower right")
            # plt.plot([0.038], [0.45], marker='*', color='red',markersize=5, label="Cut-based",linestyle="None")
            # plt.plot([0.038, 0.038], [0,1], color='red', lw=1, linestyle='--') # same background rejection point
            plt.savefig(setupClient.ModelSavePath + "/ROC_weighted.png")
            plt.clf()

            np.save(os.path.join(setupClient.ModelSavePath, "tpr_w.npy"),
                    tpr_w)
            np.save(os.path.join(setupClient.ModelSavePath, "fpr_w.npy"),
                    fpr_w)
            np.save(
                os.path.join(setupClient.ModelSavePath, "thresholds_w.npy"),
                thresholds_w)
            np.save(os.path.join(setupClient.ModelSavePath, "thresholds.npy"),
                    thresholds)
            np.save(os.path.join(setupClient.ModelSavePath, "tpr.npy"), tpr)
            np.save(os.path.join(setupClient.ModelSavePath, "fpr.npy"), fpr)

            np.save(os.path.join(setupClient.ModelSavePath, "AUC.npy"),
                    roc_auc)
            np.save(os.path.join(setupClient.ModelSavePath, "AUC_w.npy"),
                    roc_auc_w)
コード例 #22
0
import Clusterisation, LoadData, MakePlot, PDF_maker, PrepareData, DataFromClusters

# Requested number of clusters.
# NOTE(review): the original note says -1 makes the cluster count follow the
# number of crimes (at most 156) -- confirm in Clusterisation.make_k_means.
n_clusters = 156
# NOTE(review): True presumably selects primary crime types instead of FBI
# codes -- confirm in LoadData.load_from_csv.
primaryType = True

# NOTE(review): per the original note, dictionary_crimes is only meaningful
# when primaryType is True.
X, y, data, dictionary_crimes = PrepareData.prepareData(
    LoadData.load_from_csv(primaryType), primaryType)

# make_k_means returns a pair; its second element replaces the requested
# cluster count from here on.
k_means, n_clusters = Clusterisation.make_k_means(n_clusters, data)

MakePlot.make_plot(k_means, X, y, n_clusters)

data_clusters = DataFromClusters.get_data_clusters(k_means, X, y)

# One PDF per cluster, numbered from 1. The crime dictionary is only passed
# along when primaryType is set.
n = 1
for i in data_clusters:
    if primaryType:
        PDF_maker.make_pdf_clusters(i, dictionary_crimes, n)
    else:
        PDF_maker.make_pdf_clusters_without_descr(i, n)
    n += 1

PDF_maker.makeCounter(data, dictionary_crimes, primaryType)

コード例 #23
0
            slope_output_layer = self.sigmoid_derivative(output_layer_output)
            slope_hidden_layer = self.sigmoid_derivative(hidden_layer_output)

            d_output = E * slope_output_layer

            Error_at_hidden_layer = dot(d_output, self.weights1.T)
            d_hiddenlayer = Error_at_hidden_layer * slope_hidden_layer
            #GRADIENT DESCENT
            self.weights1 = self.weights1 + dot(hidden_layer_output.T,
                                                d_output) * learning_rate
            self.weights0 = self.weights0 + dot(input_layer.T,
                                                d_hiddenlayer) * learning_rate

    def prediction(self, pred):
        """Run a forward pass through both weight layers.

        ``pred`` is multiplied by ``self.weights0`` and passed through
        ``self.sigmoid``; that result is multiplied by ``self.weights1`` and
        passed through ``self.sigmoid`` again. The second activation is
        returned.
        """
        hidden_activation = self.sigmoid(dot(pred, self.weights0))
        return self.sigmoid(dot(hidden_activation, self.weights1))


# Build the network, train it on the data exposed by the PD module, then run
# it on the test inputs.
ann = NeuralNetwork()

# NOTE(review): prepare_training_data() presumably populates PD.training_input
# and PD.training_output, which are read immediately afterwards -- confirm.
PD.prepare_training_data()
nn_train_inp = array(PD.training_input)
# Insert a new axis at position 1 of the training outputs.
nn_train_oup = expand_dims(PD.training_output, axis=1)

# NOTE(review): the third argument is presumably the number of training
# iterations -- confirm against NeuralNetwork.train.
ann.train(nn_train_inp, nn_train_oup, 10000)

# Same pattern for the test data: prepare first, then read PD.testing_input.
PD.prepare_testing_data()
nn_test_inp = array(PD.testing_input)
nn_test_oup = ann.prediction(nn_test_inp)
コード例 #24
0
import random

import torch
from torch import nn
from torch.autograd import Variable
from torch.utils.data import DataLoader

from DecoderRNN import DecoderRNN
from EncoderRNN import EncoderRNN
from PrepareData import *
from attn import Attn
from attn_decoder import AttnDecoder
from seq2seq_dataset import Seq2SeqDataset

# Module-level PrepareData instance, created once at import time.
data = PrepareData()

# NOTE(review): presumably the index of the start-of-sequence token -- confirm
# against the decoder code.
SOS = 0
# True when torch reports an available CUDA device.
use_cuda = torch.cuda.is_available()

class Translate():
    """Bundle the prepared data, the dataset and a data loader for translation.

    The constructor creates its own ``PrepareData`` and ``Seq2SeqDataset``
    instances, wraps the dataset in a shuffling ``DataLoader`` with batch size
    1, and copies the language and character-mapping attributes from the
    ``PrepareData`` instance onto ``self``.
    """

    def __init__(self):
        self.data = PrepareData()
        self.dataset = Seq2SeqDataset()
        self.data_loader = DataLoader(dataset=self.dataset,
                                      batch_size=1,
                                      shuffle=True)
        # Read from the instance's own PrepareData rather than the
        # module-level ``data`` global, so the object is self-contained and
        # self.data is actually used.
        self.lang_1 = self.data.lang_1
        self.lang_2 = self.data.lang_2
        self.char2index = self.data.char2index
        self.index2char = self.data.index2char
コード例 #25
0
    if setupClient.ConvertRootToPD:
        print(Fore.BLUE + "--------------------------")
        print(Back.BLUE + ' CONVERTING ROOT-->PANDAS ')
        print(Fore.BLUE + "--------------------------")
        print('{:<45} {:<15}'.format(
            "Input Flat Ntuples directory",
            Fore.GREEN + setupClient.InputMLNtuplePath))
        print(
            '{:<45} {:<15}'.format('Output Pandas Dataframe directory',
                                   Fore.GREEN + setupClient.PDPath),
            checkCreateDir(setupClient.PDPath))
        print('{:<45} {:<15}'.format(
            'Branches to keep from ROOT file',
            Fore.GREEN + str(setupClient.rootBranchSubSample)))
        PrepareData.convertToPanda(setupClient)
    elif setupClient.CreateTrainTestPD:
        print(Fore.BLUE + "--------------------------")
        print(Back.BLUE + '  CREATING TRAIN/TEST PDs ')
        print(Fore.BLUE + "--------------------------")
        print('{:<45} {:<15}'.format('InputFilesSB',
                                     Fore.GREEN + str(InputFilesSB)))
        print(
            '{:<45} {:<15}'.format('I/O Pandas Dataframe directory',
                                   Fore.GREEN + setupClient.PDPath),
            checkCreateDir(setupClient.PDPath))
        print('{:<45} {:<15}'.format(
            'PD Train/Test Name Tag',
            Fore.MAGENTA + setupClient.MixPD_TrainTestTag))
        print('{:<45} {:<15}'.format(
            'PreselectionCuts', Fore.MAGENTA + setupClient.PreselectionCuts))
コード例 #26
0
from sentence_transformers import SentenceTransformer
import scipy.spatial
import numpy as np
import PrepareData

# Sentence encoder loaded from a locally saved training run.
embedder = SentenceTransformer(
    'output/training_tf-idf_word_embeddings-2020-06-19_15-54-05')

corpus = PrepareData.load_data()

# One vector per document: encode the document and average the resulting
# embeddings along axis 0.
# NOTE(review): presumably each document is a list of sentences -- confirm
# against PrepareData.load_data.
corpus_embeddings = []
for document in corpus:
    document_embedding = np.array(embedder.encode(document)).mean(axis=0)
    corpus_embeddings.append(document_embedding)

# Query sentences:
#
#similarity_matrix = []
#for first_doc in corpus_embeddings:
#    similarity_vector = []
#    for second_doc in corpus_embeddings:
#        similarity_vector.append(1 - scipy.spatial.distance.cosine(first_doc, second_doc))
#    similarity_matrix.append(similarity_vector)
#
#similarity_matrix = np.array(similarity_matrix)
#print(similarity_matrix)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
コード例 #27
0
import numpy as np
import libs.Utils as utils
import PrepareData as data

# Load the cleaned dataset once at import time into the module-level ``df``.
df = data.get_clean_dataset()


def plot_avalanche_activity_per_year():
    '''
    :def: Create a bar plot of the avalanche activity (number of avalanches) per year of study
    (21 years in total, per the original author's note) from the module-level ``df``.

    :return: None
    '''

    # Plot number of avalanches per year
    # NOTE(review): the meaning of each positional argument (including the trailing True) is defined by
    # utils.create_bar_plot -- confirm there.
    utils.create_bar_plot(df, 'year', 'Year', 'Num avalanches', True)

    # From this plot we can see there is no clear pattern of increasing/decreasing avalanche activity over time.
    # In addition, the amount of data collected in the first seven years is much smaller than in the years after.
    # Therefore, comparison between years should be done across the entire data set.


def plot_avalanche_feature_correlation():
    '''
    :def: This function creates the correlation map between a pre-selected columns including categorical variables.

    :return: void
    '''
コード例 #28
0
指导文件文件保存在
http://nbviewer.jupyter.org/github/BlueBirdHouse/CarND-TensorFlow-Lab/blob/master/lab.ipynb
"""
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

import PrepareData

# Commented-out manual check of normalize_grayscale on a small hand-made array.
#A3 = np.array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 255],[255, 255, 255, 255, 255, 255, 255, 255, 255,255,255,255]])
#train_features1,NG = PrepareData.normalize_grayscale(A3)

#Temp = 0

#%% Load the data
# Load the training set
train_features, train_labels = PrepareData.uncompress_features_labels('notMNIST_train.zip')

# Load the test set
test_features, test_labels = PrepareData.uncompress_features_labels('notMNIST_test.zip')


#%% Problem 1: data normalization
# Problem 1 - Implement Min-Max scaling for grayscale image data
# normalize_grayscale returns two values; the second (*_NG) is handed to deleteNan below.
train_features,train_features_NG = PrepareData.normalize_grayscale(train_features)
test_features,test_features_NG = PrepareData.normalize_grayscale(test_features)

#%% Remove problematic data
# NOTE(review): presumably drops the samples flagged by the *_NG output together with their labels -- confirm in PrepareData.deleteNan.
train_features,train_labels = PrepareData.deleteNan(train_features,train_labels,train_features_NG)
test_features,test_labels = PrepareData.deleteNan(test_features,test_labels,test_features_NG)

#%% Take a subset of the original dataset; the original author did this to suit use in a virtual environment
コード例 #29
0
def plot_avalanche_activity_vs_aspect():
    '''
    :def: Plot the avalanche activity per slope aspect as a two-ring pie chart.

    One data frame per compass aspect is fetched from ``data.get_df_aspect()``
    (order: N, NE, E, SE, S, SW, W, NW) and the rows of each are counted. Every
    aspect gets an equally sized slice; the count is conveyed by the colour
    weight and by the labels of the second ring, which show the count and its
    share of the total as a whole-number percentage.

    :raises ValueError: if ``data.get_df_aspect()`` does not yield exactly
        eight frames.
    :return: None
    '''
    # - Is the orientation of the avalanche important?
    coordinates = [
        'North', 'North-East', 'East', 'South-East', 'South', 'South-West',
        'West', 'North-West'
    ]
    aspect_frames = tuple(data.get_df_aspect())
    if len(aspect_frames) != len(coordinates):
        raise ValueError('expected %d aspect data frames, got %d' %
                         (len(coordinates), len(aspect_frames)))

    # For better observation each aspect gets the same space in the pie
    pie_weights = [1 / 8] * 8
    # The colour of each slice, not its size, conveys the number of avalanches
    weight = [len(frame) for frame in aspect_frames]
    # NOTE(review): the colour weights are normalised by the North-East count
    # (index 1), exactly as before; confirm that this -- rather than e.g. the
    # largest count -- is intended.
    weight_cmap = np.true_divide(weight, weight[1])

    # Share of each orientation as a whole-number percentage. Round to the
    # nearest integer instead of truncating: 0.29 * 100 evaluates to
    # 28.999999999999996 in floating point, which int() would report as 28.
    total = sum(weight)
    percentage_int = [
        int(share) for share in np.rint(np.true_divide(weight, total) * 100)
    ]

    percentage_str = [
        str(count) + '\n' + str(share) + '%'
        for count, share in zip(weight, percentage_int)
    ]

    utils.create_two_pie(sizes1=pie_weights,
                         sizes2=pie_weights,
                         labels1=coordinates,
                         labels2=percentage_str,
                         colorweight=weight_cmap,
                         startangle=90 + 22)
コード例 #30
0
        X_train,X_test,y_train,y_test = train_test_split(Train.X,Train.y,test_size=0.25,random_state=4)
        clf = xgb.XGBModel(max_depth=8,n_estimators=100,objective="reg:linear", random_state=17,n_jobs=-1)
        clf.fit(X_train, y_train, eval_metric='rmse', verbose = True, eval_set = [(X_train,y_train),(X_test, y_test)])
        clf.save_model('./model/XGBoost.model')
        pickle.dump(clf, open("XGBosst.pickle.dat", "wb"))





if __name__ == "__main__":
    # Command-line arguments: argv[1] is handed to MakeSeperateFile, argv[2]
    # is the CSV read into test_df further down.
    aa = MakeSeperateFile.MakeSeperateFile(sys.argv[1])
    aa.makeFile()

    # Bind the unpickled object to its own name. Assigning it to `list` here
    # would rebind the builtin for the whole module, breaking any later
    # `list(...)` call made by code defined in this file.
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # produced by this pipeline.
    with open('BusIdList.csv', 'rb') as f:
        bus_id_list = pickle.load(f)
    data = PrepareData.PrepareData(bus_id_list, 2000)
    data.prepareData()

    # Column names applied to finalData.csv when it is read below.
    col = [
        "Lat1", "Lat2", "Long1", "Long2", "hour", "minute", "second",
        "timeTaken", "week", 'sin1', 'sin2', 'sin3', 'sin4', 'sin5', 'sin6',
        'sin7', 'sin8', 'sin9', 'sin10', 'sin11', 'sin12', 'sin13', 'sin14',
        'cos1', 'cos2', 'cos3', 'cos4', 'cos5', 'cos6', 'cos7', 'cos8', 'cos9',
        'cos10', 'cos11', 'cos12', 'cos13', 'cos14'
    ]
    final_data = pd.read_csv('./final_data/finalData.csv', names=col)
    train = Train(final_data)
    train.Linear_Regression()
    train.RandomForest()
    train.XGBoost()

    test_df = pd.read_csv(sys.argv[2])
    # Close the model file deterministically instead of leaking the handle.
    with open('./model/LinearRegression.pickle', 'rb') as model_file:
        model = pickle.load(model_file)
コード例 #31
0
def TrainingModel():
    """Train the model one epoch at a time and keep the best-ROC epoch.

    The data comes from ``PrepareData.prepareData().generateInputData()`` and
    the network from ``get_model()``. The architecture is written as JSON under
    ``Parameters.model_path``. After every epoch the model is evaluated on the
    test set, the ROC AUC of its scores is computed, and the weights are saved
    whenever that ROC is at least as good as the best seen so far. Training
    stops early once training accuracy exceeds test accuracy by more than 0.2.
    The best epoch's results are passed to ``writeResults``.

    NOTE(review): ``rocValues`` is not assigned in this function; it is
    expected to be a list defined at module level -- confirm.

    Returns:
        The per-epoch lists ``(train_acc, train_loss, test_acc, test_loss)``.
    """
    nb_epochs = Parameters.nb_of_epochs
    prepare = PrepareData.prepareData()
    (train, train_label, test, test_label,
     test_names) = prepare.generateInputData()
    best_roc = -1

    # Build the network and print its summary.
    model = get_model()
    print(model.summary())

    # Store the network structure as JSON.
    architecture_json = model.to_json()
    with open(Parameters.model_path + 'model' + ".json", "w") as json_file:
        json_file.write(architecture_json)

    writeParamsModel()
    train_acc, train_loss = [], []
    test_acc, test_loss = [], []

    for epoch in range(nb_epochs):
        print('\nEPOCH:', epoch + 1, 'of', nb_epochs)
        history = model.fit(train,
                            train_label,
                            epochs=1,
                            shuffle=True,
                            batch_size=Parameters.batch_size)
        loss, acc = model.evaluate(test, test_label)

        # Training metrics of this single-epoch fit.
        epoch_metrics = history.history
        train_acc.append(epoch_metrics.get('acc')[0])
        train_loss.append(epoch_metrics.get('loss')[0])

        # Test metrics.
        print('\tTest - loss: ', loss, '- acc:', acc)
        test_acc.append(acc)
        test_loss.append(loss)

        # Scores and hard labels on the test set, plus the label histogram.
        score = model.predict(test)
        predicted_label = model.predict_classes(test)
        unique, counts = np.unique(predicted_label, return_counts=True)
        print("\tPredicted labels:", dict(zip(unique, counts)))

        roc = roc_auc_score(test_label, score)
        rocValues.append(roc)
        print('\tROC =', roc)

        # Remember (and checkpoint) the epoch with the best ROC so far.
        if roc >= best_roc:
            best_roc, save_epoch = roc, epoch
            best_score, best_predicted_label, bestAcc = (score,
                                                         predicted_label, acc)
            model.save_weights(Parameters.weights_path + str(save_epoch) +
                               ".h5")

        save_plots(train_acc, train_loss, test_acc, test_loss, rocValues)
        # Stop if too much overfitting appears: the training accuracy of this
        # epoch is more than 0.2 above the test accuracy.
        if train_acc[-1] - acc > 0.2:
            break

    print('Run:', Parameters.run)
    print('Performance of ProDec-BLSTM: roc:', best_roc)

    writeResults(test_names, test_label, best_score, best_predicted_label,
                 Parameters.run, best_roc, bestAcc, False)
    return train_acc, train_loss, test_acc, test_loss
コード例 #32
0
import matplotlib.pyplot as PLT
import PrepareData as PD
import Accuracy as ACC

accuracy = ACC.count_accuracy()

PD.original_graph_data()

# Original samples: one scatter series per labelled species.
for xs, ys, species, colour in (
        (PD.p1x, PD.p1y, "setosa", "red"),
        (PD.p2x, PD.p2y, "versicolor", "green"),
        (PD.p3x, PD.p3y, "virginica", "blue"),
):
    PLT.scatter(xs, ys, label=species, color=colour, marker="o", s=50)

# Predicted samples, drawn as larger black vertical-bar markers.
PLT.scatter(ACC.pred_x,
            ACC.pred_y,
            label="prediction",
            color="black",
            marker="|",
            s=100)

# Axis labels; the value returned by ACC.count_accuracy() is the plot title.
PLT.xlabel('sepal length')
PLT.ylabel('sepal width')
PLT.title(accuracy)
PLT.legend()
PLT.show()
コード例 #33
0
# Extract the train and test archives named by train_filename / test_filename
# (both defined earlier in the script).

train_folders = ImportData.maybe_extract(train_filename)
test_folders = ImportData.maybe_extract(test_filename)

# Seed NumPy's random generator so every run draws the same random series.
np.random.seed(133)

# Problem 1: display sample images to check that they look right.
# The visual checks below are skipped when SKIP is set.
print("Problem 1: display samples of images to check if they looks good")
if not SKIP:
    DataCheckers.displayLettersAsImage(test_folders)

# Create the classes, normalize the dataset and store it in a manageable format.
# NOTE(review): the second argument is presumably a minimum number of images
# per class -- confirm in PrepareData.maybe_pickle.

train_dataset = PrepareData.maybe_pickle(train_folders, 15000)
test_datasets = PrepareData.maybe_pickle(test_folders, 1800)

# Problem 2: display letters from the dataset arrays to check that they look right.
print(
    "Problem 2 display letter form the dataset array to check if it looks good"
)
if not SKIP:
    DataCheckers.plotRandomLettersFromDataset(test_datasets)
    DataCheckers.plotRandomLettersFromDataset(train_dataset)

# Problem 3: check whether the samples are evenly distributed across classes.
print("Problem 3 check if the repartition is even")
if not SKIP:
    print("Variance test dataset: " +
          str(DataCheckers.checkRepartition(test_datasets)))
コード例 #34
0
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
'''

import PrepareData
import Models
import quandl
import h2o

import os

# The Quandl API key can be supplied through the QUANDL_API_KEY environment
# variable.
# NOTE(review): the literal below is the key that was hard-coded here before;
# it is kept only as a fallback so existing runs behave the same. Credentials
# should not live in source control -- rotate this key and drop the fallback.
quandl.ApiConfig.api_key = os.environ.get("QUANDL_API_KEY",
                                          "rFhqT3ot2z_6AnzpB9nU")

#Get all data needed from Quandl, Preprocess I and II and have features Dataframe ready to train
X = PrepareData.PrepareFeatures()

#Prepares label to predict CRISIS 6 months ahead and gets labels Series
y = PrepareData.PrepareLabel(months_ahead= 6, dates_index= X.index)

#Creates one dataset
dataset = PrepareData.MergeDataset(X,y)

#Trains our preliminary model as documented in the pdf
testframe, prel_model = Models.TrainPreliminarModel(dataset)

#Gets Performance Summary on preliminary model
# NOTE(review): the returned summary is discarded; when run as a plain script
# (rather than in a notebook) nothing is displayed -- confirm whether it
# should be printed.
prel_model.model_performance(testframe)

#Trains our final model as documented in the pdf with 5 fold CV
final_model = Models.TrainCrossValidation5FoldFinalModel(dataset)
コード例 #35
0
    char_file_path = 'char.json'
    with open(char_file_path, 'r') as char_file:
        char_list = js.load(char_file)

    char_dic = Build_Char_One_Hot_Dic.one_hot_encoding(char_list)
    one_hot_feature_dim = len(char_list)

    train_file_path = str(Path().resolve().parent
                          ) + '/Offline-Challenge/test/xtrain_obfuscated.txt'
    with open(train_file_path, 'r') as raw_data_file:
        raw_data = read_raw_data(raw_data_file)

    validate_file_path = str(
        Path().resolve().parent) + '/Offline-Challenge/xtest_obfuscated.txt'
    with open(validate_file_path, 'r') as validate_raw_file:
        validate_raw = read_raw_data(validate_raw_file)

    x_one_hot, text_length = PrepareData.prepare_data(raw_data, char_dic,
                                                      one_hot_feature_dim)
    x_validate_one_hot, text_length = PrepareData.prepare_data(
        validate_raw, char_dic, one_hot_feature_dim, text_length=text_length)

    label_file_path = str(
        Path().resolve().parent) + '/Offline-Challenge/test/ytrain.txt'
    with open(label_file_path, 'r') as label_file:
        label_data = read_label(label_file)
    y = label_data

    main()
コード例 #36
0
ファイル: main.py プロジェクト: kwkwvenusgod/SAP_Challenge
def main():
    """Run 5-fold training and write metrics and ensembled predictions.

    Reads the module-level ``x_one_hot``, ``x_validate_one_hot``,
    ``text_length`` and ``y``. For every fold a fresh ``NC`` model is fitted on
    the other folds, evaluated on the training and held-out parts, and used to
    predict the validation set. Per-fold evaluation results and their average
    go to ``train_acc.txt`` / ``test_acc.txt``; the per-fold validation
    predictions go to ``ytest_all.txt`` and their majority vote to
    ``ytest.txt``.

    Command line: optional integers giving the raw n-gram combination
    (default ``[5]``).
    """
    # you can input a list of integers for the raw n gram combination
    if len(sys.argv) == 1:
        n_gram_list = [5]
    else:
        # Build a real list: on Python 3 ``map`` returns a one-shot iterator,
        # and n_gram_list is consumed twice below.
        n_gram_list = [int(arg) for arg in sys.argv[1:]]

    x = PrepareData.feat_extraction(n_gram_list, x_one_hot)

    x_validate = PrepareData.feat_extraction(n_gram_list, x_validate_one_hot)
    n_feat = x.shape[1]
    raw_data_size = (n_feat, text_length, 1)

    n_classes = y.shape[1]
    k = 5
    k_fold_sequence = data_set_k_fold_separation(x.shape[0], k)

    y_validate = []
    train_loss_acc = []
    test_loss_acc = []
    # Text mode plus explicit write() works on both Python 2 and 3 (the old
    # ``print >> file`` form raises TypeError on Python 3), and the ``with``
    # block guarantees both log files are closed.
    with open('train_acc.txt', 'w') as output_train, \
            open('test_acc.txt', 'w') as output_test:
        for i in range(k):
            test_seq = k_fold_sequence[i]
            train_seq = []
            for j in range(k):
                if i != j:
                    train_seq.extend(k_fold_sequence[j])

            nc = NC(input_size=raw_data_size,
                    n_classes=n_classes,
                    raw_feature_dim=n_feat)
            xtrain = x[train_seq]
            ytrain = y[train_seq]
            nc.fit([xtrain, xtrain], ytrain)

            eval_train_result = nc.evaluation([xtrain, xtrain], ytrain)
            train_loss_acc.append(eval_train_result)
            print(eval_train_result)
            # Log the fold index; the previous code logged the constant k on
            # every line, which made the folds indistinguishable.
            output_train.write(str([i, eval_train_result]) + '\n')
            xtest = x[test_seq]
            ytest = y[test_seq]
            eval_test_result = nc.evaluation([xtest, xtest], ytest)
            test_loss_acc.append(eval_test_result)
            print(eval_test_result)
            output_test.write(str([i, eval_test_result]) + '\n')

            # Class prediction of this fold's model for the validation set.
            y_validate_k = nc.predict([x_validate, x_validate])
            y_validate_k = y_validate_k.argmax(axis=1)

            y_validate.append(y_validate_k)
        output_train.write(
            str(['average',
                 np.mean(np.asarray(train_loss_acc), axis=0)]) + '\n')
        output_test.write(
            str(['average',
                 np.mean(np.asarray(test_loss_acc), axis=0)]) + '\n')

    y_validate_file_path = 'ytest_all.txt'
    np.savetxt(fname=y_validate_file_path, X=np.asarray(y_validate), fmt='%i')
    # Majority vote over the k fold models for every validation sample.
    Y = np.asarray(y_validate).transpose()
    y_final = []
    for tmp in Y:
        y_final.append(np.bincount(tmp).argmax())
    y_final_validate_file_path = 'ytest.txt'
    np.savetxt(fname=y_final_validate_file_path, X=y_final, fmt='%i')