Example No. 1
    def train(self, trainDF, validateDF):
        print("+++++++++++++++++++++Training model...")
        print("Remove non trainable features...")
        self.xTrain = trainDF
        self.yTrain = trainDF[self.yColDiscrete]
        if ('relevance_int' in self.xTrain):
            self.xTrain = self.xTrain.drop('relevance_int', axis=1)

        print("OneHot encoding")
        self.xTrain = pd.get_dummies(self.xTrain, sparse=True)
        self.xTrain = scipy.sparse.csc_matrix(self.xTrain)

        fm = SGDFMClassification(n_iter=1000,
                                 rank=16,
                                 l2_reg_w=0.0005,
                                 l2_reg_V=0.0005,
                                 l2_reg=0.0005,
                                 step_size=0.01)
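        # OneVsRestClassifier fits one copy of the (presumably binary) FM
        # classifier per class, turning it into a multi-class model.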
        self._model = OneVsRestClassifier(fm)

        self.fittedModel = self._model.fit(self.xTrain, self.yTrain)
        self.yPred = self.fittedModel.predict(self.xTrain)

        print("Converting to old labels")
        dp = DataPreprocessing.DataPreprocessing()
        self.yTrain = dp.transformNewLabelToOld(self.yTrain.as_matrix())
        self.yPred = dp.transformNewLabelToOld(self.yPred)
        print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
        print("self.yPred:", self.yPred.shape, self.yPred[1:100, ])

        print("MSE:", mean_squared_error(self.yTrain, self.yPred))
        print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
        print("+++++++++++++++++++++Training completed")
Example No. 2
def drawGaze(x, y, gazeType, height_window):
    frame = create_blank(len(x), height_window, (255, 255, 255))
    # draw state
    lastChunk_b = -1
    while lastChunk_b < len(gazeType) - 1:
        chunk_f, chunk_b, c_type = DataPreprocessing.nextChunk(
            lastChunk_b, gazeType)
        if c_type == 1:
            #saccade -- yellow
            frame[:, chunk_f:chunk_b + 1] = [255, 255, 0]
        if c_type == 4:
            #pursuit -- green
            frame[:, chunk_f:chunk_b + 1] = [0, 255, 255]
        if c_type == 3:
            #noise -- red
            frame[:, chunk_f:chunk_b + 1] = [255, 0, 0]

        # update
        lastChunk_b = chunk_b
    # draw x, y
    for index in range(0, len(x)):
        width = 1680
        height = 1050
        x_p = int((x[index] / width) * height_window)
        y_p = int((y[index] / height) * height_window)
        cv2.circle(frame, (index, -1 * (x_p - height_window)), 1,
                   (80, 50, 200), -1)
        cv2.circle(frame, (index, -1 * (y_p - height_window)), 1, (0, 0, 0),
                   -1)
    return frame
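A minimal usage sketch (hypothetical variable names; assumes cv2 is imported and the gaze arrays are already loaded):

# Hypothetical usage: render the gaze trace into a 400-pixel-high image
frame = drawGaze(x_gaze, y_gaze, gaze_type, height_window=400)
cv2.imshow('gaze', frame)
cv2.waitKey(0)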
Example No. 3
def get_result_from_data(data_dir, result_dir, dp_dir):
    """
    Get result from data
    :param data_dir: the pathname of the data directory
    :param result_dir: the pathname of the result directory
    :param dp_dir: the pathname of the DataPreprocessing module directory
    :return:
    """

    # Add code_dir folder
    sys.path.append(dp_dir)

    # Import the DataPreprocessing module
    import DataPreprocessing
    # Get the DataPreprocessing object
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match data files with names files
    data_names = dp.match_data_names()

    # The parallel pipelines for data preprocessing, training, testing, and evaluating the ALA classifier
    # n_jobs=-1 means all CPUs are used
    # Set backend="multiprocessing" (default) to prevent sharing memory between parent and threads
    Parallel(n_jobs=10)(
        delayed(pipeline)(dp, data_file, names_file, result_dir)
        for data_file, names_file in data_names)
Example No. 4
def clustering(clust, filenames, saved=False):
    #mergeTitle(df, filename2)
    if saved:
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
    else:
        data, results = dp.getDataForClustering(filenames, clust)
        #TODO divide data into training and testing datasets
        clust['n_samples'] = len(data)
        print 'total instances:', clust['n_samples']
        testing_num = int(clust['n_samples'] * 0.2)
        #testing_num = 1924500
        results['quest_id'] = results['quest_id'][
            testing_num:clust['n_samples']]
        results['time_row'] = results['time_row'][
            testing_num:clust['n_samples']]
        print 'testing instances: ', str(testing_num)  # 385981
        print 'Started clustering...'
        #clusters, stats = clusterData(data, clust, results, False)
        clusters, stats = clusterData(data[testing_num:clust['n_samples']],
                                      clust, results, False)
        print 'Saving the clustering results...'
        csr.to_csv1(stats, filenames['stats'])
        clusters.to_csv(filenames['clusters'])
    return stats, clusters
Example No. 5
def postProcessingUndefineRefine(gazeType):
    refineGazeType = []
    # chunk by chunk
    lastChunk_b = -1
    lastChunkType = -5  # just for initialization
    while lastChunk_b < len(gazeType) - 1:
        chunk_f, chunk_b, c_type = DataPreprocessing.nextChunk(
            lastChunk_b, gazeType)
        if c_type == 0:
            lastChunkType = 0
            for i in range(chunk_f, chunk_b + 1):
                refineGazeType.append(0)
        elif c_type == 1:
            lastChunkType = 1
            for i in range(chunk_f, chunk_b + 1):
                refineGazeType.append(1)
        elif c_type == 3:
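            # A short noise chunk (<= 6 samples) that follows a saccade (or the
            # start of the sequence) and is immediately followed by a saccade
            # sample is relabeled as saccade; otherwise it stays marked as noise.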
            length = chunk_b - chunk_f + 1
            if lastChunkType == -5 or lastChunkType == 1:
                # peek next chunk
                nextChunkStartIndex = chunk_b + 1
                if nextChunkStartIndex < len(gazeType) - 1 and gazeType[
                        nextChunkStartIndex] == 1 and length <= 6:
                    for i in range(chunk_f, chunk_b + 1):
                        refineGazeType.append(1)
                        lastChunkType = 1
                else:
                    for i in range(chunk_f, chunk_b + 1):
                        refineGazeType.append(3)
                        lastChunkType = 3
        lastChunk_b = chunk_b
    return refineGazeType
Example No. 6
def filterOutShortFixtion(rt_gaze, gazeType):
    refineGazeType = []
    # chunk by chunk
    lastChunk_b = -1
    while lastChunk_b < len(gazeType) - 1:
        chunk_f, chunk_b, c_type = DataPreprocessing.nextChunk(
            lastChunk_b, gazeType)

        if c_type == 0:
            dur = -1
            if chunk_f != 0:
                dur = rt_gaze[chunk_b] - rt_gaze[chunk_f - 1]
            else:
                dur = rt_gaze[chunk_b] - rt_gaze[chunk_f]

            if dur < 100:
                for i in range(chunk_f, chunk_b + 1):
                    refineGazeType.append(
                        3)  # too short. neither saccade nor fixation
            else:
                for i in range(chunk_f, chunk_b + 1):
                    refineGazeType.append(0)  # fixation
        else:
            # determine as saccade
            for i in range(chunk_f, chunk_b + 1):
                refineGazeType.append(gazeType[i])  # keep same
        # update
        lastChunk_b = chunk_b
    return refineGazeType
Example No. 7
def display_data_analysis(master_frame):
    """
    Creates and displays the data analysis tab of the notebook.
    :param master_frame: the parent frame of the page
    """
    data_analysis = create_note_page(master_frame, "Data Analysis")

    # Create and display the plots
    corr_and_outlier = dpp.create_plots()
    corr_and_outlier_frame = ttk.Frame(data_analysis)
    corr_and_outlier_frame.pack(side='left', padx=(100, 0))
    display_plot(corr_and_outlier_frame, corr_and_outlier)

    # Create and display the text describing the plots
    text_frame = ttk.Frame(data_analysis)
    data_analysis_description = 'The raw data is analyzed using correlation coefficient between the price of the ' \
                                'house and the other features in the data. The higher the number the more important ' \
                                'the feature.' \
                                '\n' \
                                '\n' \
                                'Next the outliers in the price data are graphed so that the KMeans learning model ' \
                                'can be filtered properly.'

    create_description(text_frame, data_analysis_description)
    text_frame.pack(side='left', padx=(100, 0))
Example No. 8
def main():
    # TrainData: Includes the item outlet sales for each item
    # This data will be used to train and test our ML algorithm to predict item outlet sales
    TrainData = pd.read_csv('../res/TrainData.txt', sep=",")

    # TestData is the data without any item outlet sales
    # This data is used to evaluate our ML algorithm
    TestData = pd.read_csv('../res/TestData.txt', sep=",")

    # Prepare train and test data to be used in our ML algorithm
    x_train, x_test, y_train, y_test = DataPreprocessing.PrepareData(TrainData)

    #Build model
    Model = ModelMgr.BuildModel(x_train)
    Model.summary()

    ModelCallBack = ModelMgr.PrintDot(x_train.shape[0])

    #Train model
    history = Model.fit(x_train,
                        y_train,
                        batch_size=Config.BATCH_SIZE,
                        epochs=Config.EPOCHS,
                        validation_split=Config.VALIDATION_SPLIT,
                        verbose=0,
                        callbacks=[ModelCallBack])

    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    ModelMgr.plot_history(history)
Example No. 9
    def getData(self, indexs, hasHeader, needHandleNegativeOneIndex, flag):
        data = []
        columns = defaultdict(list)  # each value in each column is appended to a list

        with open(self.fileName, encoding='Latin-1') as f:
            reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_NONE)  # read each row as a list of values
            if hasHeader == 1:
                next(reader)
                next(reader)
            for row in reader:
                for (i, v) in enumerate(row):
                    columns[i].append(v)
        for j in indexs:
            newColumns = columns[j]
            if j in needHandleNegativeOneIndex:

                newColumns = DataPreprocessing.DataPreprocessing().handleNegativeOneV2([float(i) for i in newColumns], flag = False)
            data.append(newColumns)
        # Truncate all columns to the length of the shortest one
        allLengths = []
        for i in range(0, len(data)):
            allLengths.append(len(data[i]))
        minLength = np.array(allLengths).min()
        for i in range(0, len(data)):
            data[i] = data[i][0:minLength]
        return data
Example No. 10
    def train_model(self):
        adam = Adam(lr=0.001)
        self.training_model.compile(
            loss=self.bidirectional_ranking_loss,
            optimizer=adam,
            metrics=[self.accuracy_image, self.accuracy_text])

        checkpoints = keras.callbacks.ModelCheckpoint(
            "./Models/training_model.h5",
            monitor='val_loss',
            verbose=0,
            save_best_only=True,
            save_weights_only=False,
            mode='auto',
            period=1)

        data_preprocessing = DataPreprocessing.DataProcessor(
            self.bow, self.image_features)
        [a_im, a_txt, n_im,
         n_txt], Y_train, X_val, Y_val = data_preprocessing.return_data()

        for epoch in range(self.nb_epochs):
            np.random.shuffle(n_im)
            np.random.shuffle(n_txt)
            self.training_model.fit([a_im, a_txt, n_im, n_txt],
                                    Y_train,
                                    validation_data=[X_val, Y_val],
                                    epochs=1,
                                    batch_size=100,
                                    callbacks=[checkpoints])

        self.image_model.save_weights("./Models/image_model.h5")
        self.text_model.save_weights("./Models/text_model.h5")
Example No. 11
    def preprocessing(self):
        data_base = DataPreprocessing.DataPeprocessing(self.db_file_name, self.processed_data_file_name)
        """
        1. 'MeanShift' - default
        2. 'Uniform Distribution'
        3. 'Equal Steps'
        """
        data_base.data_preprosessing(PreprocessingTypes.UNIFORM_DISTRIBUTION)
Example No. 12
    def predictSingle(self, songFile):
        data_base = DataPreprocessing.DataPeprocessing(self.db_file_name, self.processed_data_file_name)
        data_base.data_preprosessing(PreprocessingTypes.EQUAL_STEPS)
        self.DAG = self.convertFileToDAG()
        bn = BayesianNetwork.BN(self.DAG)
        res = bn.BNForOneSong(self.DAG, self.processed_data_file_name, self.predicted_results_file_name, songFile)
        print(res)
        return res
Example No. 13
def main():
    DP = DataPreprocessing()
    UnderExposedImages, OverExposedImages = DP.ConstructDataset()
    UnderExposedYCbCrImages = [[], [], []]
    OverExposedYCbCrImages = [[], [], []]
    with tf.Session() as sess:
        for img in UnderExposedImages:
            Image_Array = img.eval()
            #Image_Array = Image_Array / 255 # normalization
            YImage, CbImage, CrImage = rgb_to_ycbcr(Image_Array)

            UnderExposedYCbCrImages[0].append(YImage)
            UnderExposedYCbCrImages[1].append(CbImage)
            UnderExposedYCbCrImages[2].append(CrImage)
            print("test")
        # Image.fromarray(np.asarray(OverExposedImages[0].eval())).show()
    return
Example No. 14
def extractFeaturesML(db_name, DIR):
    key_users = ['UserId']
    key_posts = ['PostId']
    # extracts all features
    extractor.extractForML(db_name, DIR)
    dfML = mf.mergeAll(DIR, key_users, key_posts, FeaturePredict)
    dfML.to_csv(DIR + 'dataML.csv', index=False)
    dfnorm = dp.normalize(dfML)
    return dfnorm
Example No. 16
    def train(self, trainDF, svalidateDF):
        self._model = LogisticRegression(penalty='l2',
                                         dual=False,
                                         tol=0.0001,
                                         C=1.0,
                                         fit_intercept=True,
                                         intercept_scaling=1,
                                         class_weight=None,
                                         random_state=None,
                                         solver='sag',
                                         max_iter=10000,
                                         multi_class='multinomial',
                                         verbose=1,
                                         warm_start=False,
                                         n_jobs=-1)
        print("+++++++++++++++++++++Training model...")

        print("Remove non trainable features...")
        self.xTrain = trainDF
        self.yTrain = trainDF[self.yColDiscrete]
        # self.xValidate=validateDF
        # self.yValidate=validateDF[self.yColDiscrete]

        # self.xTrain.drop('search_term', axis=1, inplace=True)
        # self.xTrain.drop('relevance', axis=1, inplace=True)
        if ('relevance_int' in self.xTrain):
            self.xTrain = self.xTrain.drop('relevance_int', axis=1)
        # self.xTrain.drop('product_idx', axis=1, inplace=True)
        # self.xTrain.drop('Word2VecQueryExpansion', axis=1, inplace=True)

        # self.xValidate.drop('search_term', axis=1, inplace=True)
        # self.xValidate.drop('relevance', axis=1, inplace=True)
        # self.xValidate.drop('relevance_int', axis=1, inplace=True)
        # self.xValidate.drop('product_idx', axis=1, inplace=True)
        # self.xValidate.drop('Word2VecQueryExpansion', axis=1, inplace=True)

        print("+++++++++++++++++++++Training in progress")
        # print("self.xTrain:",list(self.xTrain))
        # print("self.yTrain:", list(self.yTrain))
        fittedModel = self._model.fit(self.xTrain, self.yTrain)
        self.yPred = fittedModel.predict(self.xTrain)
        # print("self.yPred:", list(self.yPred))

        print("Converting to old labels")
        dp = DataPreprocessing.DataPreprocessing()
        self.yTrain = dp.transformNewLabelToOld(self.yTrain.as_matrix())
        self.yPred = dp.transformNewLabelToOld(self.yPred)
        print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
        print("self.yPred:", self.yPred.shape, self.yPred[1:50, ])

        print("MSE:", mean_squared_error(self.yTrain, self.yPred))
        print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
        # print("Accuracy:", accuracy_score(self.yTrain, self.yPred))
        # print("Precision:", precision_score(self.yTrain, self.yPred, average='micro'))
        # print("Recall:", recall_score(self.yTrain, self.yPred, average='micro'))
        # print("F1:", f1_score(self.yTrain, self.yPred, average='micro'))
        print("+++++++++++++++++++++Training completed")
Example No. 17
def MSLR():
    print('call functions MSLR')

    # Data Preprocessing
    X, y = dp.readData('50_Startups.csv')
    X = dp.EncodeData(X, 'State')
    X_train, y_train, X_test, y_test = dp.splitData(X, y)

    #print(X_train)
    #print(y_train)
    # Apply multiple linear regression
    regressor = sp.trainModel(X_train, y_train)

    #sp.showTheta()
    #sp.drawTrainSet(X_train, y_train)
    yt = sp.predictMultiTest(regressor, X_test, y_test)
    print(X_test)

    print(yt)
    sp.drawTestSet(X_test[:, 3], yt)
Example No. 18
def SLR():
    print('call functions SLR')

    # Data Preprocessing
    X, y = dp.readData('Salary_Data.csv')
    print(np.array(X))
    print(np.array(y))

    X_array = np.array(X)

    y_array = np.array(y)
    print(X_array)
    print(y_array)
    np.set_printoptions(2)
    print(
        np.concatenate(
            (X_array.reshape(len(X_array), 1), y_array.reshape(
                len(y_array), 1)), 1))

    m = len(X_array)
    theta0Array = np.ones((m, 1))
    print(np.concatenate((theta0Array, X_array), 1))

    X_train, y_train, X_test, y_test = dp.splitData(X, y)

    #print(X_train)
    #print(y_train)
    # Apply simple linear regression
    regressor = sp.trainModel(X_train, y_train)

    #pfilename = 'C:\self\salarypredictor.pkl'
    pfilename = 'salarypredictor.pkl'
    with open(pfilename, 'wb') as file:
        pickle.dump(regressor, file)

    # with open(pfilename, 'rb') as file:
    #     regressor = pickle.load(file)

    #sp.showTheta()
    #sp.drawTrainSet(X_train, y_train)
    sp.drawTestSet(X_test, sp.predictTest(regressor, X_test))
Example No. 19
def query_rewrite(text: str, k: int):

    # First, rewrite each word in the query text. Replace the out-of-vocab words with
    # their nearest words in embedding using KNN of ball-tree.
    text_list = re.split('[\\\\+\-#/,|;-?*$%()\[\]\s]', text)  #  clean data firstly
    text_list = [NN.word_rewrite(w) for w in text_list]
    text = DataPreprocessing.list2str(text_list)
    text = DataPreprocessing.list2str(DataPreprocessing.stemming(text))
    print(text)
    #  Second, add this query to our corpus and calculate the TF-IDF value

    corpus.append(text)
    corpus_array = np.array(corpus)
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)

    #  Third, calculate the cosine closest concepts of the query. Return the top K concepts.
    cosine_similarities = linear_kernel(tfidf[-1:], tfidf).flatten()
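    # argsort is ascending, so the reversed slice [:-k-1:-1] picks the indices
    # of the k largest cosine similarities in descending order.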
    related_docs_indices = cosine_similarities.argsort()[:-k-1:-1]

    return corpus_array[related_docs_indices]
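A minimal call sketch (the query string is illustrative):

# Hypothetical call: rewrite a noisy query and fetch the 5 closest concepts
top_concepts = query_rewrite('machne lerning algorithm', k=5)
print(top_concepts)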
Example No. 20
def classify_ins():
    ins_li = [[], [], []]
    for ins in DataPreprocessing.load_instance_json():
        name = ins['name']
        ins_id = int(name[6:8])
        if 1 <= ins_id <= 18:
            ins_li[0].append(ins)
        elif 19 <= ins_id <= 36:
            ins_li[1].append(ins)
        elif 37 <= ins_id <= 54:
            ins_li[2].append(ins)
    return ins_li
Example No. 21
def _iterate_videos_from_pickle(files, normalize=False, use_first_hundred=False):
    # This is a generator function. It iterates over the data
    # and returns the frames of one video at a time with each call.
    # To-Do: implement batchsize
    
    for file in files:
            
        data = pickle.load(open(file,'rb'))
        
        index = 0
        N = data['targets'].shape[0]
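        # 'index' tracks the offset of the current video's first frame inside
        # the flat 'data' array; it is advanced by the video length after each yield.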
        
        for i in range(0,N):
            
            if normalize:
                if use_first_hundred:
                    if data['video_frames'][i] > 100:
                        inputs = DP.normalize(data['data'][index:index+100])
                    else: 
                        inputs = DP.normalize(data['data'][index:index+data['video_frames'][i]])
                else:
                    inputs = DP.normalize(data['data'][index:index+data['video_frames'][i]])
            else:
                if use_first_hundred:
                    if data['video_frames'][i] > 100:
                        inputs = torch.from_numpy(np.array(data['data'][index:index+100]))
                    else:
                        inputs = torch.from_numpy(np.array(data['data'][index:index+data['video_frames'][i]]))
                else:
                    inputs = torch.from_numpy(np.array(data['data'][index:index+data['video_frames'][i]]))
            
            targets = np.array([data['targets'][i]])   
            targets = torch.from_numpy(targets).type(torch.LongTensor)
            
            yield inputs, targets
            
            index += data['video_frames'][i]
Example No. 22
def test(model=None):
    print('Predicting labels in test sentences...')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    if model is None:
        model = BertForSeqClassification()
        model.load_state_dict(
            torch.load(
                (utils.cfg.get('PRETRAIN_MODEL', 'fine_tuned_bert_path') +
                 '/pytorch_model.bin')))
        model.to(device)

        for param_tensor in model.state_dict():
            print(param_tensor, "\t", model.state_dict()[param_tensor].size())

    tokenizer = transformers.BertTokenizer.from_pretrained(
        utils.cfg.get('PRETRAIN_MODEL', 'fine_tuned_bert_path'))
    model.eval()

    test_set = dp.TestDataset()
    ul = Dataset(test_set, tokenizer)
    predict_dataloader = ul.get_dataloader(is_super=False)

    predictions = []
    for batch in tqdm.tqdm(predict_dataloader):
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)

    # bayes
    # train_label = dp.LabeledDataset()
    # hashtag = ht.Hashtag(train_label=True, test=test_set)
    # sentiment_bayes = st.SentimentTime(test=test_set)
    # predictions = hashtag.bayes(predictions)
    # predictions = sentiment_bayes.bayes(predictions, 1)

    predict_labels = []
    for i in range(len(predictions)):
        predict_labels.append(
            np.argmax(predictions[i], axis=1).flatten().tolist())
    test_set.fill_result(list(
        itertools.chain(*predict_labels)))  # merge the per-batch lists into one list
    test_set.submit()
    print('    DONE.')
Example No. 23
def _iterate_videos(data):
    # This is a generator function. It iterates over the data
    # and returns the frames of one video at a time with each call.
    # To-Do: implement batchsize
    
    i = 0
    N = data['targets'].shape[0]
    for i in range(0,N):
        inputs = torch.from_numpy(np.array(DP.normalize(data['data'][i])))
        targets = np.array([data['targets'][i]])
        targets = torch.from_numpy(targets).type(torch.LongTensor)

        #yield np.ndarray(data['data'][index:index+data['video_frames'][i]]), np.ndarray(data['targets'][i])
        yield inputs, targets
Example No. 24
def main():
    data = load_dataCSV()
    
    look_back = 28
    jump=4
    
    train_data, test_data = dp.rescale_data(data)
    trainX, trainY = dp.create_dataset(train_data, look_back)
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX, testY = dp.create_dataset(test_data, look_back)  
    
    model = mod.getModel(look_back)
    model.fit(
        trainX,
        trainY,
        batch_size=128,
        nb_epoch=300,
        validation_split=0.10)
    
    pred,perfs=mod.testModel(model,testX,testY,jump,look_back)
    
    actual_test_data=test_data[len(test_data)-len(pred):]

    
    print("\n Average Covarance between predicted and actual prices on only predicted days:")
    print(np.mean(perfs))
    
    print("\n Covarance between predicted and actual prices on all days:")    
    print(np.cov(actual_test_data,pred)[1][0])
    
    plt.figure(3)
    plt.plot(actual_test_data)
    
    plt.figure(4)
    plt.plot(pred)
    
    mod.saveModel(model,'lstm3')
Example No. 25
def main():
    data = load_dataCSV()

    look_back = 28
    jump = 4

    train_data, test_data = dp.rescale_data(data)
    trainX, trainY = dp.create_dataset(train_data, look_back)
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX, testY = dp.create_dataset(test_data, look_back)

    savedModel = load_model('lstm3.h5')

    pred, perfs = mod.testModel(savedModel, testX, testY, jump, look_back)

    actual_test_data = test_data[len(test_data) - len(pred):]

    print(
        "\n Average Covariance between predicted and actual prices on only predicted days:"
    )
    print(np.mean(perfs))

    print("\n Covarance between predicted and actual prices on all days:")
    print(np.cov(actual_test_data, pred)[1][0])
Example No. 26
def buildFixationFlagListFromEsacIndex(EsacIndex, rt_gaze, gazeX, gazeY):
    gazeType = []
    for i in range(0, len(rt_gaze)):
        if float(gazeX[i]) == -1.0:
            gazeType.append(2)
        else:
            if i in EsacIndex:
                gazeType.append(1)
            else:
                gazeType.append(0)
    refineGazeType = DataPreprocessing.postProcessingFixationFlagList(
        gazeType, rt_gaze, gazeX, gazeY)
    if len(refineGazeType) != len(gazeType):
        print("Error 1")
        sys.exit()
    return refineGazeType
Example No. 27
def preprocess():
    while True:
        print("Pre-process Menu")
        print("----------------\n")
        print("\tIntroduce the name of the file with the Data stored.")
        print("\tIntroduce 0 if you want to back to main menu.\n")
        fname = raw_input("\t\tIntroduce the name of the original json file: ")
        if fname == str(0):
            print("")
            return
        elif os.path.isfile(fname):
            ext = raw_input(
                "\t\tEnter the extension of the output file (csv, txt, json): "
            )
            language = raw_input(
                "\t\tEnter the code of the language (en, fr, es, de, zh, ja): "
            )
            no_repeated = raw_input(
                "\t\tDo you want to store the non-repeated tweets? (y/n): ")
            print("")
            # To get a shorter name, we will erase the word 'Stream'. However,
            # we need the original name for the first step.
            DataPreprocessing.first_step(directory=directory,
                                         fname=fname,
                                         ext=ext,
                                         language=language)
            fname = fname.replace('Stream', '')
            DataPreprocessing.second_step(directory=directory,
                                          fname=fname,
                                          ext=ext,
                                          language=language)
            if no_repeated == "y":
                DataPreprocessing.third_step(directory=directory,
                                             fname=fname,
                                             ext=ext,
                                             language=language)
            DataPreprocessing.fourth_step(directory=directory,
                                          fname=fname,
                                          ext=ext,
                                          language=language)
            print("\nAction finished.\n")
        else:
            print("\nSorry, file hasn't found.\n")
Example No. 28
    def token_encode_multiprocess(self, tokenizer, sentences):
        n_cores = 10
        start_time = time.time()

        with multiprocessing.Pool(n_cores) as p:
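            # Bind the tokenizer options once with functools.partial, then map
            # the resulting encoder over batches of sentences across the pool's
            # worker processes.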
            token_encode_partial = functools.partial(
                tokenizer.encode,
                add_special_tokens=True,
                max_length=int(
                    utils.cfg.get('HYPER_PARAMETER', 'max_sequence_length')),
                pad_to_max_length=True)
            token_encode_multiprocess_partial = functools.partial(
                self.token_encode, token_encode_partial)
            res = p.map(token_encode_multiprocess_partial,
                        dp.Batch(n_cores, sentences.tolist()))
            res = functools.reduce(lambda x, y: x + y, res)
            print(f'Token IDs obtained, elapsed: {round(time.time() - start_time, 2)}s')

        return res
Example No. 29
    def makeSentenceVector(self, sentence):
        '''
        Convert a single sentence to vector
        '''
        sentence = sentence.replace('.', '')
        senWords = sentence.split(' ')
        if self.model.currentModel == ModelType.Word2Vec:
            wordEmbedding = self.model.embedding
            ps = p.Preprocessing()
            senWords = ps.removeStopword(senWords)
            mat = []
            for i in senWords:
                if i in wordEmbedding:
                    mat.append(wordEmbedding[i])
            mat = np.array(mat)
            return np.mean(mat, axis=0)
        elif self.model.currentModel == ModelType.SelfTrainedDoc2Vec:
            embedding = self.model.embedding
            mat = np.array(embedding.infer_vector(senWords))
            return mat
Example No. 30
def pipeline_all_datasets():
    """
    The pipeline for all data sets
    :return:
    """

    # Add code_dir folder
    sys.path.append(dp_dir)

    # Import DataPreprocessing module
    import DataPreprocessing
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match data files with names file
    data_names = dp.match_data_names()

    # The pipeline for each data set (in parallel)
    # Set backend="multiprocessing" (default) to prevent sharing memory between parent and threads
    Parallel(n_jobs=1)(
        delayed(pipeline_one_dataset)(dp, data_files, names_file)
        for data_files, names_file in data_names)
Example No. 31
def clustering(clust, filenames, saved=False):
    #mergeTitle(df, filename2)
    if saved:
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
    else:
        data, results = dp.getDataForClustering(filenames, clust)
        #TODO divide data into training and testing datasets
        clust['n_samples'] = len(data)
        print 'total instances:', clust['n_samples']
        testing_num = int(clust['n_samples'] * 0.2)
        #testing_num = 1924500
        results['quest_id'] = results['quest_id'][testing_num:clust['n_samples']]
        results['time_row'] = results['time_row'][testing_num:clust['n_samples']]
        print 'testing instances: ', str(testing_num) # 385981
        print 'Started clustering...'
        #clusters, stats = clusterData(data, clust, results, False)
        clusters, stats = clusterData(data[testing_num:clust['n_samples']], clust, results, False)
        print 'Saving the clustering results...'
        csr.to_csv1(stats, filenames['stats'])
        clusters.to_csv(filenames['clusters'])
    return stats, clusters
Example No. 32
    4. Taekwondo
    5. MLB 1870-2016



"""

import numpy as np
import math
import time

import DataPreprocessing
import EnsembleClassifiers
import ModelValidation

DP = DataPreprocessing.DataPreprocessing()
EC = EnsembleClassifiers.EnsembleClassifiers()
MV = ModelValidation.ModelValidation()

#Special Cases Pre-processing

#DP.merge_taekwondo_datasets();
#DP.preprocess_sms_dataset();


def dataset_learning(dataset, output_file, dataset_name, preprocess_time, NT,
                     F, parameters):

    txtfile = open('./Learning_Results/' + output_file + '.txt', 'wb')
    txtfile.write("\n::::::::::::::::::::::::::::")
    txtfile.write("\nRandom Forest Classification")