def train(self, trainDF, validateDF):
    print("+++++++++++++++++++++Training model...")
    print("Remove non trainable features...")
    self.xTrain = trainDF
    self.yTrain = trainDF[self.yColDiscrete]
    if 'relevance_int' in self.xTrain:
        self.xTrain = self.xTrain.drop('relevance_int', axis=1)
    print("OneHot encoding")
    self.xTrain = pd.get_dummies(self.xTrain, sparse=True)
    self.xTrain = scipy.sparse.csc_matrix(self.xTrain)
    fm = SGDFMClassification(n_iter=1000,
                             rank=16,
                             l2_reg_w=0.0005,
                             l2_reg_V=0.0005,
                             l2_reg=0.0005,
                             step_size=0.01)
    self._model = OneVsRestClassifier(fm)
    self.fittedModel = self._model.fit(self.xTrain, self.yTrain)
    self.yPred = self.fittedModel.predict(self.xTrain)
    print("Converting to old labels")
    dp = DataPreprocessing.DataPreprocessing()
    self.yTrain = dp.transformNewLabelToOld(self.yTrain.as_matrix())
    self.yPred = dp.transformNewLabelToOld(self.yPred)
    print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
    print("self.yPred:", self.yPred.shape, self.yPred[1:100, ])
    print("MSE:", mean_squared_error(self.yTrain, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
    print("+++++++++++++++++++++Training completed")
def drawGaze(x, y, gazeType, height_window):
    frame = create_blank(len(x), height_window, (255, 255, 255))
    # draw state
    lastChunk_b = -1
    while lastChunk_b < len(gazeType) - 1:
        chunk_f, chunk_b, c_type = DataPreprocessing.nextChunk(
            lastChunk_b, gazeType)
        if c_type == 1:  # saccade -- yellow
            frame[:, chunk_f:chunk_b + 1] = [255, 255, 0]
        if c_type == 4:  # pursuit -- green
            frame[:, chunk_f:chunk_b + 1] = [0, 255, 255]
        if c_type == 3:  # noise -- red
            frame[:, chunk_f:chunk_b + 1] = [255, 0, 0]
        # update
        lastChunk_b = chunk_b
    # draw x, y
    for index in range(0, len(x)):
        width = 1680
        height = 1050
        x_p = int((x[index] / width) * height_window)
        y_p = int((y[index] / height) * height_window)
        cv2.circle(frame, (index, -1 * (x_p - height_window)), 1,
                   (80, 50, 200), -1)
        cv2.circle(frame, (index, -1 * (y_p - height_window)), 1,
                   (0, 0, 0), -1)
    return frame
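# drawGaze relies on a create_blank helper that is not shown here. A minimal
# sketch under the assumption that it returns a solid-color numpy image
# (hypothetical; the project's real helper may differ):
import numpy as np

def create_blank(width, height, color=(255, 255, 255)):
    # Allocate a height x width image and fill every pixel with `color`.
    frame = np.zeros((height, width, 3), np.uint8)
    frame[:] = color
    return frame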
def get_result_from_data(data_dir, result_dir, dp_dir):
    """
    Get result from data
    :param data_dir: the pathname of the data directory
    :param result_dir: the pathname of the result directory
    :param dp_dir: the pathname of the DataPreprocessing module directory
    :return:
    """
    # Add the DataPreprocessing module directory to the path
    sys.path.append(dp_dir)

    # Import the DataPreprocessing module
    import DataPreprocessing

    # Get the DataPreprocessing object
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match each data file with its names file
    data_names = dp.match_data_names()

    # The parallel pipelines for data preprocessing, training, testing, and evaluating the ALA classifier
    # (n_jobs=-1 would use all CPUs; here it is capped at 10)
    # Set backend="multiprocessing" (the default) to prevent sharing memory between parent and threads
    Parallel(n_jobs=10)(
        delayed(pipeline)(dp, data_file, names_file, result_dir)
        for data_file, names_file in data_names)
def clustering(clust, filenames, saved=False):
    #mergeTitle(df, filename2)
    if saved:
        stats = pd.read_csv(filenames['stats'])
        clusters = pd.read_csv(filenames['clusters'])
    else:
        data, results = dp.getDataForClustering(filenames, clust)
        #TODO divide data into training and testing datasets
        clust['n_samples'] = len(data)
        print 'total instances:', clust['n_samples']
        testing_num = int(clust['n_samples'] * 0.2)
        #testing_num = 1924500
        results['quest_id'] = results['quest_id'][
            testing_num:clust['n_samples']]
        results['time_row'] = results['time_row'][
            testing_num:clust['n_samples']]
        print 'testing instances: ', str(testing_num)  # 385981
        print 'Started clustering...'
        #clusters, stats = clusterData(data, clust, results, False)
        clusters, stats = clusterData(data[testing_num:clust['n_samples']],
                                      clust, results, False)
        print 'Saving the clustering results...'
        csr.to_csv1(stats, filenames['stats'])
        clusters.to_csv(filenames['clusters'])
    return stats, clusters
def postProcessingUndefineRefine(gazeType):
    refineGazeType = []
    # chunk by chunk
    lastChunk_b = -1
    lastChunkType = -5  # just for initialization
    while lastChunk_b < len(gazeType) - 1:
        chunk_f, chunk_b, c_type = DataPreprocessing.nextChunk(
            lastChunk_b, gazeType)
        if c_type == 0:
            lastChunkType = 0
            for i in range(chunk_f, chunk_b + 1):
                refineGazeType.append(0)
        elif c_type == 1:
            lastChunkType = 1
            for i in range(chunk_f, chunk_b + 1):
                refineGazeType.append(1)
        elif c_type == 3:
            length = chunk_b - chunk_f + 1
            if lastChunkType == -5 or lastChunkType == 1:
                # peek next chunk
                nextChunkStartIndex = chunk_b + 1
                if nextChunkStartIndex < len(gazeType) - 1 and gazeType[
                        nextChunkStartIndex] == 1 and length <= 6:
                    for i in range(chunk_f, chunk_b + 1):
                        refineGazeType.append(1)
                    lastChunkType = 1
                else:
                    for i in range(chunk_f, chunk_b + 1):
                        refineGazeType.append(3)
                    lastChunkType = 3
        lastChunk_b = chunk_b
    return refineGazeType
def filterOutShortFixtion(rt_gaze, gazeType):
    refineGazeType = []
    # chunk by chunk
    lastChunk_b = -1
    while lastChunk_b < len(gazeType) - 1:
        chunk_f, chunk_b, c_type = DataPreprocessing.nextChunk(
            lastChunk_b, gazeType)
        if c_type == 0:
            dur = -1
            if chunk_f != 0:
                dur = rt_gaze[chunk_b] - rt_gaze[chunk_f - 1]
            else:
                dur = rt_gaze[chunk_b] - rt_gaze[chunk_f]
            if dur < 100:
                for i in range(chunk_f, chunk_b + 1):
                    refineGazeType.append(3)  # too short: neither saccade nor fixation
            else:
                for i in range(chunk_f, chunk_b + 1):
                    refineGazeType.append(0)  # fixation
        else:  # determined as saccade
            for i in range(chunk_f, chunk_b + 1):
                refineGazeType.append(gazeType[i])  # keep the same
        # update
        lastChunk_b = chunk_b
    return refineGazeType
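# The gaze-refinement functions above all step through DataPreprocessing.nextChunk.
# A minimal sketch of its assumed contract (hypothetical; the real module may
# differ): given the previous chunk's end index, return the first index, last
# index, and label of the next contiguous run of identical gaze-type values.
def nextChunk(lastChunk_b, gazeType):
    chunk_f = lastChunk_b + 1      # the new chunk starts right after the old one
    c_type = gazeType[chunk_f]     # label shared by the whole run
    chunk_b = chunk_f
    while chunk_b + 1 < len(gazeType) and gazeType[chunk_b + 1] == c_type:
        chunk_b += 1               # extend while the label repeats
    return chunk_f, chunk_b, c_type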
def display_data_analysis(master_frame):
    """
    Creates and displays the data analysis tab of the notebook.
    :param master_frame: the parent frame of the page
    """
    data_analysis = create_note_page(master_frame, "Data Analysis")

    # Create and display the plots
    corr_and_outlier = dpp.create_plots()
    corr_and_outlier_frame = ttk.Frame(data_analysis)
    corr_and_outlier_frame.pack(side='left', padx=(100, 0))
    display_plot(corr_and_outlier_frame, corr_and_outlier)

    # Create and display the text describing the plots
    text_frame = ttk.Frame(data_analysis)
    data_analysis_description = 'The raw data is analyzed using the correlation coefficient between the price of ' \
                                'the house and the other features in the data. The higher the number, the more ' \
                                'important the feature.' \
                                '\n' \
                                '\n' \
                                'Next, the outliers in the price data are graphed so that the KMeans learning ' \
                                'model can be filtered properly.'
    create_description(text_frame, data_analysis_description)
    text_frame.pack(side='left', padx=(100, 0))
def main():
    # TrainData includes the item outlet sales for each item.
    # This data will be used to train and test our ML algorithm for predicting item outlet sales.
    TrainData = pd.read_csv('../res/TrainData.txt', sep=",")

    # TestData is the data without any item outlet sales.
    # This data is used to evaluate our ML algorithm.
    TestData = pd.read_csv('../res/TestData.txt', sep=",")

    # Prepare train and test data to be used in our ML algorithm
    x_train, x_test, y_train, y_test = DataPreprocessing.PrepareData(TrainData)

    # Build model
    Model = ModelMgr.BuildModel(x_train)
    Model.summary()
    ModelCallBack = ModelMgr.PrintDot(x_train.shape[0])

    # Train model
    history = Model.fit(x_train,
                        y_train,
                        batch_size=Config.BATCH_SIZE,
                        epochs=Config.EPOCHS,
                        validation_split=Config.VALIDATION_SPLIT,
                        verbose=0,
                        callbacks=[ModelCallBack])

    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    print(hist.tail())

    ModelMgr.plot_history(history)
def getData(self, indexs, hasHeader, needHandleNegativeOneIndex, flag):
    data = []
    columns = defaultdict(list)  # each value in each column is appended to a list

    with open(self.fileName, encoding='Latin-1') as f:
        reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_NONE)
        # read rows into a dictionary format
        if hasHeader == 1:
            # skip the two header rows
            next(reader)
            next(reader)
        for row in reader:
            for (i, v) in enumerate(row):
                columns[i].append(v)

    for j in indexs:
        newColumns = columns[j]
        if j in needHandleNegativeOneIndex:
            newColumns = DataPreprocessing.DataPreprocessing().handleNegativeOneV2(
                [float(i) for i in newColumns], flag=False)
        data.append(newColumns)

    # truncate every column to the shortest column's length (minLength)
    allLengths = []
    for i in range(0, len(data)):
        allLengths.append(len(data[i]))
    minLength = np.array(allLengths).min()
    for i in range(0, len(data)):
        data[i] = data[i][0:minLength]
    return data
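# Example call (a sketch; the column indexes, the header convention, and the
# -1 sentinel handling depend on the concrete CSV being loaded; `loader` is a
# hypothetical instance of the class that owns getData):
# cols = loader.getData(indexs=[0, 3], hasHeader=1,
#                       needHandleNegativeOneIndex=[3], flag=0)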
def train_model(self):
    adam = Adam(lr=0.001)
    self.training_model.compile(
        loss=self.bidirectional_ranking_loss,
        optimizer=adam,
        metrics=[self.accuracy_image, self.accuracy_text])
    checkpoints = keras.callbacks.ModelCheckpoint(
        "./Models/training_model.h5",
        monitor='val_loss',
        verbose=0,
        save_best_only=True,
        save_weights_only=False,
        mode='auto',
        period=1)
    data_preprocessing = DataPreprocessing.DataProcessor(
        self.bow, self.image_features)
    [a_im, a_txt, n_im, n_txt], Y_train, X_val, Y_val = \
        data_preprocessing.return_data()
    for epoch in range(self.nb_epochs):
        # reshuffle the negative image/text examples every epoch
        np.random.shuffle(n_im)
        np.random.shuffle(n_txt)
        self.training_model.fit([a_im, a_txt, n_im, n_txt],
                                Y_train,
                                validation_data=[X_val, Y_val],
                                epochs=1,
                                batch_size=100,
                                callbacks=[checkpoints])
    self.image_model.save_weights("./Models/image_model.h5")
    self.text_model.save_weights("./Models/text_model.h5")
def preprocessing(self):
    data_base = DataPreprocessing.DataPeprocessing(self.db_file_name,
                                                   self.processed_data_file_name)
    # Available preprocessing types:
    #   1. 'MeanShift' - default
    #   2. 'Uniform Distribution'
    #   3. 'Equal Steps'
    data_base.data_preprosessing(PreprocessingTypes.UNIFORM_DISTRIBUTION)
def predictSingle(self, songFile):
    data_base = DataPreprocessing.DataPeprocessing(self.db_file_name,
                                                   self.processed_data_file_name)
    data_base.data_preprosessing(PreprocessingTypes.EQUAL_STEPS)
    self.DAG = self.convertFileToDAG()
    bn = BayesianNetwork.BN(self.DAG)
    res = bn.BNForOneSong(self.DAG, self.processed_data_file_name,
                          self.predicted_results_file_name, songFile)
    print(res)
    return res
def main():
    DP = DataPreprocessing()
    UnderExposedImages, OverExposedImages = DP.ConstructDataset()
    UnderExposedYCbCrImages = [[], [], []]
    OverExposedYCbCrImages = [[], [], []]
    with tf.Session() as sess:
        for img in UnderExposedImages:
            Image_Array = img.eval()
            #Image_Array = Image_Array / 255  # normalization
            YImage, CbImage, CrImage = rgb_to_ycbcr(Image_Array)
            UnderExposedYCbCrImages[0].append(YImage)
            UnderExposedYCbCrImages[1].append(CbImage)
            UnderExposedYCbCrImages[2].append(CrImage)
            print("test")
        # Image.fromarray(np.asarray(OverExposedImages[0].eval())).show()
    return
def extractFeaturesML(db_name, DIR):
    key_users = ['UserId']
    key_posts = ['PostId']
    # extracts all features
    extractor.extractForML(db_name, DIR)
    dfML = mf.mergeAll(DIR, key_users, key_posts, FeaturePredict)
    dfML.to_csv(DIR + 'dataML.csv', index=False)
    dfnorm = dp.normalize(dfML)
    return dfnorm
def train(self, trainDF, validateDF):
    self._model = LogisticRegression(penalty='l2',
                                     dual=False,
                                     tol=0.0001,
                                     C=1.0,
                                     fit_intercept=True,
                                     intercept_scaling=1,
                                     class_weight=None,
                                     random_state=None,
                                     solver='sag',
                                     max_iter=10000,
                                     multi_class='multinomial',
                                     verbose=1,
                                     warm_start=False,
                                     n_jobs=-1)
    print("+++++++++++++++++++++Training model...")
    print("Remove non trainable features...")
    self.xTrain = trainDF
    self.yTrain = trainDF[self.yColDiscrete]
    # self.xValidate = validateDF
    # self.yValidate = validateDF[self.yColDiscrete]
    # self.xTrain.drop('search_term', axis=1, inplace=True)
    # self.xTrain.drop('relevance', axis=1, inplace=True)
    if 'relevance_int' in self.xTrain:
        self.xTrain = self.xTrain.drop('relevance_int', axis=1)
    # self.xTrain.drop('product_idx', axis=1, inplace=True)
    # self.xTrain.drop('Word2VecQueryExpansion', axis=1, inplace=True)
    # self.xValidate.drop('search_term', axis=1, inplace=True)
    # self.xValidate.drop('relevance', axis=1, inplace=True)
    # self.xValidate.drop('relevance_int', axis=1, inplace=True)
    # self.xValidate.drop('product_idx', axis=1, inplace=True)
    # self.xValidate.drop('Word2VecQueryExpansion', axis=1, inplace=True)
    print("+++++++++++++++++++++Training in progress")
    # print("self.xTrain:", list(self.xTrain))
    # print("self.yTrain:", list(self.yTrain))
    fittedModel = self._model.fit(self.xTrain, self.yTrain)
    self.yPred = fittedModel.predict(self.xTrain)
    # print("self.yPred:", list(self.yPred))
    print("Converting to old labels")
    dp = DataPreprocessing.DataPreprocessing()
    self.yTrain = dp.transformNewLabelToOld(self.yTrain.as_matrix())
    self.yPred = dp.transformNewLabelToOld(self.yPred)
    print("self.yTrain:", self.yTrain.shape, self.yTrain[1:50, ])
    print("self.yPred:", self.yPred.shape, self.yPred[1:50, ])
    print("MSE:", mean_squared_error(self.yTrain, self.yPred))
    print("RMSE:", sqrt(mean_squared_error(self.yTrain, self.yPred)))
    # print("Accuracy:", accuracy_score(self.yTrain, self.yPred))
    # print("Precision:", precision_score(self.yTrain, self.yPred, average='micro'))
    # print("Recall:", recall_score(self.yTrain, self.yPred, average='micro'))
    # print("F1:", f1_score(self.yTrain, self.yPred, average='micro'))
    print("+++++++++++++++++++++Training completed")
def MSLR():
    print('call functions MSLR')
    # Data Preprocessing
    X, y = dp.readData('50_Startups.csv')
    X = dp.EncodeData(X, 'State')
    X_train, y_train, X_test, y_test = dp.splitData(X, y)
    #print(X_train)
    #print(y_train)

    # Apply multiple linear regression
    regressor = sp.trainModel(X_train, y_train)
    #sp.showTheta()
    #sp.drawTrainSet(X_train, y_train)
    yt = sp.predictMultiTest(regressor, X_test, y_test)
    print(X_test)
    print(yt)
    sp.drawTestSet(X_test[:, 3], yt)
def SLR():
    print('call functions SLR')
    # Data Preprocessing
    X, y = dp.readData('Salary_Data.csv')
    print(np.array(X))
    print(np.array(y))
    X_array = np.array(X)
    y_array = np.array(y)
    print(X_array)
    print(y_array)
    np.set_printoptions(2)
    print(
        np.concatenate(
            (X_array.reshape(len(X_array), 1), y_array.reshape(
                len(y_array), 1)), 1))
    m = len(X_array)
    theta0Array = np.ones((m, 1))
    print(np.concatenate((theta0Array, X_array), 1))
    X_train, y_train, X_test, y_test = dp.splitData(X, y)
    #print(X_train)
    #print(y_train)

    # Apply simple linear regression
    regressor = sp.trainModel(X_train, y_train)
    #pfilename = 'C:\self\salarypredictor.pkl'
    pfilename = 'salarypredictor.pkl'
    with open(pfilename, 'wb') as file:
        pickle.dump(regressor, file)
    # with open(pfilename, 'rb') as file:
    #     regressor = pickle.load(file)
    #sp.showTheta()
    #sp.drawTrainSet(X_train, y_train)
    sp.drawTestSet(X_test, sp.predictTest(regressor, X_test))
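# Both SLR and MSLR call dp.splitData, which (judging from the call sites) is
# assumed to wrap sklearn's train_test_split and return the splits in
# (X_train, y_train, X_test, y_test) order. A minimal sketch under that
# assumption (hypothetical; the real helper may differ):
def splitData(X, y, test_size=0.2, random_state=0):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    return X_train, y_train, X_test, y_test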
def query_rewrite(text: str, k: int):
    # First, rewrite each word in the query text: replace the out-of-vocab words
    # with their nearest words in the embedding using KNN on a ball tree.
    text_list = re.split('[\\\\+\-#/,|;-?*$%()\[\]\s]', text)  # clean the data first
    text_list = [NN.word_rewrite(w) for w in text_list]
    text = DataPreprocessing.list2str(text_list)
    text = DataPreprocessing.list2str(DataPreprocessing.stemming(text))
    print(text)

    # Second, add this query to our corpus and calculate the TF-IDF values
    corpus.append(text)
    corpus_array = np.array(corpus)
    vectorizer = TfidfVectorizer()
    tfidf = vectorizer.fit_transform(corpus)

    # Third, calculate the cosine-closest concepts to the query and return the top k
    cosine_similarities = linear_kernel(tfidf[-1:], tfidf).flatten()
    related_docs_indices = cosine_similarities.argsort()[:-k - 1:-1]
    return corpus_array[related_docs_indices]
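# Example usage (a sketch; assumes the module-level `corpus` list and the `NN`
# ball-tree rewriter have already been built elsewhere in this module):
# top_concepts = query_rewrite('mashine lerning algorithm', k=5)
# print(top_concepts)  # the 5 corpus concepts closest to the rewritten query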
def classify_ins():
    # bucket the instances into three groups by the numeric id in their name
    ins_li = [[], [], []]
    for ins in DataPreprocessing.load_instance_json():
        name = ins['name']
        ins_id = int(name[6:8])
        if ins_id in range(1, 19):
            ins_li[0].append(ins)
        elif ins_id in range(19, 37):
            ins_li[1].append(ins)
        elif ins_id in range(37, 55):
            ins_li[2].append(ins)
    return ins_li
def _iterate_videos_from_pickle(files, normalize=False, use_first_hundred=False):
    # This is a generator function. It iterates over the data
    # and returns the frames of one video at a time with each call.
    # To-Do: implement batchsize
    for file in files:
        data = pickle.load(open(file, 'rb'))
        index = 0
        N = data['targets'].shape[0]
        for i in range(0, N):
            n_frames = data['video_frames'][i]
            if use_first_hundred and n_frames > 100:
                n_frames = 100  # cap each video at its first hundred frames
            chunk = data['data'][index:index + n_frames]
            if normalize:
                inputs = DP.normalize(chunk)
            else:
                inputs = torch.from_numpy(np.array(chunk))
            targets = np.array([data['targets'][i]])
            targets = torch.from_numpy(targets).type(torch.LongTensor)
            yield inputs, targets
            # advance past the full video, even when only 100 frames were used
            index += data['video_frames'][i]
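# A sketch of consuming the generator above in a PyTorch training loop; the
# `model`, `criterion`, and `optimizer` names are placeholders and not part of
# this module:
# for inputs, targets in _iterate_videos_from_pickle(files, normalize=True):
#     optimizer.zero_grad()
#     loss = criterion(model(inputs), targets)
#     loss.backward()
#     optimizer.step()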
def test(model=None):
    print('Predicting labels in test sentences...')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if model is None:
        model = BertForSeqClassification()
        model.load_state_dict(
            torch.load(
                utils.cfg.get('PRETRAIN_MODEL', 'fine_tuned_bert_path') +
                '/pytorch_model.bin'))
    model.to(device)
    for param_tensor in model.state_dict():
        print(param_tensor, "\t", model.state_dict()[param_tensor].size())
    tokenizer = transformers.BertTokenizer.from_pretrained(
        utils.cfg.get('PRETRAIN_MODEL', 'fine_tuned_bert_path'))
    model.eval()
    test_set = dp.TestDataset()
    ul = Dataset(test_set, tokenizer)
    predict_dataloader = ul.get_dataloader(is_super=False)
    predictions = []
    for batch in tqdm.tqdm(predict_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.append(logits)
    # bayes
    # train_label = dp.LabeledDataset()
    # hashtag = ht.Hashtag(train_label=True, test=test_set)
    # sentiment_bayes = st.SentimentTime(test=test_set)
    # predictions = hashtag.bayes(predictions)
    # predictions = sentiment_bayes.bayes(predictions, 1)
    predict_labels = []
    for i in range(len(predictions)):
        predict_labels.append(
            np.argmax(predictions[i], axis=1).flatten().tolist())
    test_set.fill_result(list(
        itertools.chain(*predict_labels)))  # merge the per-batch lists into one list
    test_set.submit()
    print(' DONE.')
def _iterate_videos(data):
    # This is a generator function. It iterates over the data
    # and returns the frames of one video at a time with each call.
    # To-Do: implement batchsize
    N = data['targets'].shape[0]
    for i in range(0, N):
        inputs = torch.from_numpy(np.array(DP.normalize(data['data'][i])))
        targets = np.array([data['targets'][i]])
        targets = torch.from_numpy(targets).type(torch.LongTensor)
        #yield np.ndarray(data['data'][index:index+data['video_frames'][i]]), np.ndarray(data['targets'][i])
        yield inputs, targets
def main():
    data = load_dataCSV()
    look_back = 28
    jump = 4
    train_data, test_data = dp.rescale_data(data)
    trainX, trainY = dp.create_dataset(train_data, look_back)
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX, testY = dp.create_dataset(test_data, look_back)
    model = mod.getModel(look_back)
    model.fit(trainX,
              trainY,
              batch_size=128,
              nb_epoch=300,
              validation_split=0.10)
    pred, perfs = mod.testModel(model, testX, testY, jump, look_back)
    actual_test_data = test_data[len(test_data) - len(pred):]
    print("\n Average covariance between predicted and actual prices on only predicted days:")
    print(np.mean(perfs))
    print("\n Covariance between predicted and actual prices on all days:")
    print(np.cov(actual_test_data, pred)[1][0])
    plt.figure(3)
    plt.plot(actual_test_data)
    plt.figure(4)
    plt.plot(pred)
    mod.saveModel(model, 'lstm3')
def main():
    data = load_dataCSV()
    look_back = 28
    jump = 4
    train_data, test_data = dp.rescale_data(data)
    trainX, trainY = dp.create_dataset(train_data, look_back)
    trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
    testX, testY = dp.create_dataset(test_data, look_back)
    savedModel = load_model('lstm3.h5')
    pred, perfs = mod.testModel(savedModel, testX, testY, jump, look_back)
    actual_test_data = test_data[len(test_data) - len(pred):]
    print("\n Average covariance between predicted and actual prices on only predicted days:")
    print(np.mean(perfs))
    print("\n Covariance between predicted and actual prices on all days:")
    print(np.cov(actual_test_data, pred)[1][0])
def buildFixationFlagListFromEsacIndex(EsacIndex, rt_gaze, gazeX, gazeY):
    gazeType = []
    for i in range(0, len(rt_gaze)):
        if float(gazeX[i]) == -1.0:
            gazeType.append(2)  # invalid sample (gazeX == -1)
        else:
            if i in EsacIndex:
                gazeType.append(1)  # saccade
            else:
                gazeType.append(0)  # fixation
    refineGazeType = DataPreprocessing.postProcessingFixationFlagList(
        gazeType, rt_gaze, gazeX, gazeY)
    if len(refineGazeType) != len(gazeType):
        print("Error: refined gaze-type list length does not match the original")
        sys.exit()
    return refineGazeType
def preprocess():
    while True:
        print("Pre-process Menu")
        print("----------------\n")
        print("\tIntroduce the name of the file with the Data stored.")
        print("\tIntroduce 0 if you want to go back to the main menu.\n")
        fname = raw_input("\t\tIntroduce the name of the original json file: ")
        if fname == str(0):
            print("")
            return
        elif os.path.isfile(fname):
            ext = raw_input(
                "\t\tIntroduce the extension of the output file (csv, txt, json): ")
            language = raw_input(
                "\t\tIntroduce the code of the language (en, fr, es, de, zh, ja): ")
            no_repeated = raw_input(
                "\t\tDo you want to store the non-repeated tweets? (y/n): ")
            print("")
            # With the aim of getting a shorter name, we will erase the word 'Stream'.
            # However, we need the original name in the first step.
            DataPreprocessing.first_step(directory=directory, fname=fname,
                                         ext=ext, language=language)
            fname = fname.replace('Stream', '')
            DataPreprocessing.second_step(directory=directory, fname=fname,
                                          ext=ext, language=language)
            if no_repeated == "y":
                DataPreprocessing.third_step(directory=directory, fname=fname,
                                             ext=ext, language=language)
            DataPreprocessing.fourth_step(directory=directory, fname=fname,
                                          ext=ext, language=language)
            print("\nAction finished.\n")
        else:
            print("\nSorry, the file wasn't found.\n")
def token_encode_multiprocess(self, tokenizer, sentences):
    n_cores = 10
    start_time = time.time()
    with multiprocessing.Pool(n_cores) as p:
        token_encode_partial = functools.partial(
            tokenizer.encode,
            add_special_tokens=True,
            max_length=int(
                utils.cfg.get('HYPER_PARAMETER', 'max_sequence_length')),
            pad_to_max_length=True)
        token_encode_multiprocess_partial = functools.partial(
            self.token_encode, token_encode_partial)
        res = p.map(token_encode_multiprocess_partial,
                    dp.Batch(n_cores, sentences.tolist()))
        res = functools.reduce(lambda x, y: x + y, res)
    print(f'Token IDs obtained, took {round(time.time() - start_time, 2)}s')
    return res
def makeSentenceVector(self, sentence):
    '''
    Convert a single sentence to a vector
    '''
    sentence = sentence.replace('.', '')
    senWords = sentence.split(' ')
    if self.model.currentModel == ModelType.Word2Vec:
        # average the word vectors of the in-vocabulary, non-stopword tokens
        wordEmbedding = self.model.embedding
        ps = p.Preprocessing()
        senWords = ps.removeStopword(senWords)
        mat = []
        for i in senWords:
            if i in wordEmbedding:
                mat.append(wordEmbedding[i])
        mat = np.array(mat)
        return np.mean(mat, axis=0)
    elif self.model.currentModel == ModelType.SelfTrainedDoc2Vec:
        # let the trained Doc2Vec model infer a vector for the whole sentence
        embedding = self.model.embedding
        mat = np.array(embedding.infer_vector(senWords))
        return mat
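# Example usage (a sketch; assumes a wrapper object whose model.currentModel
# and model.embedding were set up beforehand; `vectorizer` is a placeholder):
# vec = vectorizer.makeSentenceVector('the weather is nice today')
# print(vec.shape)  # a single averaged (or inferred) embedding vector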
def pipeline_all_datasets():
    """
    The pipeline for all data sets
    :return:
    """
    # Add the DataPreprocessing module directory to the path
    sys.path.append(dp_dir)

    # Import the DataPreprocessing module
    import DataPreprocessing
    dp = DataPreprocessing.DataPreprocessing(data_dir)

    # Match data files with names files
    data_names = dp.match_data_names()

    # The pipeline for each data set (in parallel)
    # Set backend="multiprocessing" (the default) to prevent sharing memory between parent and threads
    Parallel(n_jobs=1)(
        delayed(pipeline_one_dataset)(dp, data_files, names_file)
        for data_files, names_file in data_names)
4. Taekwondo
5. MLB 1870-2016
"""
import numpy as np
import math
import time

import DataPreprocessing
import EnsembleClassifiers
import ModelValidation

DP = DataPreprocessing.DataPreprocessing()
EC = EnsembleClassifiers.EnsembleClassifiers()
MV = ModelValidation.ModelValidation()

# Special Cases Pre-processing
#DP.merge_taekwondo_datasets();
#DP.preprocess_sms_dataset();


def dataset_learning(dataset, output_file, dataset_name, preprocess_time, NT,
                     F, parameters):
    txtfile = open('./Learning_Results/' + output_file + '.txt', 'wb')
    txtfile.write("\n::::::::::::::::::::::::::::")
    txtfile.write("\nRandom Forest Classification")