def preProcessTestData(self, DataArrays):
    """Turn raw test rows into numeric feature rows.

    Each output row is ``[int(row[3]), seconds_since_midnight, loc_index]``
    where the timestamp is read from ``row[4]`` (split on a space, then on
    ``-`` and ``:``) and the location key is ``row[5]``.
    # NOTE(review): column meanings beyond these indexes are not visible
    # here -- confirm against the CSV schema.

    The location dictionary saved under ``"train_loc_dic"`` is loaded and
    extended with any location that only appears in the test rows.  The
    feature rows are saved as ``"testData_X"`` and the extended dictionary
    as ``"test_loc_dic"``.

    :param DataArrays: iterable of indexable rows (at least 6 fields each).
    :return: ``(X, loc_dic)`` -- the feature rows and the extended
        location -> index dictionary.
    """
    rf = ReadCsvFile()
    loc_dic = rf.ReadValueFromFile("train_loc_dic")

    # preProcessTrainData numbers locations from 1, so len(loc_dic) is an
    # index that is already taken.  Start after the largest stored index so
    # a location first seen in the test data never shares an id with a
    # training location.
    if loc_dic:
        next_index = max(loc_dic.values()) + 1
    else:
        next_index = 1

    X = []
    for Data in DataArrays:
        stamp = Data[4].split(" ")
        date_fields = [int(x) for x in stamp[0].split("-")]
        # time.mktime needs a 9-field tuple (Python 3 rejects a list).
        midnight = time.mktime(tuple(date_fields + [0] * 6))

        # int(float(...)) tolerates fractional fields such as "45.0".
        clock_fields = [int(float(x)) for x in stamp[1].split(":")]
        moment = time.mktime(tuple(date_fields + clock_fields + [0] * 3))

        loc_place = Data[5]
        if loc_place not in loc_dic:
            loc_dic[loc_place] = next_index
            next_index += 1

        # Feature order: row[3], seconds elapsed since midnight, location id.
        X.append([int(Data[3]), moment - midnight, loc_dic[loc_place]])

    wr = WriteResult()
    wr.WriteValueToFile(X, "testData_X")
    wr.WriteValueToFile(loc_dic, "test_loc_dic")
    return X, loc_dic
def threadPredict(model, test_data, test_Xc, start_index, end_index, test_loc_dic):
    """Predict one slice of the test set and append the answers to the result file.

    The slice ``test_Xc[start_index:end_index]`` is passed to ``predict``,
    the raw predictions are mapped through ``transformResult`` with
    ``test_loc_dic``, and the answers for ``test_data[start_index:end_index]``
    are written to ``Config.ResultDataPath + "submission_1.csv"`` while the
    module-level lock ``mu`` is held.  Timing for both phases is printed.

    :param model: fitted model handed to ``predict``.
    :param test_data: raw test rows, sliced with the same bounds for output.
    :param test_Xc: feature rows for the whole test set.
    :param start_index: first row of the slice (inclusive).
    :param end_index: end of the slice (exclusive).
    :param test_loc_dic: location dictionary handed to ``transformResult``.
    """
    start_time = time.time()
    pre_result = predict(model, test_Xc[start_index:end_index])
    end_time = time.time()
    print("predict from %d to %d. \n cost time :%lf s\n"
          % (start_index, end_index, end_time - start_time))

    start_time1 = time.time()
    result = transformResult(pre_result, test_loc_dic)
    if mu.acquire(True):
        # Release in ``finally`` so a failed write cannot leave the lock
        # held and block every other worker.
        try:
            wr = WriteResult()
            wr.WriteResultAnswer(test_data[start_index:end_index], result,
                                 Config.ResultDataPath + "submission_1.csv")
        finally:
            mu.release()
    end_time1 = time.time()
    print("transform index from %d to %d. \n cost time :%lf s\n"
          % (start_index, end_index, end_time1 - start_time1))
def LoadModel(model_name="predict_Model_new"):
    """Return the stored model, training and saving a new one if loading fails.

    The model is first read with ``ReadCsvFile().ReadValueFromFile(model_name)``.
    If that raises, the training file is read, preprocessed and scaled with
    ``PreProcess``, a model is fitted with ``TrainModel`` and the result is
    saved under ``model_name``.

    :param model_name: name passed to ``ReadValueFromFile`` / ``WriteValueToFile``.
    :return: the loaded or freshly trained model.
    """
    rf = ReadCsvFile()
    try:
        model = rf.ReadValueFromFile(model_name)
    except Exception as err:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C / SystemExit
        # are not mistaken for "no stored model" and answered with a retrain.
        logging.warning("could not load model %r (%s); training a new one",
                        model_name, err)
        train_data = rf.ReadTrainFile()
        print(len(train_data))
        preP = PreProcess()
        # Third return value (location dictionary) is not needed here.
        train_X, train_lab, _ = preP.preProcessTrainData(train_data)
        train_Xc = preP.getFeatureScaler(train_X)
        print("Train model")
        model = TrainModel(train_Xc, train_lab)
        logging.info("save model")
        wr = WriteResult()
        wr.WriteValueToFile(model, model_name)
    else:
        print("Load Model success!")
    return model
def main(start_batch=104, batch_size=100):
    """Predict the test set batch by batch and append the answers to result_1.csv.

    Loads (or trains) the model with ``LoadModel``, reads and preprocesses
    the test file, then for every batch: predicts, maps the predictions
    through ``transformResult`` and writes the answers to
    ``Config.ResultDataPath + "result_1.csv"``.  Timing is printed and logged.

    :param start_batch: index of the first batch to process.  The default
        of 104 is the value previously hard-coded in the loop, so rows
        before ``104 * batch_size`` are skipped by default.
        # NOTE(review): looks like a resume point left over from an
        # interrupted run -- pass ``start_batch=0`` for a full run.
    :param batch_size: number of rows per batch (must be positive).
    """
    print("Start......")
    rf = ReadCsvFile()
    preP = PreProcess()
    model = LoadModel()

    test_data = rf.ReadTestFile()
    test_X, test_loc_dic = preP.preProcessTestData(test_data)
    test_Xc = preP.getFeatureScaler(test_X)
    test_X = None  # drop the reference to the unscaled rows
    length = len(test_Xc)
    print("test data len: %s" % length)
    print("predict result ... ... ")

    # Step through the rows one batch at a time.  The old loop ran the batch
    # counter up to the row count, so it kept going with empty slices after
    # the last real batch.
    for start_index in range(start_batch * batch_size, length, batch_size):
        end_index = min(start_index + batch_size, length)

        predict_start = time.time()
        pre_result = predict(model, test_Xc[start_index:end_index])
        predict_cost = time.time() - predict_start
        print("predict from %d to %d. \n cost time :%lf s\n"
              % (start_index, end_index, predict_cost))
        # Logged from its own timer; previously the start time had already
        # been reset for the next phase, giving a wrong (negative) value.
        logging.info("predict from %d to %d. \ncost time :%lf s",
                     start_index, end_index, predict_cost)

        transform_start = time.time()
        result = transformResult(pre_result, test_loc_dic)
        wr = WriteResult()
        wr.WriteResultAnswer(test_data[start_index:end_index], result,
                             Config.ResultDataPath + "result_1.csv")
        # As before, this figure covers both the transform and the write.
        transform_cost = time.time() - transform_start
        logging.info("transform index from %d to %d. \ncost time :%lf s",
                     start_index, end_index, transform_cost)
        print("transform index from %d to %d. \n cost time :%lf s\n"
              % (start_index, end_index, transform_cost))
def preProcessTrainData(self, DataArrays):
    """Turn raw training rows into feature rows, labels and a location index.

    Each feature row is ``[int(row[3]), seconds_since_midnight, loc_index]``
    where the timestamp is read from ``row[4]`` (split on a space, then on
    ``-`` and ``:``) and the location key is ``row[5]``.  The label is the
    index of the location key in ``row[6]``.
    # NOTE(review): column meanings beyond these indexes are not visible
    # here -- confirm against the CSV schema.

    Locations are numbered from 1 in order of first appearance; start and
    destination locations share one dictionary.  The three results are
    saved as ``"trainData_X"``, ``"trainData_Y"`` and ``"train_loc_dic"``.

    :param DataArrays: iterable of indexable rows (at least 7 fields each).
    :return: ``(X, Y, loc_dic)``.
    """
    X = []
    Y = []
    loc_dic = {}
    next_index = 1

    for Data in DataArrays:
        stamp = Data[4].split(" ")
        date_fields = [int(x) for x in stamp[0].split("-")]
        # time.mktime needs a 9-field tuple (Python 3 rejects a list).
        midnight = time.mktime(tuple(date_fields + [0] * 6))

        # int(float(...)) matches preProcessTestData, so a fractional field
        # such as "45.0" is parsed the same way for train and test rows.
        clock_fields = [int(float(x)) for x in stamp[1].split(":")]
        moment = time.mktime(tuple(date_fields + clock_fields + [0] * 3))

        loc_place = Data[5]
        if loc_place not in loc_dic:
            loc_dic[loc_place] = next_index
            next_index += 1

        # Feature order: row[3], seconds elapsed since midnight, location id.
        X.append([int(Data[3]), moment - midnight, loc_dic[loc_place]])

        # Label: the destination location, numbered from the same dictionary.
        des_loc_place = Data[6]
        if des_loc_place not in loc_dic:
            loc_dic[des_loc_place] = next_index
            next_index += 1
        Y.append(loc_dic[des_loc_place])

    wr = WriteResult()
    wr.WriteValueToFile(X, "trainData_X")
    wr.WriteValueToFile(Y, "trainData_Y")
    wr.WriteValueToFile(loc_dic, "train_loc_dic")
    return X, Y, loc_dic
def TestReadFile(self):
    """Smoke-check the cached train/test feature files and their scaling.

    Reads the values stored under ``"trainData_X"``, ``"trainData_Y"``,
    ``"train_loc_dic"``, ``"testData_X"`` and ``"test_loc_dic"``, min-max
    scales the train and test feature rows, and prints the results.
    """
    writer = WriteResult()
    reader = ReadCsvFile()

    cached_train_rows = reader.ReadValueFromFile("trainData_X")
    # The next two values are loaded but not used further in this check.
    reader.ReadValueFromFile("trainData_Y")
    reader.ReadValueFromFile("train_loc_dic")

    scaler = preprocessing.MinMaxScaler()
    scaled_train = scaler.fit_transform(numpy.array(cached_train_rows))
    # NOTE(review): called with the data only, no file name -- confirm that
    # WriteValueToFile has a default for its second argument.
    writer.WriteValueToFile(scaled_train)
    print(scaled_train)

    PreProcess()  # constructed as before; the instance itself is not used

    cached_test_rows = reader.ReadValueFromFile("testData_X")
    cached_test_locs = reader.ReadValueFromFile("test_loc_dic")
    print("X size: %s" % len(cached_test_rows))
    print("%s %s" % (cached_test_rows[0], len(cached_test_locs.keys())))

    # The same scaler object is re-fitted on the test rows here.
    scaled_test = scaler.fit_transform(numpy.array(cached_test_rows))
    print("")
    print(scaled_test)
def TrainModel(train_data, train_lab):
    """Fit a ``GaussianNB`` model, save it as "predict_Model" and return it.

    :param train_data: feature rows passed to ``fit``.
    :param train_lab: labels passed to ``fit``.
    :return: the fitted model instance.
    """
    classifier = GaussianNB()
    classifier.fit(train_data, train_lab)
    # Persist the fitted model before handing it back to the caller.
    WriteResult().WriteValueToFile(classifier, "predict_Model")
    return classifier