class analysis_dif:
    """Exploratory analysis of the per-timeslice gap/diff series for one district."""

    # Shared data-access helper (project-local DataIO).
    dataio = DataIO()

    def diff_analysis(self, date, distinct):
        """Plot ACF/PACF of the gap series plus the raw diff series, then fit
        and plot an ARIMA(1,1,0) forecast for one day and district.

        date: day string such as '2016-01-21'.
        distinct: district id passed straight to the DataIO queries.
        """
        difflist = []
        gaplist = []
        # NOTE(review): only slices 1..143 are read although a day has 144
        # 10-minute slices elsewhere in this codebase — confirm intentional.
        for slice in range(143):
            ts = date + '-' + str(slice + 1)
            diffdata = self.dataio.select_orderDiff_by_ds_distinct(ts, distinct)
            gap = self.dataio.select_gap(ts, distinct)
            gaplist.append(float(gap))
            difflist.append(float(diffdata))
        #print(type(diffdata))
        #plt.plot(difflist,'ro-')
        # Three stacked panels: ACF, PACF, raw diff curve.
        fig = plt.figure()
        ax1 = fig.add_subplot(311)
        fig = sm.graphics.tsa.plot_acf(gaplist, lags=20, ax=ax1)
        ax2 = fig.add_subplot(312)
        fig = sm.graphics.tsa.plot_pacf(gaplist, lags=20, ax=ax2)
        ax3 = fig.add_subplot(313)
        ax3.plot(difflist, 'ro-')
        title = date + " Distinct:" + str(distinct)
        plt.title(title)
        #arma_11 = sm.tsa.ARMA(difflist,(1,1)).fit()
        #arma_02 = sm.tsa.ARMA(difflist,(0,2)).fit()
        #arma_01 = sm.tsa.ARMA(gaplist,(1,0)).fit()
        # Fit ARIMA(1,1,0) on the gap series (old statsmodels order-tuple API).
        arima = sm.tsa.ARIMA(gaplist, (1, 1, 0)).fit()
        # NOTE(review): fig1 is immediately rebound by plot_predict(), so the
        # plt.figure(1) call appears redundant — confirm before removing.
        fig1 = plt.figure(1)
        fig1 = arima.plot_predict()
        plt.show()
def update_data():
    """Re-fetch the dataset via DataIO, report the outcome in the Streamlit
    sidebar, and return the refreshed DataFrame (empty on failure)."""
    io = DataIO(autoload=False)
    refreshed = io.update()
    if not refreshed.empty:
        # Success path: celebrate and confirm persistence.
        st.sidebar.balloons()
        st.sidebar.success("Updated Data, Wrote to disk and Loaded!")
    else:
        st.sidebar.error("Failed To Update!")
    return refreshed
def initialize(self):
    """Lazily create the output buffer, then (if any particles are loaded)
    initialize the native integration library and size the buffer.

    Safe to call when the buffer already exists; the DataIO construction
    only happens on the first call.
    """
    if self.__buf is None:
        # Buffer writes regular snapshots plus close-encounter and
        # collision event streams to separate files.
        self.__buf = DataIO(
            buf_len=self.buffer_len,
            output_file_name=self.output_file,
            close_encounter_output_file_name=self.close_encounter_output_file,
            collision_output_file_name=self.collision_output_file,
            CONST_G=self.CONST_G)
    if self.particles.N > 0:
        # initialize the C library
        self.libabie.initialize_code(
            self.CONST_G, self.CONST_C, self.particles.N,
            MAX_CE_EVENTS=self.max_close_encounter_events,
            MAX_COLLISION_EVENTS=self.max_collision_events,
            close_encounter_distance=self.close_encounter_distance)
        # Size the buffer for the current particle count.
        self.buf.initialize_buffer(self.particles.N)
def __init__(self):
    """Initialize data access, model placeholders, and default training config."""
    # Data access and the (not-yet-built) model.
    self.data_io = DataIO()
    self.model = None
    # Train/test splits — populated by a later loading step.
    self.x_train, self.y_train = None, None
    self.x_test, self.y_test = None, None
    self.num_classes = 0
    # Core training hyper-parameters.
    self.epochs = 50
    self.batch_size = 8
    self.use_noise = True
    # Distributed / multi-GPU switches.
    self.distributed_training = False
    self.multi_gpu_training = False
    self._multi_gpu_model = None
    self._n_gpus = 1
    # Logging and callbacks.
    self.callbacks = []
    self.logger = None
    self.log_level = logging.DEBUG
    # Expected image input shape (H, W, C).
    self.input_shape = (512, 512, 3)  # (256, 256, 3)
    # Wall-clock bookkeeping for a training run.
    self._t_start, self._t_end = 0, 0
def tfidf_cloud(n_trees):
    """Train an ExtraTreesRegressor on precomputed TF-IDF salary features.

    n_trees: number of trees for the forest; also baked into the model name.
    Loads joblib-dumped feature matrices and (log-)salaries from /data,
    fits, predicts on the validation split, and either writes a submission
    or reports/saves the validation MAE.  (Python 2 code.)
    """
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    min_samples_split = 2
    # Free-form experiment notes kept as a string (not used at runtime).
    param = """Normal count vector with max 200. New submission which is repeatable. and nicer count_vector_titles = TfidfVectorizer( read_column(train_filename, column_name), max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True) """
    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"
    #features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
    #["Title", "FullDescription", "LocationRaw"],
    #extra_features)
    #validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
    #["Title", "FullDescription", "LocationRaw"],
    #extra_valid_features)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)

    def load(filename):
        # All cached artifacts live under /data as joblib dumps.
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load(
        "train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")
    print "features", features.shape
    print "valid features", validation_features.shape
    #salaries = dio.get_salaries(type_n, log=True)
    #if not submission:
    #valid_salaries = dio.get_salaries(type_v, log=True)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)
    #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
    #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)
    #TODO: valid salaries were dumped incorrectly
    # NOTE(review): salaries come from the *tfidfl2* dumps while features are
    # *tfidfl1* — presumably targets are vectorizer-independent; confirm.
    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load(
        "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True  # targets are log-salaries; DataIO must invert on scoring
    print salaries.shape
    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (
        min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno")
    classifier = ExtraTreesRegressor(
        n_estimators=n_trees,
        verbose=2,
        n_jobs=4,  # 2 jobs on submission / 4 on valid test
        oob_score=False,
        min_samples_split=min_samples_split,
        random_state=3465343)
    #dio.save_model(classifier, "testni_model", 99.)
    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        # Persist predictions and write the competition submission file.
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        # Validation run: compare, score, and save model + predictions.
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score
from os.path import join as path_join
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import joblib
import cloud
import os

# Script: build TF-IDF (l1-normalized, 200 features per column) count matrices
# for the text columns, then join them with label-encoded categorical features
# into train/validation design matrices via the project's DataIO helper.
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio = DataIO("Settings.json")
vectorizer = TfidfVectorizer(max_features=200,
                             norm='l1',
                             smooth_idf=True,
                             sublinear_tf=False,
                             use_idf=True)
# Identifier baked into the cached matrix filenames.
short_id = "tfidf_200f_l1"
type_n = "train"
type_v = "valid"
# Fit/apply the vectorizer over both splits and cache per-column matrices.
dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid")
# Label-encode the categorical columns using the full training set's vocab.
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
# "%s_" is filled per text column by join_features.
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns,
    extra_valid_features)
class neural_network:
    """PyBrain feed-forward classifier mapping gap features to 7 gap buckets."""

    # NOTE: these are class-level (shared) attributes, mutated per instance.
    fnn = FeedForwardNetwork()
    inputlen = 0        # feature vector length, discovered in network_init()
    outputlen = 7       # number of gap buckets (see gene_output)
    dataio = DataIO()
    feature = cFeature()
    dataset = {}        # cached {'feature': [...], 'label': [...]}

    def network_init(self):
        """Build the input/hidden/hidden/output network topology."""
        # Probe one sample to learn the input feature length.
        tempfeature, gap = self.feature.generate('2016-01-03-100', 1)
        self.inputlen = len(tempfeature)
        # Three layers: a linear input layer ('inLayer'), two sigmoid hidden
        # layers, and a linear output layer.
        inLayer = LinearLayer(self.inputlen, name='inLayer')
        hiddenLayer1 = SigmoidLayer(7, name='hiddenLayer1')
        hiddenLayer2 = SigmoidLayer(7, name='hiddenLayer2')
        outLayer = LinearLayer(7, name='outLayer')
        # Register all layers (neurons) with the network.
        self.fnn.addInputModule(inLayer)
        self.fnn.addModule(hiddenLayer1)
        self.fnn.addModule(hiddenLayer2)
        self.fnn.addOutputModule(outLayer)
        # Create the connections between the three layers.
        in_to_hidden1 = FullConnection(inLayer, hiddenLayer1)
        hidden1_to_hidden2 = FullConnection(hiddenLayer1, hiddenLayer2)
        hidden_to_out = FullConnection(hiddenLayer2, outLayer)
        # Register the connections with the network.
        self.fnn.addConnection(in_to_hidden1)
        self.fnn.addConnection(hidden1_to_hidden2)
        self.fnn.addConnection(hidden_to_out)
        # Finalize the network so it can be activated.
        self.fnn.sortModules()

    def gene_training_sample(self):
        """Fill self.DS with (feature, one-hot label) samples, using the
        'nn_dataset.pkl' cache when present and rebuilding it otherwise."""
        self.DS = SupervisedDataSet(self.inputlen, self.outputlen)
        if os.path.exists('nn_dataset.pkl'):
            with open('nn_dataset.pkl', 'rb') as f:
                self.dataset = pickle.load(f)
            for i in range(len(self.dataset['feature'])):
                #print(self.dataset['feature'][i])
                self.DS.addSample(self.dataset['feature'][i],
                                  self.dataset['label'][i])
        else:
            backdaylen = 3  # skip the first slices lacking history
            prefix = '2016-01-'
            loop = 0
            featurelist = []
            targetlist = []
            # Days 2..21, districts 1..66, slices 1..144.
            for day in range(2, 22, 1):
                date = prefix + "{:02}".format(day)
                for distinct in range(1, 67):
                    for slice in range(1, 145):
                        if slice < backdaylen:
                            continue
                        ts_cur = date + '-' + str(slice)
                        gap_cur = self.dataio.select_gap(ts_cur, distinct)
                        # Only train on small gaps (<= 10); larger ones are skipped.
                        if gap_cur > 10:
                            continue
                        f_cur, gap = self.feature.generate(ts_cur, distinct)
                        if f_cur == None:
                            continue
                        output = self.gene_output(gap_cur)
                        featurelist.append(f_cur)
                        targetlist.append(output)
                        loop += 1
                        if loop % 1000 == 0:
                            print(loop)  # progress heartbeat
            self.dataset['feature'] = featurelist
            self.dataset['label'] = targetlist
            for i in range(len(featurelist)):
                self.DS.addSample(featurelist[i], targetlist[i])
            print(
                "Building training set is finished. Total amount is {}".format(
                    loop))
            # Cache the rebuilt dataset for subsequent runs.
            with open('nn_dataset.pkl', 'wb') as f:
                pickle.dump(self.dataset, f)

    def training_nerual_network(self):
        """Train with backprop on a 70/30 split and print accuracy plus the
        mean absolute bucket-index error on the held-out 30%."""
        dataTrain, dataTest = self.DS.splitWithProportion(0.7)
        xTrain, yTrain = dataTrain['input'], dataTrain['target']
        xTest, yTest = dataTest['input'], dataTest['target']
        trainer = BackpropTrainer(self.fnn,
                                  dataTrain,
                                  verbose=True,
                                  learningrate=0.03,
                                  momentum=0.1)
        trainer.trainUntilConvergence(maxEpochs=20)
        output = self.fnn.activateOnDataset(dataTest)
        count = 0
        countRight = 0
        error = 0
        for i in range(len(output)):
            # Predicted/real bucket = argmax over the 7 outputs.
            posReal = yTest[i].argmax()
            posPredict = output[i].argmax()
            #print('o',output[i],posPredict)
            #print('r',yTest[i],posReal)
            error += abs(posReal - posPredict)
            if posReal == posPredict:
                countRight += 1
            count += 1
        error /= count
        print('Correct rate:{:.2f} Average error:{:.2f}'.format(
            countRight / count, error))

    def gene_output(self, val):
        """One-hot encode a gap value into 7 buckets:
        {0,1}, {2}, {3}, {4,5}, {6,7}, {8,9}, {>9}."""
        output = np.zeros(self.outputlen)
        if val == 0 or val == 1:
            output[0] = 1
        if val == 2:
            output[1] = 1
        if val == 3:
            output[2] = 1
        if val == 4 or val == 5:
            output[3] = 1
        if val == 6 or val == 7:
            output[4] = 1
        if val == 8 or val == 9:
            output[5] = 1
        if val > 9:
            output[6] = 1
        return output
# Script fragment: finish CLI option parsing (op is an OptionParser created
# earlier in the file), then set up the run configuration.  (Python 2 code.)
op.add_option("--n_features",
              action="store",
              type=int,
              default=2**16,
              help="n_features when using the hashing vectorizer.")
(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print __doc__
op.print_help()
print

dio = DataIO("Settings_loc5.json")
submission = False
n_trees = 10
min_samples_split = 2
# Full splits for a submission run, held-out validation otherwise.
if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

# Unlimited-vocabulary TF-IDF with English stop words; terms in more than
# half the documents are ignored.
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
#short_id = "tfidf_200f_l1"
def get_data():
    """Fetch and return the dataset through a fresh DataIO instance."""
    return DataIO().get_data()
from data_io import DataIO

# Script: refresh the dataset (skipping the auto-load of any cached copy)
# and report the resulting DataFrame's dimensions.
data = DataIO(autoload=False)
df = data.update()
print(df.shape)
class cFeature:
    """Feature generator for the gap-prediction models: builds a feature
    vector for one (timeslice, district) pair from recent and filtered gaps."""

    # Shared state set by generate() and read by the *_feature helpers.
    dataio = DataIO()
    datelice = ''   # full timeslice id, e.g. '2016-01-03-100'
    date = ''       # date part, e.g. '2016-01-03'
    distinct = 0    # district id
    daytype = ''    # weekday/weekend label from isWeekendsText
    back_len = 3    # how many past slices the features require

    def generate(self, ds, distinct):
        """Return (feature_list, gap) for timeslice `ds` and district
        `distinct`, or (None, None) when features cannot be built (too early
        in the day, or filtered-gap history is missing)."""
        self.date = ds[0:10]
        self.datelice = ds
        self.distinct = distinct
        self.daytype = isWeekendsText(self.date)
        slice = int(ds.split('-')[-1])
        # Need back_len previous slices of history.
        if slice <= self.back_len:
            return None, None
        #--------------------feature generate----------------------#
        f = []
        #wea_feature = self.weather_feature()
        # if wea_feature != None:
        #     f.extend(wea_feature)
        # else:
        #     return None, None
        gap_feature = self.gap_feature()
        if gap_feature == None:
            return None, None
        f.extend(gap_feature)
        # ts_feature = self.ts_feature()
        # f.extend(ts_feature)
        #f.append(1)
        gap = self.dataio.select_gap(self.datelice, self.distinct)
        return f, gap

    def weather_feature(self):
        """One-hot 4-bucket weather indicator for the current timeslice, or
        None when no weather record exists."""
        weather = self.dataio.select_weatherdata_by_dateslice(self.datelice)
        if type(weather) == type(None):
            return None
        wea_feature = [0] * 4
        cur_weather = int(weather['weather'])
        # Buckets: {2,3,4}, {8}, {9}, everything else.
        if cur_weather == 2 or cur_weather == 3 or cur_weather == 4:
            wea_feature[0] = 1
        elif cur_weather == 8:
            wea_feature[1] = 1
        elif cur_weather == 9:
            wea_feature[2] = 1
        else:
            wea_feature[3] = 1
        return wea_feature

    def gap_feature(self):
        """Gap-history features: std of the last 3 gaps, last gap, first and
        second differences, plus differences of day-type-filtered gaps around
        the current slice.  Returns None when filtered history is missing."""
        gapfeature = []
        # Raw gaps at t-1, t-2, t-3.
        ls = get_last_ts(self.datelice)
        gap_b1 = self.dataio.select_gap(ls, self.distinct)
        ls = get_last_ts(ls)
        gap_b2 = self.dataio.select_gap(ls, self.distinct)
        ls = get_last_ts(ls)
        gap_b3 = self.dataio.select_gap(ls, self.distinct)
        gap_std = np.std(np.array([gap_b1, gap_b2, gap_b3]))
        gapfeature.append(gap_std)
        gap_diff_b1 = gap_b1 - gap_b2
        gap_diff_b2 = gap_b2 - gap_b3
        # if gap_b1 != 0:
        #     gapfeature.append(gap_diff_b1/gap_b1)
        # else:
        #     gapfeature.append(gap_diff_b1)
        gapfeature.append(gap_b1)
        gapfeature.append(gap_diff_b1)
        #gapfeature.append(gap_diff_b1**2)
        gapfeature.append(gap_diff_b2)
        #ls = self.datelice
        # for i in range(self.back_len):
        #     gap_filtered = self.dataio.select_filter_gap(ls,self.distinct,self.daytype)
        #     #print(ls,self.daytype)
        #     gapfeature.append(gap_filtered)
        #     ls = get_last_ts(ls)
        # Day-type-filtered (smoothed) gaps at t-2, t-1, t, t+1.
        gap_filtered_b2 = self.dataio.select_filter_gap(
            get_last_ts(get_last_ts(self.datelice)), self.distinct,
            self.daytype)
        gap_filtered_b1 = self.dataio.select_filter_gap(
            get_last_ts(self.datelice), self.distinct, self.daytype)
        gap_filtered_cur = self.dataio.select_filter_gap(
            self.datelice, self.distinct, self.daytype)
        gap_filtered_a1 = self.dataio.select_filter_gap(
            get_next_ts(self.datelice), self.distinct, self.daytype)
        # NOTE(review): gap_filtered_cur is used below but never None-checked
        # here, unlike its neighbours — confirm select_filter_gap cannot
        # return None for the current slice when the others are present.
        if gap_filtered_a1 == None or gap_filtered_b1 == None or gap_filtered_b2 == None:
            return None
        gap_filter_diff_b2 = gap_filtered_b1 - gap_filtered_b2
        gap_filter_diff_b1 = gap_filtered_cur - gap_filtered_b1
        gap_filter_diff_a1 = gap_filtered_a1 - gap_filtered_cur
        #gapfeature.append(gap_filter_diff_b2)
        gapfeature.append(gap_filter_diff_b1)
        gapfeature.append(gap_filter_diff_a1)
        gapfeature.append(gap_filtered_cur)
        #gapfeature.append(math.pow(gap_filter_diff_b1,3))
        #gapfeature.append(math.pow(gap_filter_diff_b1,2))
        return gapfeature

    def ts_feature(self):
        """Time-of-day encoding of the current slice (8-bucket helper)."""
        slice = int(self.datelice.split('-')[-1])
        ts_feature = gene_timeslice_feature(slice, 8)
        return ts_feature

    def traffic_feature(self):
        """Four traffic-congestion levels for the previous slice; zeros when
        no traffic record exists for this district."""
        traffic_info = self.dataio.select_trafficdata_by_district(
            get_last_ts(self.datelice), self.distinct)
        traffic_level = []
        if not traffic_info.empty:
            level1 = (traffic_info['level1'].values)[0]
            level2 = (traffic_info['level2'].values)[0]
            level3 = (traffic_info['level3'].values)[0]
            level4 = (traffic_info['level4'].values)[0]
            traffic_level = [level1, level2, level3, level4]
        else:
            traffic_level = [0, 0, 0, 0]
        return traffic_level
def buf(self):
    """Return the lazily-constructed DataIO buffer, creating it on first use."""
    if self.__buf is not None:
        return self.__buf
    # First access: build the buffer from the instance's configuration.
    self.__buf = DataIO(buf_len=self.buffer_len,
                        output_file_name=self.output_file,
                        CONST_G=self.CONST_G)
    return self.__buf
from data_io import DataIO #from os.path import join as path_join #import joblib import numpy as np dio = DataIO("Settings_submission.json") submission = True if submission: type_n = "train_full" type_v = "test_full" else: type_n = "train" type_v = "valid" model_names = [ "ExtraTree_min_sample2_30trees_200f_noNorm_categoryTimeType_new_log", "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_new_log", "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_new", "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_tfidfl2_new_log", "vowpall_submission", "vowpall_loc5" ] #model_names = [model2, model4] #model_names = [model1, model6, model4] #fit_predict(model2) #fit_predict(model1) #fit_predict(model3) #fit_predict(model5)
class analysis:
    """Training, evaluation, and plotting utilities for the gap-prediction
    task: kernel-ridge / optimization models per district, wavelet-smoothed
    gap-diff curves, and MAPE-style validation."""

    dataio = DataIO()
    feature = cFeature()
    wa = wavelet_ana()
    # File listing the timeslices used for in-training verification.
    verify_file_path = './predict_data_in_training.txt'
    # Day-of-month groups (January 2016).
    weekend = [2, 3, 9, 17]
    weekday = [4, 5, 6, 12, 13, 14, 15, 18]
    Sat = [2, 9]
    Sun = [3, 17]

    def time2slice(self, i_time):
        """Convert 'YYYY-mm-dd HH:MM:SS' to its 1-based 10-minute slice (1..144)."""
        t_array = datetime.datetime.strptime(i_time, "%Y-%m-%d %H:%M:%S")
        slice = t_array.hour * 6 + math.floor(t_array.minute / 10) + 1
        return slice

    def slice2time(self, slice):
        """Convert a 1-based slice number back to an 'HH:MM' string."""
        slice = int(slice)
        hour = math.floor((slice - 1) / 6)
        min = (slice - 1 - hour * 6) * 10
        timenow = "{:02}:{:02}".format(hour, min)
        return timenow

    def select_test_day(self, daylist):
        """Map day-of-month ints to '2016-01-DD' date strings."""
        daytest = []
        for i in daylist:
            day = '{:02d}'.format(i)
            prefix = '2016-01-'
            date = prefix + day
            daytest.append(date)
        return daytest

    def weather_main_trend(self, date, hour_interval=1):
        """Sample the day's weather every `hour_interval` hours and return a
        DataFrame indexed by 'HH:MM'.  Exits the process when no weather data
        exists for the day."""
        #print(self.dataio.select_weatherdata_by_dateslice(date))
        weatherlist = []
        for i in range(1, 144, 6 * hour_interval):
            dateslice = date + '-' + str(i)
            weather = self.dataio.select_weatherdata_by_dateslice(dateslice)
            if date == '2016-01-16':  # debug output for one known day
                print(weather)
            if type(weather) != type(None):
                weatherlist.append(weather)
        if len(weatherlist) == 0:
            print("len(weatherlist)==0")
            exit(1)
        weatherPD = pd.DataFrame(weatherlist)
        if date == '2016-01-16':
            print(weatherPD)
        #del weatherPD['temp']
        #del weatherPD['pm2.5']
        # Re-index rows by human-readable time of day.
        timelist = []
        for idx in weatherPD.index:
            slice = idx.split('-')[-1]
            timetext = self.slice2time(slice)
            timelist.append(timetext)
        weatherPD.index = timelist
        return weatherPD

    def write_weather_info(self):
        """Dump each day's weather trend (2016-01-01..21) to ./weather_info/<date>.txt."""
        for day in range(21):
            prefix = '2016-01-'
            date = prefix + '{:02d}'.format(day + 1)
            print(date)
            pd_weather = self.weather_main_trend(date)
            filepath = './weather_info'
            filename = date + ".txt"
            fw = open(os.path.join(filepath, filename), 'w')
            pd_weather.to_csv(fw)
            fw.close()

    def do_analysis_drawGapTrend(self):
        """Plot the demand-supply gap of district 8 for weekend and two
        weekday groups as three stacked subplots."""
        weekend = [1, 2, 3, 9, 10, 16, 17]
        weekday1 = [4, 5, 6, 7, 8]
        weekday2 = [11, 12, 13, 14, 15]
        for type in range(3):
            if type == 0:
                daytest = self.select_test_day(weekend)
                ax = plt.subplot(311)
                ax.set_title("weekend")
            if type == 1:
                daytest = self.select_test_day(weekday1)
                ax = plt.subplot(312)
                ax.set_title("weekday1")
            if type == 2:
                daytest = self.select_test_day(weekday2)
                ax = plt.subplot(313)
                ax.set_title("weekday2")
            for day in daytest:
                data = self.dataio.select_orderdata_by_district(day, 8)
                gap = (data['demand'] - data['supply'])
                gaplen = gap.shape[0]
                idx = np.array(range(gaplen)) + 1
                x_label = []
                for i in range(144):
                    # NOTE(review): uses module-level `ana`, not self — confirm
                    # `ana` is this instance at module scope.
                    x_label.append(ana.slice2time(i + 1))
                gap.index = x_label
                gap.plot(label=day)
            ax.legend(loc=2)
        plt.show()

    def train_kernel_ridge_regression_clf(self,
                                          train_daylist,
                                          distinct,
                                          gamma=1,
                                          alpha=1):
        """Fit a polynomial KernelRidge model on log10(gap) for one district
        over the given training days (gap==0 mapped to -0.1)."""
        daytest = self.select_test_day(train_daylist)
        y_train = []
        X_train = []
        for day in daytest:
            for slice in range(144):
                dateslice = day + '-' + str(slice + 1)
                #feature,gap = self.generateFeatureLabel(dateslice,distinct)
                feature, gap = self.feature.generate(dateslice, distinct)
                if feature != None:
                    if gap != 0:
                        gap = math.log10(float(gap))
                    else:
                        gap = -0.1
                    X_train.append(feature)
                    y_train.append(gap)
        clf = KernelRidge(kernel='polynomial', gamma=gamma, alpha=alpha)
        #clf = KernelRidge(kernel='polynomial', degree=3,alpha=0.01)
        clf.fit(X_train, y_train)
        return clf

    def train_optimzation_model(self, train_daylist, distinct):
        """Fit the project's custom `optimization` model on raw gap labels."""
        daytest = self.select_test_day(train_daylist)
        y_train = []
        X_train = []
        for day in daytest:
            for slice in range(144):
                dateslice = day + '-' + str(slice + 1)
                #feature, label = self.generateFeatureLabel(dateslice, distinct)
                feature, label = self.feature.generate(dateslice, distinct)
                #print(feature,label)
                #print(feature1,label1)
                #print("-----------")
                if feature != None:
                    X_train.append(feature)
                    y_train.append(label)
        opt = optimization()
        opt.fit(X_train, y_train)
        return opt

    def train_gap_diff_curve(self, day, distinct):
        """Wavelet-smooth one day's order-diff series for a district and
        return the reconstructed 144-point curve."""
        if len(day.split('-')) != 3:
            print(
                "The input of train_gap_diff_curve_by_distinct_day should be a xx-xx-xx"
            )
            exit(1)
        difflist = []
        for slice in range(144):
            dateslice = day + '-' + str(slice + 1)
            diffval = self.dataio.select_orderDiff_by_ds_distinct(
                dateslice, distinct)
            if diffval != None:
                difflist.append(diffval)
        coeffs = self.wa.get_wavelet_coeffs(difflist)
        #coeffs = self.wa.coeffs_process(coeffs)
        curve = self.wa.reconstruction_from_coeffs(coeffs)
        return np.array(curve)

    def train_gap_diff_by_distinctlist(self, distinct_list, diffcurveList,
                                       count):
        """For each district, average the smoothed diff curves over weekday /
        Saturday / Sunday groups and store them in diffcurveList[district].
        `count` is a shared one-element progress counter."""
        for distinct in distinct_list:
            count[0] += 1
            print("Training model in " +
                  "{:.1f}".format(count[0] / 66 * 100) + "% completed...")
            curve_dict = {}
            weekday = self.select_test_day(self.weekday)
            curve_sum = np.zeros(144)
            for day in weekday:
                curve = self.train_gap_diff_curve(day, distinct + 1)
                curve_sum += curve
            curve_dict['weekday'] = curve_sum / len(weekday)
            sat = self.select_test_day(self.Sat)
            curve_sum = np.zeros(144)
            for day in sat:
                curve = self.train_gap_diff_curve(day, distinct + 1)
                curve_sum += curve
            curve_dict['sat'] = curve_sum / len(sat)
            sun = self.select_test_day(self.Sun)
            curve_sum = np.zeros(144)
            for day in sun:
                curve = self.train_gap_diff_curve(day, distinct + 1)
                curve_sum += curve
            curve_dict['sun'] = curve_sum / len(sun)
            diffcurveList[distinct] = curve_dict

    def drawing_perform_by_distinct_daylist(self, clf, daylist, distinct):
        """Plot real vs predicted gaps (dashed) per day for one district."""
        daytest = self.select_test_day(daylist)
        for i, day in enumerate(daytest):
            gap_real = []
            gap_predict = []
            slice_x = []
            for slice in range(144):
                dateslice = day + '-' + str(slice + 1)
                #feature,gap = self.generateFeatureLabel(dateslice,distinct)
                feature, gap = self.feature.generate(dateslice, distinct)
                if feature == None:
                    continue
                label_predicted = clf.predict([feature])
                gap_real.append(gap)
                gap_predict.append(label_predicted)
                slice_x.append(slice)
            plt.plot(slice_x, gap_real, color=get_color(i), label=day)
            plt.plot(slice_x, gap_predict, color=get_color(i), ls='--', lw=2)
        plt.legend(loc=2)
        plt.grid()
        plt.show()

    def verifying_in_training_set(self, clf):
        """Mean absolute percentage error of the per-district classifiers on
        the verification timeslices; `clf[district][day_type]` predicts
        log10(gap)."""
        fr = open(self.verify_file_path, 'r')
        timeslicelist = []
        for line in fr:
            timeslice = line.split(' ')[0]
            timeslicelist.append(timeslice)
        fr.close()
        #------clf------distinct(0,65)-------type(0:weekday, 1:weekend)-----
        count = 0
        err_rate_sum = 0
        for timeslice in timeslicelist:
            for dis_ind in range(66):
                #clf[distinct][]
                distinct = dis_ind + 1
                date = timeslice[0:10]
                isWeekend = isWeekends(date)
                #feature,gap = self.generateFeatureLabel(timeslice,distinct)
                feature, gap = self.feature.generate(timeslice, distinct)
                if feature == None or gap == 0:
                    continue
                gap_predicted = clf[dis_ind][isWeekend].predict([feature])[0]
                # Undo the log10 transform; clamp negatives to zero.
                gap_predicted = int(math.pow(10, gap_predicted))
                if gap_predicted < 0:
                    gap_predicted = 0
                err_rate = abs((gap - gap_predicted) / gap)
                err_rate_sum += err_rate
                count += 1
        err_rate_sum /= count
        return err_rate_sum

    def verifying_in_training_set_bydiff(self, diffcurve):
        """MAPE of the diff-curve extrapolation predictor: gap(t) is estimated
        from gap(t-1) plus averaged day-type diff curves."""
        fr = open(self.verify_file_path, 'r')
        timeslicelist = []
        for line in fr:
            timeslice = line.split(' ')[0]
            timeslicelist.append(timeslice)
        fr.close()
        # ------clf------distinct(0,65)-------type(0:weekday, 1:weekend)-----
        count = 0
        err_rate_sum = 0
        for timeslice in timeslicelist:
            for dis_ind in range(66):
                distinct = dis_ind + 1
                slice = int(timeslice.split('-')[-1])
                date = timeslice[0:10]
                gap = self.dataio.select_gap(timeslice, distinct)
                if gap == 0:
                    continue
                # Raw gaps of the three preceding slices.
                ts_before1 = date + '-' + str(slice - 1)
                ts_before2 = date + '-' + str(slice - 2)
                ts_before3 = date + '-' + str(slice - 3)
                gap1 = self.dataio.select_gap(ts_before1, distinct)
                gap2 = self.dataio.select_gap(ts_before2, distinct)
                gap3 = self.dataio.select_gap(ts_before3, distinct)
                diff1 = gap1 - gap2
                diff2 = gap2 - gap3
                daytype = isWeekends(date)
                diffval1 = 0
                diffval0 = 0
                # Pick the averaged diff curve for the day type
                # (0: weekday, 1: Saturday, 2: Sunday).
                if daytype == 0:
                    curve = diffcurve[dis_ind]['weekday']
                    diffval0 = curve[slice - 1]
                    diffval1 = curve[slice - 2]
                if daytype == 1:
                    curve = diffcurve[dis_ind]['sat']
                    diffval0 = curve[slice - 1]
                    diffval1 = curve[slice - 2]
                if daytype == 2:
                    curve = diffcurve[dis_ind]['sun']
                    diffval0 = curve[slice - 1]
                    diffval1 = curve[slice - 2]
                # Extrapolate the next diff and add it to the last gap.
                gapdiff_predict = 2 * diffval1 - diff1 + diffval0
                gap_predicted = gap1 + gapdiff_predict
                if gap_predicted < 0:
                    gap_predicted = 0
                err_rate = abs((gap - gap_predicted) / gap)
                err_rate_sum += err_rate
                count += 1
        err_rate_sum /= count
        return err_rate_sum

    def calculate_norm2_error(self, clf, daylist, distinct):
        """Mean squared error between predicted and log10(actual) gaps."""
        err_val = 0
        count = 0
        daylist = self.select_test_day(daylist)
        for date in daylist:
            for slice in range(144):
                timeslice = date + '-' + str(slice + 1)
                feature, gap = self.feature.generate(timeslice, distinct)
                if feature == None or gap == 0:
                    continue
                if gap != 0:
                    gap_log = math.log10(gap)
                else:
                    gap_log = 0
                gap_predicted = clf.predict([feature])[0]
                err_val += (gap_log - gap_predicted)**2
                count += 1
        err_val /= count
        return err_val

    def calculate_mape_by_DayDistinct(self, clf, daylist, distinct):
        """MAPE of `clf` for one district over `daylist`; predictions are
        inverse-log-transformed and capped at twice the filtered gap."""
        err_rate_sum = 0
        count = 0
        daylist = self.select_test_day(daylist)
        for date in daylist:
            for slice in range(144):
                timeslice = date + '-' + str(slice + 1)
                #feature, gap = self.generateFeatureLabel(timeslice, distinct)
                feature, gap = self.feature.generate(timeslice, distinct)
                if feature == None or gap == 0:
                    continue
                gap_predicted = clf.predict([feature])[0]
                #print('Before log:',gap_predicted)
                gap_predicted = int(math.pow(10, gap_predicted))
                if gap_predicted < 0:
                    gap_predicted = 0
                isWeekend = isWeekendsText(date)
                gap_filtered = self.dataio.select_filter_gap(
                    timeslice, distinct, isWeekend)
                # Cap implausibly large predictions at 2x the smoothed gap.
                if gap_predicted > 2 * gap_filtered:
                    gap_predicted = 2 * gap_filtered
                # print('After log:', gap_predicted,gap)
                err_rate = abs((gap - gap_predicted) / gap)
                #print(timeslice+"\t{:.2f}\t{}\t{:.0f}".format(err_rate,gap,gap_predicted))
                err_rate_sum += err_rate
                count += 1
        err_rate_sum /= count
        return err_rate_sum

    # (Removed here: a large commented-out legacy generateFeatureLabel()
    # implementation superseded by cFeature.generate().)

    def gene_KRR_clf_bydaylist(self,
                               distinct_list,
                               clflist,
                               count,
                               gamma=1,
                               alpha=1):
        """Train weekday+weekend KernelRidge models for each district and
        store [clf_weekday, clf_weekend] in clflist[district].  `count` is a
        shared one-element progress counter (jittered sleep suggests use from
        multiple workers)."""
        for distinct in distinct_list:
            rand = random.random()
            time.sleep(rand / 10)
            count[0] += 1
            print("Training model in " +
                  "{:.1f}".format(count[0] / 66 * 100) + "% completed...")
            # NOTE(review): `weekday`/`weekend` are bare names, unlike
            # train_OPT_clf_bydaylist which uses self.weekday/self.weekend —
            # confirm module-level lists exist or this raises NameError.
            clf_weekday = self.train_kernel_ridge_regression_clf(
                weekday, distinct + 1, gamma, alpha)
            clf_weekend = self.train_kernel_ridge_regression_clf(
                weekend, distinct + 1, gamma, alpha)
            clflist[distinct] = [clf_weekday, clf_weekend]

    def train_OPT_clf_bydaylist(self, distinct_list, clflist, count):
        """Train weekday+weekend `optimization` models per district and store
        [clf_weekday, clf_weekend] in clflist[district]."""
        for distinct in distinct_list:
            rand = random.random()
            time.sleep(rand / 10)
            count[0] += 1
            print("Training model in " +
                  "{:.1f}".format(count[0] / 66 * 100) + "% completed...")
            clf_weekday = self.train_optimzation_model(self.weekday,
                                                       distinct + 1)
            clf_weekend = self.train_optimzation_model(self.weekend,
                                                       distinct + 1)
            clflist[distinct] = [clf_weekday, clf_weekend]
def __init__(self):
    """Build the Customer Segmentation main window: input picker, action
    buttons (initially disabled until data is loaded/analyzed), a summary
    text box, and a results grid.  Button labels are Vietnamese UI strings."""
    self.data_io = DataIO()
    self.input = None
    self.content_data = None
    # Analysis outputs, filled in by onExecute.
    self.execute_data, self.prior_1, self.prior_2, self.leave = None, None, None, None
    wx.Frame.__init__(self,
                      None,
                      wx.ID_ANY,
                      "Customer Segmentation",
                      size=(800, 640))
    self.panel = wx.Panel(self, wx.ID_ANY)
    status = self.CreateStatusBar()
    self.sizer = wx.BoxSizer(wx.VERTICAL)
    # Input file path + browse button.
    file_label = wx.StaticText(self.panel, -1, "Input:", (30, 20))
    self.InputPathTextBox = wx.TextCtrl(self.panel,
                                        -1,
                                        "",
                                        size=(200, -1),
                                        pos=(100, 21))
    self.browse_btn = wx.Button(self.panel, -1, "Browse", pos=(310, 20))
    self.Bind(wx.EVT_BUTTON, self.onOpenFile, self.browse_btn)
    # output_label = wx.StaticText(self.panel, -1, "Ket qua:", (30, 50))
    # self.OutputPathTextBox = wx.TextCtrl(self.panel, -1, "", size=(200, -1), pos=(100, 47))
    # "Analyze" button — enabled once an input file is chosen.
    self.execute_btn = wx.Button(self.panel, -1, "Phân tích", pos=(400, 20))
    self.Bind(wx.EVT_BUTTON, self.onExecute, self.execute_btn)
    self.execute_btn.Disable()
    # Read-only summary box ("Overview").
    inform_label = wx.StaticText(self.panel, -1, "Tổng quát:", (30, 60))
    self.inform_txt = wx.TextCtrl(self.panel,
                                  -1,
                                  "",
                                  style=wx.TE_MULTILINE | wx.TE_READONLY,
                                  size=(463, 70),
                                  pos=(100, 60))
    # Result-group buttons: priority 1, priority 2, churn prediction,
    # all groups, chart — all disabled until analysis completes.
    self.prior1_btn = wx.Button(self.panel, -1, "Nhóm Ưu tiên 1",
                                pos=(98, 140))
    self.Bind(wx.EVT_BUTTON, self.onOpenPrior1, self.prior1_btn)
    self.prior1_btn.Disable()
    self.prior2_btn = wx.Button(self.panel, -1, "Nhóm Ưu tiên 2",
                                pos=(210, 140))
    self.Bind(wx.EVT_BUTTON, self.onOpenPrior2, self.prior2_btn)
    self.prior2_btn.Disable()
    self.leave_btn = wx.Button(self.panel, -1, "Dự đoán rời mạng",
                               pos=(320, 140))
    self.Bind(wx.EVT_BUTTON, self.onOpenLeave, self.leave_btn)
    self.leave_btn.Disable()
    self.total_btn = wx.Button(self.panel, -1, "Toàn bộ các nhóm",
                               pos=(443, 140))
    self.Bind(wx.EVT_BUTTON, self.onOpenTotal, self.total_btn)
    self.total_btn.Disable()
    self.chart_btn = wx.Button(self.panel, -1, "Biểu đồ", pos=(580, 140))
    self.Bind(wx.EVT_BUTTON, self.onOpenChart, self.chart_btn)
    self.chart_btn.Disable()
    # Export button — enabled after analysis produces data to save.
    self.save_btn = wx.Button(self.panel, -1, "Export", pos=(680, 140))
    self.Bind(wx.EVT_BUTTON, self.onSaveFile, self.save_btn)
    self.save_btn.Disable()
    # 20x10 grid displaying the analyzed records.
    self.data_grid = grid.Grid(self.panel, size=(780, 400), pos=(5, 180))
    self.data_grid.CreateGrid(20, 10)
    self.panel.SetSizer(self.sizer)