def benchmark(client_type, brokers, topic, num_messages, msg_size, num_runs):
    payload = b"m" * msg_size
    if client_type == 'producer':
        client = KafkaProducer(bootstrap_servers=brokers)
        benchmark_fn = _produce
    elif client_type == 'consumer':
        client = KafkaConsumer(topic, bootstrap_servers=brokers,
                               group_id=str(uuid.uuid1()),
                               auto_offset_reset="earliest")
        client.subscribe([topic])
        benchmark_fn = _consume
    print(f"Starting benchmark for Kafka-Python {client_type}.")
    run_times = []
    for _ in range(num_runs):
        run_start_time = time.time()
        benchmark_fn(client, topic, payload, num_messages)
        run_time_taken = time.time() - run_start_time
        run_times.append(run_time_taken)
    utils.print_results(f"Kafka-Python {client_type}", run_times,
                        num_messages, msg_size)
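# A minimal sketch of the _produce/_consume helpers benchmark() assumes,
# matching the shared call signature benchmark_fn(client, topic, payload,
# num_messages). Hypothetical: the real helpers are not shown in this file.
def _produce(producer, topic, payload, num_messages):
    for _ in range(num_messages):
        producer.send(topic, payload)
    producer.flush()  # block until every buffered message is delivered


def _consume(consumer, topic, payload, num_messages):
    # topic/payload are unused here but kept for the shared signature
    received = 0
    for _ in consumer:  # KafkaConsumer is iterable and blocks for records
        received += 1
        if received >= num_messages:
            break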
def train_lstm():
    # Create symbolic vars
    x = Input(shape=(None, in_dim), dtype='float32', name='input')

    # Create network
    # fw_cell = LSTM(hidden_units_size, return_sequences=False,
    #                implementation=2)(x)
    fw_cell = CuDNNLSTM(hidden_units_size, return_sequences=False)(x)
    h3 = Dense(classes, activation='softmax', use_bias=False)(fw_cell)
    model = Model(inputs=x, outputs=h3)
    validate_lstm_in_out(model)

    start = timer.perf_counter()
    model.compile(optimizer='Adam', loss='categorical_crossentropy')
    end = timer.perf_counter()
    print('>>> Model compilation took {:.1f} seconds'.format(end - start))

    # Print parameter count
    params = model.count_params()
    print('# network parameters: ' + str(params))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(nb_batches):
        batch_start = timer.perf_counter()
        loss = model.train_on_batch(x=bX, y=to_categorical(bY, num_classes=classes))
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss)
    train_end = timer.perf_counter()
    print_results(batch_loss, batch_time, train_start, train_end)
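# train_lstm() reads several module-level names (timer, in_dim,
# hidden_units_size, classes, nb_batches, bX, bY). An illustrative setup,
# assuming "import time as timer" and synthetic random data; every value
# below is a placeholder, not the benchmark's actual configuration.
import time as timer
import numpy as np

in_dim = 26             # input feature dimension
hidden_units_size = 256
classes = 10
nb_batches = 100
batch_size = 64
seq_len = 100
bX = np.random.rand(batch_size, seq_len, in_dim).astype('float32')
bY = np.random.randint(0, classes, size=batch_size)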
def train_lr(X, y, X_test, y_test, t, col_names=None, sample_weight=None):
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    start = datetime.datetime.now()
    X_test_trans = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    if sample_weight is not None:
        lr.fit(X, y, sample_weight)
    else:
        lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test_trans, y_test))
    if col_names is not None:
        c = np.column_stack((col_names, np.round(lr.coef_.flatten(), 2)))
        sorted_c = c[c[:, 1].argsort()]
        print(sorted_c[:10])   # most negative coefficients
        print(sorted_c[-10:])  # most positive coefficients
    y_prob = lr.predict_proba(X_test_trans)
    end = datetime.datetime.now()
    delta = end - start
    y_pred = (y_prob[:, 1] > t).astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    print('total time predictions: %f (s)' % delta.total_seconds())
    print('time per query: %f (s)' % (delta.total_seconds() / len(y_pred)))
    # Inspect per-feature contributions on the mispredicted test rows.
    false_preds = y_pred != y_test
    false_vectors = np.multiply(lr.coef_, X_test_trans[false_preds, :])
    print(false_vectors.shape)
    return y_pred
def run_experiment(params):
    tf.reset_default_graph()
    (train_step, cost, accuracy, y_pred_cls, y_true_cls,
     placeholders) = create_network(params['img_size'],
                                    params['num_channels'],
                                    params['num_classes'],
                                    params['num_fc_layer1_output'],
                                    params['learning_rate'])
    saver = tf.train.Saver()
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])
    (train_acc, cost, test_acc) = train_network(params['data'], train_step, cost,
                                                accuracy, params['num_iterations'],
                                                params['train_batch_size'],
                                                placeholders, saver,
                                                params['save_dir'],
                                                params['plot_dir'],
                                                params['log_dir'],
                                                params['display_step'])
    cls_true, cls_pred, acc = test_network(params['test_batch_size'],
                                           placeholders, saver,
                                           params['save_dir'], accuracy,
                                           y_pred_cls, y_true_cls,
                                           params['data'])
    print_results(cls_pred, cls_true, "", params['plot_dir'])
    return train_acc, cost, test_acc, acc
def train_lr(X, y, X_test, y_test, t, col_names=None, sample_weight=None):
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    start = datetime.datetime.now()
    X_test_trans = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    if sample_weight is not None:
        lr.fit(X, y, sample_weight)
    else:
        lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test_trans, y_test))
    if col_names is not None:
        c = np.column_stack((col_names, np.round(lr.coef_.flatten(), 2)))
        print(c[c[:, 1].argsort()])
    y_prob = lr.predict_proba(X_test_trans)
    end = datetime.datetime.now()
    delta = end - start
    y_pred = (y_prob[:, 1] > t).astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    print('total time predictions: %f (s)' % delta.total_seconds())
    print('time per query: %f (s)' % (delta.total_seconds() / len(y_pred)))
    return y_pred
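# Illustrative call, assuming the features live in a pandas DataFrame df
# with a 'label' column (hypothetical names) and a decision threshold of
# 0.5 on the positive-class probability.
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(
    df.drop('label', axis=1), df['label'], test_size=0.33, random_state=1)
preds = train_lr(X_tr, y_tr, X_te, y_te, t=0.5,
                 col_names=df.drop('label', axis=1).columns.values)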
def main():
    opt_val = 0
    opt_ind = []
    parser = argparse.ArgumentParser()
    parser.add_argument('--DATASET', '-d', type=str, default='b_small')
    parser.add_argument('--METHOD', '-m', type=str, default='recursive_fill_in')
    args = parser.parse_args()
    dataset = 'datasets/' + args.DATASET + '.in'
    method = args.METHOD
    max_slices, slice_numbers = utils.read_data(dataset, display=1)
    mem_start = psutil.virtual_memory()
    time_start = time.time()
    if method == 'brute_force':
        opt_ind, opt_val = solutions.brute_force(slice_numbers, max_slices)
    elif method == 'smart_brute_force':
        opt_ind, opt_val = solutions.smart_brute_force(slice_numbers, max_slices)
    elif method == 'longest_path':
        opt_ind, opt_val = solutions.longest_path(slice_numbers, max_slices)
    elif method == 'recursive_fill_in':
        opt_ind, opt_val = solutions.recursive_fill_in(slice_numbers, max_slices)
    # elif method == "NEW_METHOD":
    #     opt_ind, opt_val = solutions.NEW_METHOD(slice_numbers, max_slices)
    time_end = time.time()
    runtime = time_end - time_start
    mem_end = psutil.virtual_memory()
    # Note: psutil's 'wired' field is only available on macOS.
    memory_used = abs(mem_end.wired - mem_start.wired) * 1e-6  # bytes -> MB
    utils.print_results(opt_val, opt_ind, method, runtime, memory_used,
                        max_slices, slice_numbers)
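# A portable alternative (a sketch, assuming per-process RSS is an
# acceptable proxy): psutil.Process().memory_info().rss works on Linux,
# macOS, and Windows, unlike virtual_memory().wired, which psutil exposes
# only on macOS.
import psutil

def measure_rss_delta(fn, *args, **kwargs):
    proc = psutil.Process()
    rss_start = proc.memory_info().rss
    result = fn(*args, **kwargs)
    used_mb = (proc.memory_info().rss - rss_start) * 1e-6
    return result, used_mb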
async def _producer_benchmark(brokers, topic, num_messages, msg_size, num_runs):
    payload = bytearray(b"m" * msg_size)
    producer_config = dict(
        bootstrap_servers=brokers,
    )
    loop = asyncio.get_event_loop()
    producer = AIOKafkaProducer(loop=loop, **producer_config)
    await producer.start()
    print("Starting benchmark for AIOKafka Producer.")
    run_times = []
    try:
        for _ in range(num_runs):
            run_start_time = time.time()
            await _produce(producer, topic, payload, num_messages)
            run_time_taken = time.time() - run_start_time
            run_times.append(run_time_taken)
    except asyncio.CancelledError:
        pass
    finally:
        await producer.stop()
    utils.print_results(
        "AIOKafka Producer", run_times, num_messages, msg_size
    )
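# A plausible _produce for the async path (hypothetical; the real helper is
# not shown). aiokafka's send_and_wait() awaits delivery of each message;
# batching via send() plus a final flush() would also work.
async def _produce(producer, topic, payload, num_messages):
    for _ in range(num_messages):
        await producer.send_and_wait(topic, payload)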
def main(args):
    img = Image.open(args.img_path)
    if args.has_reference != 'False':
        ref_img = Image.open(args.has_reference)

    # begin processing
    if args.method == 'HE':
        model = HE()
    elif args.method == 'Gamma':
        model = Gamma(args.gamma)
    elif args.method == 'Gray_World':
        model = Gray_World(args.gamma)
    elif args.method == 'Retinex':
        model = Retinex(args)
    elif args.method == 'Max_RGB':
        model = Max_RGB(args.gamma)
    elif args.method == 'DeHaze':
        model = DeHaze(args.omega, args.kernel_size, args.model)
    elif args.method == 'LIME':
        model = LIME(args.gamma, args.alpha, args.sigma, args.kernel_size)
    pro_img = model.run(img)  # process the image

    # print out the results
    if args.has_reference == 'False':
        show_results(img, pro_img)
        print_results(args, img, pro_img)
    else:
        show_results(img, pro_img, ref_img)
        print_results(args, img, pro_img, ref_img)
    return
def run():
    start_time = get_start_time()
    (n_people_to_compare_with, max_len_closest_matches, image_name,
     person_count, image_path) = get_user_inputs()
    file_count = get_number_of_pics_to_compare_with(
        n_people_to_compare_with, lfw_path, person_count, image_name)
    original_image_encodings = get_image_encodings(image_path)
    closest_matches_sorted = compare_with_other_images(
        lfw_path, file_count, original_image_encodings, image_name,
        max_len_closest_matches, start_time, n_people_to_compare_with)
    print_results(closest_matches_sorted, image_path)
    output_results_csv(closest_matches_sorted, start_time, image_name,
                       max_len_closest_matches, file_count,
                       n_people_to_compare_with)
    output_results_image(closest_matches_sorted, start_time, image_name,
                         max_len_closest_matches, file_count,
                         n_people_to_compare_with)
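# A guess at get_image_encodings(), assuming the face_recognition package
# (the LFW path and the "encodings" naming suggest it); this helper is
# hypothetical and not shown in the file.
import face_recognition

def get_image_encodings(image_path):
    image = face_recognition.load_image_file(image_path)
    return face_recognition.face_encodings(image)  # one 128-d vector per face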
async def _consumer_benchmark(brokers, topic, num_messages, msg_size, num_runs):
    loop = asyncio.get_event_loop()
    consumer = AIOKafkaConsumer(
        topic,
        bootstrap_servers=brokers,
        group_id=str(uuid.uuid1()),
        auto_offset_reset="earliest",
        enable_auto_commit=False,
        loop=loop
    )
    await consumer.start()
    print("Starting benchmark for AIOKafka Consumer.")
    run_times = []
    try:
        for _ in range(num_runs):
            run_start_time = time.time()
            await _consume(consumer, num_messages)
            run_time_taken = time.time() - run_start_time
            run_times.append(run_time_taken)
    except asyncio.CancelledError:
        pass
    finally:
        await consumer.stop()
    utils.print_results(
        "AIOKafka Consumer", run_times, num_messages, msg_size
    )
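# A matching _consume sketch (hypothetical): AIOKafkaConsumer supports
# "async for" iteration over fetched records.
async def _consume(consumer, num_messages):
    received = 0
    async for _msg in consumer:
        received += 1
        if received >= num_messages:
            break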
def video_process(video_path, pm_model, save_video_flag):
    video_thread = VideoThread(video_path, 1280, 960, 1, 'video thread')
    video_thread.start()
    serial_thread = SerialThread('serial thread')
    serial_thread.start()
    init_flag = True
    while True:
        frame_read = video_thread.get_image()
        if frame_read is None:
            print('Failed to grab video frame!')
            break
        # if init_flag and save_video_flag:
        #     # In video mode, write the detection video to disk
        #     save_name = 'save_video.avi'
        #     print('Saving video to ' + save_name)
        #     out_video = cv2.VideoWriter(save_name, cv2.VideoWriter_fourcc(*"MJPG"), 10.0,
        #                                 (frame_read.shape[1], frame_read.shape[0]))
        #     init_flag = False
        if init_flag:
            init_flag = False
            continue
        # [class id, confidence, center coords, top-left coords, bottom-right coords]
        boxes = pm_model.predict(frame_read)
        print_results(boxes, pm_model.label_names, init_flag)
        draw_results(frame_read, boxes, pm_model.colors, pm_model.label_names, False)
        serial_thread.set_data(boxes)
def classify(self, instance):
    self.attempts += 1
    self.clear_neighbors()
    for cur_inst in self.instances:
        distance = self.euclidean_distance(cur_inst, instance)
        if len(self.neighbors) < self.k or distance < self.max_neighbor_distance():
            new_neighbor = dict()
            new_neighbor['distance'] = distance
            new_neighbor['instance'] = cur_inst
            self.add_neighbor(new_neighbor)
    distribution = dict()
    for neighbor in self.neighbors:
        if not neighbor['instance'][-1] in distribution:
            distribution[neighbor['instance'][-1]] = 1
        else:
            distribution[neighbor['instance'][-1]] += 1
    max_count = 0
    value = ''
    for label, count in distribution.items():
        if count > max_count:
            max_count = count
            value = label
    if value == instance[-1]:
        self.hits += 1
    self.train(instance)
    if not self.attempts % 100:
        utils.print_results(self.attempts, self.hits)
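# Plausible neighbor-list helpers referenced above (hypothetical; the real
# methods are not shown): keep the list sorted by distance and capped at
# the k nearest entries.
def add_neighbor(self, new_neighbor):
    self.neighbors.append(new_neighbor)
    self.neighbors.sort(key=lambda n: n['distance'])
    del self.neighbors[self.k:]  # drop anything beyond the k nearest

def max_neighbor_distance(self):
    return self.neighbors[-1]['distance']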
def test_monza():
    print('MONZA TEST:')
    start = np.array([0.5, 1.0, 4.9])
    goal = np.array([3.8, 1.0, 0.1])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/monza.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/monza.txt')

def test_room():
    print('ROOM TEST:')
    start = np.array([1.0, 5.0, 1.5])
    goal = np.array([9.0, 7.0, 1.5])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/room.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/room.txt')

def test_tower():
    print('TOWER TEST:')
    start = np.array([2.5, 4.0, 0.5])
    goal = np.array([4.0, 2.5, 19.5])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/tower.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/tower.txt')

def test_window():
    print('WINDOW TEST:')
    start = np.array([0.2, -4.9, 0.2])
    goal = np.array([6.0, 18.0, 3.0])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/window.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/window.txt')

def test_maze():
    print('MAZE TEST:')
    start = np.array([0.0, 0.0, 1.0])
    goal = np.array([12.0, 12.0, 5.0])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/maze.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/maze.txt')

def test_single_cube():
    print('SINGLE_CUBE TEST:')
    start = np.array([2.3, 2.3, 1.3])
    goal = np.array([7.0, 7.0, 6.0])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/single_cube.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/single_cube.txt')

def test_flappy_bird():
    print('FLAPPY_BIRD TEST:')
    start = np.array([0.5, 2.5, 5.5])
    goal = np.array([19.0, 2.5, 5.5])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/flappy_bird.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/flappy_bird.txt')
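# Optional driver for the map tests above (an assumption: the original
# entry point is not shown here).
if __name__ == '__main__':
    for test in (test_single_cube, test_maze, test_flappy_bird, test_monza,
                 test_window, test_tower, test_room):
        test()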
def train_epoch(self, epoch):
    """ Evaluate the model on the train set. """
    t1 = time()
    output = {'tp': [], 'fp': [], 'fn': [], 'tn': [], 'loss': [], 'preds': []}
    train_info = []
    self.model = self.model.train()
    train_iter = self.iterator(self.data['train'],
                               batch_size=self.params['batch'],
                               shuffle_=self.params['shuffle_data'])
    self.optimizer.zero_grad()
    for batch_idx, batch in enumerate(train_iter):
        batch = self.convert_batch(batch)
        with autograd.detect_anomaly():
            loss, stats, predictions, select = self.model(batch)
            loss.backward()  # backward computation
        output['loss'] += [loss.item()]
        output['tp'] += [stats['tp'].to('cpu').data.numpy()]
        output['fp'] += [stats['fp'].to('cpu').data.numpy()]
        output['fn'] += [stats['fn'].to('cpu').data.numpy()]
        output['tn'] += [stats['tn'].to('cpu').data.numpy()]
        output['preds'] += [predictions.to('cpu').data.numpy()]
        train_info += [batch['info'][select[0].to('cpu').data.numpy(),
                                     select[1].to('cpu').data.numpy(),
                                     select[2].to('cpu').data.numpy()]]
        # Accumulate gradients (by Yuwei Xu)
        if (batch_idx + 1) % self.accumulation_steps == 0:
            nn.utils.clip_grad_norm_(self.model.parameters(), self.gc)  # gradient clipping
            self.optimizer.step()
            self.optimizer.zero_grad()
    t2 = time()
    if self.window:
        total_loss, scores = self.subdocs_performance(
            output['loss'], output['preds'], train_info)
    else:
        total_loss, scores = self.performance(output)
    self.train_res['loss'] += [total_loss]
    self.train_res['score'] += [scores[self.primary_metric]]
    print('Epoch: {:02d} | TRAIN | LOSS = {:.05f}, '.format(epoch, total_loss),
          end="")
    print_results(scores, [], self.show_class, t2 - t1)
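# A common refinement of gradient accumulation (an assumption, not used in
# the trainer above): scale each mini-batch loss by 1/accumulation_steps
# before backward() so the summed gradient matches a single batch of size
# batch * accumulation_steps.
import torch

def accumulate_backward(loss: torch.Tensor, accumulation_steps: int) -> None:
    (loss / accumulation_steps).backward()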
def print_results(self, dtc_accuracy, rfc_accuracy):
    text = 'Experiment #{}\n' \
           'Training size = {:0.0f}%\n' \
           'Decision Tree Classifier accuracy = {:0.2f}%\n' \
           'Random Forest Classifier accuracy = {:0.2f}%\n\n'.format(
               self.num, self.data.train_size * 100,
               dtc_accuracy * 100, rfc_accuracy * 100)
    print_results(text)
def _run_ml(df, n_runs, mb, ml_keys, ml_score_keys, backend):
    ml_scores, ml_times = ml(df=df, n_runs=n_runs, mb=mb,
                             ml_keys=ml_keys, ml_score_keys=ml_score_keys)
    print_results(results=ml_times, backend=backend, unit="s")
    ml_times["Backend"] = backend
    print_results(results=ml_scores, backend=backend)
    ml_scores["Backend"] = backend
    return ml_times
def testing_simple(self):
    X = np.array([[1, 3], [2, 3], [3, 2], [3, 1], [1, 1], [2, 2]])
    y = np.array([0, 0, 0, 0, 1, 1])
    kc = KClustering()
    kc.fit(X, y)
    utils.print_results(kc.predict(X), y)
    self.plot_results(X, y)
def main(data_path="data/features/", out_path="data/models/svc/"): X_train, X_test, y_train, y_test = read_data(data_path) name = "LinearSVC" params = read_params("params.yaml", "svc") model = LinearSVC(**params) model.fit(X_train, y_train) accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test) print_results(accuracy, c_matrix, name) save_results(out_path, model, fig) log_experiment(out_path, metrics=dict(accuracy=accuracy, confusion_matrics=c_matrix))
def main(data_path='data/features/', out_path='data/models/logistic/'):
    X_train, X_test, y_train, y_test = read_data(data_path)
    name = 'LogisticRegression'
    params = read_params('params.yaml', 'logistic')
    model = LogisticRegression(**params)
    model.fit(X_train, y_train)
    accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test)
    print_results(accuracy, c_matrix, name)
    save_results(out_path, model, fig)
    log_experiment(out_path, params=params,
                   metrics=dict(accuracy=accuracy, confusion_matrix=c_matrix))
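# A plausible read_params() matching the calls in these training scripts
# (hypothetical; the real helper is not shown): load one named section
# from a YAML file.
import yaml

def read_params(path, section):
    with open(path) as f:
        return yaml.safe_load(f)[section]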
def run(name):
    global x_train
    global y_train
    global x_dev
    global y_dev
    x_train, y_train = shuffle(x_train, y_train, random_state=1)
    x_dev, y_dev = shuffle(x_dev, y_dev, random_state=1)
    weights = model.train(x_train, y_train)
    print("validating: " + name)
    model.validate(weights, x_dev, y_dev)
    print("writing test predictions for " + name)
    results = model.get_test_results(x_test, weights)
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    pred_path = OUTPUT_DIR + "/test." + name + ".pred"
    utils.print_results(results, pred_path)
def main():
    raw_data = pnd.read_table('../data/segmentation.test', sep=',',
                              header=None, lineterminator='\n')
    data = Data(raw_data, 0.7)
    bayes_sklearn_accuracy, bayes_native_accuracy = test_bayes(data)
    sklearn_kn_clf_accuracy, my_kn_clf_accuracy = test_knn(data, 10)
    text = 'bayes,{:0.4f}%,{:0.4f}%\n' \
           'knn,{:0.4f}%,{:0.4f}%\n'.format(bayes_sklearn_accuracy,
                                            bayes_native_accuracy,
                                            sklearn_kn_clf_accuracy,
                                            my_kn_clf_accuracy)
    print_results(text)
def main(data_path='data/features/', out_path='data/models/r_forrest/',
         n_estimators=10, max_samples=30):
    X_train, X_test, y_train, y_test = read_data(data_path)
    name = 'RandomForest'
    params = read_params('params.yaml', 'forrest')
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test)
    print_results(accuracy, c_matrix, name)
    save_results(out_path, model, fig)
    log_experiment(out_path, params=params,
                   metrics=dict(accuracy=accuracy, confusion_matrix=c_matrix))
def classify(self, instance): """Classify a record from a testing set and show results""" self.attempts += 1 max_probability = 0 class_value = '' for clazz in self.classes: probability = clazz.class_probability(instance[0:-1]) #print 'Probability for outcome ' + class_.value + ' is ' + str(probability) if probability > max_probability: max_probability = probability class_value = clazz.value if class_value == instance[-1]: self.hits += 1 if not self.attempts % 100: utils.print_results(self.attempts,self.hits)
def main(data_path='data/features/', model_path='data/models/',
         out_path='data/models/ensemble/'):
    X_train, X_test, y_train, y_test = read_data(data_path)
    name = 'Ensemble'
    params = read_params('params.yaml', 'ensemble')
    cl1 = load_model(f'{model_path}/logistic/')
    cl2 = load_model(f'{model_path}/svc/')
    cl3 = load_model(f'{model_path}/r_forrest/')
    estimators = [('l_regression', cl1), ('l_svc', cl2), ('r_forrest', cl3)]
    model = VotingClassifier(estimators, **params)
    model.fit(X_train, y_train)
    accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test)
    print_results(accuracy, c_matrix, name)
    save_results(out_path, model, fig)
    log_experiment(out_path,
                   metrics=dict(accuracy=accuracy, confusion_matrix=c_matrix))
def test():
    m1, m2, m3, mt = ([-4, 2], [0, 5], [4, 2], [0, 0])
    C = np.array([[1, 0], [0, 1]])
    n_samples = 1000
    X1 = np.random.multivariate_normal(m1, C, n_samples)
    X2 = np.random.multivariate_normal(m2, C, n_samples)
    X3 = np.random.multivariate_normal(m3, C, n_samples)
    Xt = np.random.multivariate_normal(mt, C, n_samples)
    X = np.vstack((X1, X2, X3, Xt))
    y = np.concatenate((np.zeros(3 * n_samples), np.ones(n_samples)))
    kc = KClustering()
    kc.fit(X, y, plot=True)
    f = kc.predict(X)
    print('## Prediction Results ##')
    utils.print_results(f, y)
def main(argv):
    filename = argv[0]
    t = float(argv[1])
    size = 0.33
    df = pd.read_csv('../../data/cache_selection/' + filename)
    labels = df['Label']
    df = df.drop(['Id', 'Label'], axis=1)
    # print(df.corr()['Label'].sort_values())
    X, X_test, y, y_test = train_test_split(df, labels, stratify=labels,
                                            test_size=size, random_state=1)
    X = X.drop(['TestViewCount', 'Query', '18', '100'], axis=1)
    vc = X_test['TestViewCount']
    test_queries = X_test['Query']
    q18 = X_test['18']
    q100 = X_test['100']
    X_test = X_test.drop(['TestViewCount', 'Query', '18', '100'], axis=1)
    ql = q18.copy()
    ql_pred = X_test['ql_t'] < X_test['ql_t.1']
    ql.loc[ql_pred == 1] = q100[ql_pred == 1]
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0], np.sum(y_test)))
    print("ones ratio in train set = %.2f" % (100 * np.sum(y) / y.shape[0]))
    print("ones ratio in test set = %.2f" % (100 * np.sum(y_test) / y_test.shape[0]))

    # learn the model
    # sc = StandardScaler().fit(X)
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training LR..")
    lr = linear_model.LogisticRegression()
    lr.fit(X, y)
    # lr.fit(X, y, sample_weight=vcx)
    print("train/test mean accuracy = %.2f, %.2f" %
          (lr.score(X, y), lr.score(X_test, y_test)))
    y_pred = lr.predict(X_test)
    print_results(y_test, y_pred)

    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    # lr.fit(X, y, sample_weight=vcx)
    print("train/test mean accuracy = %.2f, %.2f" %
          (lr.score(X, y), lr.score(X_test, y_test)))
    # c = np.column_stack((df.columns.values[1:-1], np.round(lr.coef_.flatten(), 2)))
    # print(c[c[:, 1].argsort()])
    start = datetime.datetime.now()
    y_prob = lr.predict_proba(X_test)
    y_pred = (y_prob[:, 1] > t).astype('uint8')
    print(y_pred.shape)
    end = datetime.datetime.now()
    print('--- results:')
    print_results(y_test, y_pred)
    delta = end - start
    print('total time: %f' % delta.total_seconds())
    print('time per query: %f' % (delta.total_seconds() / len(y_pred)))
    print('test size (distinct): %d' % y_pred.size)
    print('test size (all): %d' % vc.sum())
    ones = vc * y_pred
    print('ones ratio: %.2f' % (ones.sum() / y_pred.size))

    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestViewCount'] = vc
    output['Label'] = y_test
    output['Pred'] = pd.Series(y_pred, index=output.index)
    output['18'] = q18
    output['100'] = q100
    output['ql'] = ql
    output['ml'] = output.apply(f, axis=1)
    output['best'] = output.apply(g, axis=1)
    r = np.random.randint(0, 2, q18.size)
    output['rand'] = q18.copy()
    output.loc[r == 1, 'rand'] = q100[r == 1]
    if argv[2]:
        output.to_csv('../../data/python_data/%s_result.csv' % filename[:-4],
                      index=False)
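# Hypothetical row functions f and g used in output.apply() above, inferred
# from the 'ml' and 'best' column names: f routes each query to the cache
# ('18') or full-database ('100') score by the model's prediction, g by the
# true label (an oracle upper bound). The real definitions are not shown.
def f(row):
    return row['100'] if row['Pred'] == 1 else row['18']

def g(row):
    return row['100'] if row['Label'] == 1 else row['18']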
""" # Use Pipl to match remainder for dev in [dev for dev in devs if dev['email']]: if dev['li_matches']: if dev['li_matches'][0]['score'] < 75: print "Trying piplsearch for %s..." % dev.get('name') result = try_piplsearch(dev) if result: dev['li_matches'].append(result) # re-sort from highest scoring match to lowest dev['li_matches'] = sorted(dev['li_matches'], key =lambda k: k['score'], reverse=True) else: print "Trying piplsearch for %s..." % dev.get('name') result = try_piplsearch(dev) if result: dev['li_matches'].append(result) # re-sort from highest scoring match to lowest dev['li_matches'] = sorted(dev['li_matches'], key =lambda k: k['score'], reverse=True) """ # See the results of running through both algos #devs = utils.load_json('scoredresults') utils.print_results(devs) #utils.sanity_check(devs)
else:
    offset = get_split_offset(args.percentage, training_set)
    testing_set = training_set[offset:]
    training_set = training_set[:offset]
    print 'Testing set is derived from training set'

print 'Training set: %d instances, testing set: %d instances' % (
    len(training_set), len(testing_set))

# Choose classifier
if args.classifier == 'bayes':
    classifier = bayes.bayes()
    for instance in cls:
        clazz_ = Clazz(len(training_set[0]) - 1, instance)
        classifier.add_class(clazz_)
elif args.classifier == 'knn':
    classifier = knn.knn(args.kvalue)
else:
    raise RuntimeError

# Precompute data for normalization
statistics = Clazz(len(training_set[0]) - 1, 'training_set')
[statistics.add_match(instance) for instance in training_set]

# Train and classify
[classifier.train(instance) for instance in normalize(training_set)]
[classifier.classify(normalize_instance(instance)) for instance in testing_set]
utils.print_results(classifier.attempts, classifier.hits)
print 'Classification completed'
def main(argv):
    filename = argv[0]
    df = pd.read_csv('../../data/cache_selection/' + filename)
    t = float(argv[1])
    df = df.fillna(0)
    labels = df['label']
    size = 0.33
    X, X_test, y, y_test = train_test_split(df.drop(['label'], axis=1), labels,
                                            stratify=labels, test_size=size,
                                            random_state=1)
    X = X.drop(['query', 'TrainFreq', 'TestFreq', '2', '100'], axis=1)
    test_queries = X_test['query']
    subset_mrr = X_test['2']
    db_mrr = X_test['100']
    test_freq = X_test['TestFreq']
    X_test = X_test.drop(['TrainFreq', 'TestFreq', 'query', '2', '100'], axis=1)
    ql = subset_mrr.copy()
    ql_pred = X_test['ql_c'] < X_test['ql_c.1']
    ql.loc[ql_pred == 1] = db_mrr[ql_pred == 1]
    # print(df.corr()['label'].sort_values())
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0], np.sum(y_test)))
    print("ones ratio in train set = %.2f" % (100 * np.sum(y) / y.shape[0]))
    print("ones ratio in test set = %.2f" % (100 * np.sum(y_test) / y_test.shape[0]))

    # learn the model
    # sc = StandardScaler().fit(X)
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test, y_test))
    c = np.column_stack((df.columns.values[5:-1], np.round(lr.coef_.flatten(), 2)))
    print(c[c[:, 1].argsort()])
    y_prob = lr.predict_proba(X_test)
    y_pred = (y_prob[:, 1] > t).astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)

    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestFreq'] = test_freq
    output['2'] = subset_mrr
    output['100'] = db_mrr
    output['Label'] = y_test
    output['ql'] = ql
    output['ql_label'] = ql
    ml = subset_mrr.copy()
    ml.loc[y_pred == 1] = db_mrr[y_pred == 1]
    output['ml'] = ml
    output['Pred'] = pd.Series(y_pred, index=output.index)
    best = subset_mrr.copy()
    print(best.mean())
    best[y_test == 1] = db_mrr[y_test == 1]
    print(best.mean())
    output['best'] = best
    r = np.random.randint(0, 2, output['2'].size)
    output['rand'] = output['2'].copy()
    output.loc[r == 1, 'rand'] = output['100'][r == 1]
    analyze(output, '2', '100', 'TestFreq')
    if argv[2]:
        output.to_csv('%s%s_result.csv' % ('../../data/python_data/', filename[:-4]),
                      index=False)
                GOOD_SCRIPTS[netid] = script
            except:  # AssertionError and other exceptions
                print('-' * 80)
                print("'", script, "'", sep='')
                print('Output of %s' % netid)
                print(output)
                print('=' * 80)
                print('Code of %s' % netid)
                with open(script, 'r') as f:
                    print(f.read())
                self.students[netid] = 0
                BAD_SCRIPTS[netid] = script

    def tearDown(self):
        global STUDENTS
        STUDENTS = self.students


if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromTestCase(TestAssignment)
    runner = unittest.TextTestRunner()
    runner.run(suite)
    print_results(STUDENTS)
    check_scripts(GOOD_SCRIPTS)
    # for netid in OUTPUTS:
    #     print('[netid]', netid)
    #     print(OUTPUTS[netid])
    print('*' * 80)
    check_scripts(BAD_SCRIPTS)