Example #1
def benchmark(client_type, brokers, topic, num_messages, msg_size, num_runs):
    payload = b"m" * msg_size

    if client_type == 'producer':
        client = KafkaProducer(bootstrap_servers=brokers)
        benchmark_fn = _produce
    elif client_type == 'consumer':
        client = KafkaConsumer(topic,
                               bootstrap_servers=brokers,
                               group_id=str(uuid.uuid1()),
                               auto_offset_reset="earliest")
        client.subscribe([topic])
        benchmark_fn = _consume
    else:
        raise ValueError("client_type must be 'producer' or 'consumer'")

    print(f"Starting benchmark for Kafka-Python {client_type}.")

    run_times = []
    for _ in range(num_runs):
        run_start_time = time.time()
        benchmark_fn(client, topic, payload, num_messages)
        run_time_taken = time.time() - run_start_time
        run_times.append(run_time_taken)

    utils.print_results(f"Kafka-Python {client_type}", run_times, num_messages,
                        msg_size)
def train_lstm():

    # Create symbolic vars
    x = Input(shape=(None, in_dim), dtype='float32', name='input')

    # Create network
    # fw_cell = LSTM(hidden_units_size, return_sequences=False,
    #                implementation=2)(x)
    fw_cell = CuDNNLSTM(hidden_units_size, return_sequences=False)(x)

    h3 = Dense(classes, activation='softmax', use_bias=False)(fw_cell)
    model = Model(inputs=x, outputs=h3)
    validate_lstm_in_out(model)
    start = timer.perf_counter()
    model.compile(optimizer='Adam', loss='categorical_crossentropy')
    end = timer.perf_counter()
    print('>>> Model compilation took {:.1f} seconds'.format(end - start))

    # Print parameter count
    params = model.count_params()
    print('# network parameters: ' + str(params))

    # Start training
    batch_time = []
    batch_loss = []
    train_start = timer.perf_counter()
    for i in range(nb_batches):
        batch_start = timer.perf_counter()
        loss = model.train_on_batch(x=bX,
                                    y=to_categorical(bY, num_classes=classes))
        batch_end = timer.perf_counter()
        batch_time.append(batch_end - batch_start)
        batch_loss.append(loss)
        train_end = timer.perf_counter()
    print_results(batch_loss, batch_time, train_start, train_end)
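train_lstm() above reads several module-level names. A hypothetical setup, assuming timer is the time module (which makes the timer.perf_counter() calls valid) and random data; the shapes and sizes are illustrative, not from the source, and CuDNNLSTM is a TF1-era layer that additionally requires a GPU:

import time as timer  # matches the timer.perf_counter() calls above
import numpy as np
from tensorflow.compat.v1.keras.layers import Input, Dense, LSTM, CuDNNLSTM
from tensorflow.compat.v1.keras.models import Model
from tensorflow.compat.v1.keras.utils import to_categorical

in_dim, hidden_units_size, classes, nb_batches = 26, 128, 10, 100
bX = np.random.rand(64, 50, in_dim).astype('float32')  # one synthetic batch
bY = np.random.randint(0, classes, size=64)             # integer class labels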
def train_lr(X, y, X_test, y_test, t, col_names=None, sample_weight=None):
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    start = datetime.datetime.now()
    X_test_trans = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    if sample_weight is not None:
        lr.fit(X, y, sample_weight)
    else:
        lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test_trans, y_test))
    if col_names is not None:
        c = np.column_stack((col_names, np.round(lr.coef_.flatten(), 2)))
        sorted_c = c[c[:, 1].argsort()]
        print(sorted_c[:10])
        print(sorted_c[-10:])
    y_prob = lr.predict_proba(X_test_trans)
    end = datetime.datetime.now()
    delta = end - start
    y_pred = y_prob[:, 1] > t
    y_pred = y_pred.astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    print('total time predictions: %f (s)' % delta.total_seconds())
    print('time per query: %f (s)' % (delta.total_seconds() / len(y_pred)))
    false_preds = y_pred != y_test  # test rows the classifier got wrong
    false_vectors = np.multiply(lr.coef_, X_test_trans[false_preds, :])
    print(false_vectors.shape)
    return y_pred
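A minimal, hypothetical invocation of train_lr on synthetic data; the shapes and threshold are illustrative, and numpy, scikit-learn, datetime, and a print_results(y_true, y_pred) helper are assumed importable as in the snippet:

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X_all = rng.random((200, 5))
    y_all = (X_all[:, 0] > 0.5).astype('uint8')
    train_lr(X_all[:150], y_all[:150], X_all[150:], y_all[150:], t=0.5)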
def run_experiment(params):
    tf.reset_default_graph()

    (train_step, cost, accuracy, y_pred_cls, y_true_cls,
     placeholders) = create_network(params['img_size'],
                                    params['num_channels'],
                                    params['num_classes'],
                                    params['num_fc_layer1_output'],
                                    params['learning_rate'])

    saver = tf.train.Saver()
    if not os.path.exists(params['save_dir']):
        os.makedirs(params['save_dir'])

    (train_acc, cost,
     test_acc) = train_network(params['data'], train_step, cost, accuracy,
                               params['num_iterations'],
                               params['train_batch_size'], placeholders, saver,
                               params['save_dir'], params['plot_dir'],
                               params['log_dir'], params['display_step'])

    cls_true, cls_pred, acc = test_network(params['test_batch_size'],
                                           placeholders, saver,
                                           params['save_dir'], accuracy,
                                           y_pred_cls, y_true_cls,
                                           params['data'])

    print_results(cls_pred, cls_true, "", params['plot_dir'])

    return train_acc, cost, test_acc, acc
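An illustrative params dict for run_experiment; the keys mirror the lookups made in the function body, but every value (and the dataset object) is a placeholder assumption:

params = {
    'img_size': 28, 'num_channels': 1, 'num_classes': 10,
    'num_fc_layer1_output': 128, 'learning_rate': 1e-3,
    'num_iterations': 1000, 'train_batch_size': 64, 'test_batch_size': 256,
    'display_step': 100, 'data': None,  # replace with the dataset object
    'save_dir': './checkpoints/', 'plot_dir': './plots/', 'log_dir': './logs/',
}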
Example #5
def train_lr(X, y, X_test, y_test, t, col_names=None, sample_weight=None):
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    start = datetime.datetime.now()
    X_test_trans = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    if sample_weight is not None:
        lr.fit(X, y, sample_weight)
    else:
        lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test_trans, y_test))
    if col_names is not None:
        c = np.column_stack((col_names, np.round(lr.coef_.flatten(), 2)))
        print(c[c[:, 1].argsort()])
    y_prob = lr.predict_proba(X_test_trans)
    end = datetime.datetime.now()
    delta = end - start
    y_pred = y_prob[:, 1] > t
    y_pred = y_pred.astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    print('total time predictions: %f (s)' % delta.total_seconds())
    print('time per query: %f (s)' % (delta.total_seconds() / len(y_pred)))
    return y_pred
Example #6
def main():
    opt_val = 0
    opt_ind = []
    parser = argparse.ArgumentParser()
    parser.add_argument('--DATASET', '-d', type=str, default='b_small')
    parser.add_argument('--METHOD', '-m', type=str, default='recursive_fill_in')
    args = parser.parse_args()

    dataset = 'datasets/' + args.DATASET + '.in'
    method = args.METHOD

    max_slices, slice_numbers = utils.read_data(dataset, display=1)
    mem_start = psutil.virtual_memory()
    time_start = time.time()

    if method == 'brute_force':
        opt_ind, opt_val = solutions.brute_force(slice_numbers, max_slices)
    elif method == 'smart_brute_force':
        opt_ind, opt_val = solutions.smart_brute_force(slice_numbers, max_slices)
    elif method == 'longest_path':
        opt_ind, opt_val = solutions.longest_path(slice_numbers, max_slices)
    elif method == 'recursive_fill_in':
        opt_ind, opt_val = solutions.recursive_fill_in(slice_numbers, max_slices)
    # elif method == "NEW_METHOD":
    #     opt_ind, opt_val = solutions.NEW_METHOD(slice_numbers, max_slices)

    time_end = time.time()
    runtime = time_end - time_start
    mem_end = psutil.virtual_memory()
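    # NOTE: virtual_memory().wired is reported only on macOS/BSD; on Linux,
    # .used would be the closest analogue. The 1e-6 factor converts bytes to MB.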
    memory_used = abs(mem_end.wired - mem_start.wired) * 1e-6

    utils.print_results(opt_val, opt_ind, method, runtime, memory_used, max_slices, slice_numbers)
async def _producer_benchmark(brokers, topic, num_messages, msg_size, num_runs):
    payload = bytearray(b"m" * msg_size)
    producer_config = dict(
        bootstrap_servers=brokers,
    )

    loop = asyncio.get_event_loop()
    producer = AIOKafkaProducer(loop=loop, **producer_config)
    await producer.start()

    print("Starting benchmark for AIOKafka Producer.")
    run_times = []

    try:
        for _ in range(num_runs):
            run_start_time = time.time()
            await _produce(producer, topic, payload, num_messages)
            run_time_taken = time.time() - run_start_time
            run_times.append(run_time_taken)
    except asyncio.CancelledError:
        pass
    finally:
        await producer.stop()

    utils.print_results(
        "AIOKafka Producer", run_times, num_messages, msg_size
    )
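A hypothetical entry point for the coroutine above; the broker address, topic, and sizes are illustrative, and aiokafka must be installed with the broker reachable:

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(
        _producer_benchmark("localhost:9092", "benchmark-topic",
                            num_messages=10_000, msg_size=100, num_runs=5))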
Example #8
def main(args):
    img = Image.open(args.img_path)
    if args.has_reference != 'False':
        ref_img = Image.open(args.has_reference)

    # begin processing
    if args.method == 'HE':
        model = HE()
    elif args.method == 'Gamma':
        model = Gamma(args.gamma)
    elif args.method == 'Gray_World':
        model = Gray_World(args.gamma)
    elif args.method == 'Retinex':
        model = Retinex(args)
    elif args.method == 'Max_RGB':
        model = Max_RGB(args.gamma)
    elif args.method == 'DeHaze':
        model = DeHaze(args.omega, args.kernel_size, args.model)
    elif args.method == 'LIME':
        model = LIME(args.gamma, args.alpha, args.sigma, args.kernel_size)
    else:
        raise ValueError('unknown method: %s' % args.method)

    pro_img = model.run(img)  # process the image

    # print out the results
    if args.has_reference == 'False':
        show_results(img, pro_img)
        print_results(args, img, pro_img)
    else:
        show_results(img, pro_img, ref_img)
        print_results(args, img, pro_img, ref_img)

    return
Example #9
def run():
    start_time = get_start_time()

    (n_people_to_compare_with, max_len_closest_matches, image_name,
     person_count, image_path) = get_user_inputs()

    file_count = get_number_of_pics_to_compare_with(n_people_to_compare_with,
                                                    lfw_path, person_count,
                                                    image_name)

    original_image_encodings = get_image_encodings(image_path)

    closest_matches_sorted = compare_with_other_images(
        lfw_path, file_count, original_image_encodings, image_name,
        max_len_closest_matches, start_time, n_people_to_compare_with)

    print_results(closest_matches_sorted, image_path)

    output_results_csv(closest_matches_sorted, start_time, image_name,
                       max_len_closest_matches, file_count,
                       n_people_to_compare_with)

    output_results_image(closest_matches_sorted, start_time, image_name,
                         max_len_closest_matches, file_count,
                         n_people_to_compare_with)
async def _consumer_benchmark(brokers, topic, num_messages, msg_size, num_runs):
    loop = asyncio.get_event_loop()
    consumer = AIOKafkaConsumer(
        topic, group_id=str(uuid.uuid1()),
        auto_offset_reset="earliest",
        enable_auto_commit=False,
        loop=loop
    )

    await consumer.start()

    print("Starting benchmark for AIOKafka Consumer.")
    run_times = []

    try:
        for _ in range(num_runs):
            run_start_time = time.time()
            await _consume(consumer, num_messages)
            run_time_taken = time.time() - run_start_time
            run_times.append(run_time_taken)
    except asyncio.CancelledError:
        pass
    finally:
        await consumer.stop()

    utils.print_results(
        "AIOKafka Consumer", run_times, num_messages, msg_size
    )
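A matching hypothetical runner for the consumer coroutine; it assumes the topic was already populated (e.g. by the producer benchmark) so there are messages to read:

if __name__ == "__main__":
    asyncio.get_event_loop().run_until_complete(
        _consumer_benchmark("localhost:9092", "benchmark-topic",
                            num_messages=10_000, msg_size=100, num_runs=5))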
Example #11
def video_process(video_path, pm_model, save_video_flag):
    video_thread = VideoThread(video_path, 1280, 960, 1, 'video thread')
    video_thread.start()
    serial_thread = SerialThread('serial thread')
    serial_thread.start()
    init_flag = True

    while True:
        frame_read = video_thread.get_image()

        if frame_read is None:
            print('Failed to read a video frame!')
            break

        # if init_flag and save_video_flag:
        #     # in video mode, write the detections out as a video file
        #     save_name = 'save_video.avi'
        #     print('Saving video to ' + save_name)
        #     out_video = cv2.VideoWriter(save_name, cv2.VideoWriter_fourcc(*"MJPG"), 10.0,
        #                                 (frame_read.shape[1], frame_read.shape[0]))
        #     init_flag = False
        if init_flag:
            init_flag = False
            continue
        # [class id, confidence, center point, top-left corner, bottom-right corner]
        boxes = pm_model.predict(frame_read)
        print_results(boxes, pm_model.label_names, init_flag)
        draw_results(frame_read, boxes, pm_model.colors, pm_model.label_names, False)
        serial_thread.set_data(boxes)
Example #12
    def classify(self, instance):
        self.attempts += 1
        self.clear_neighbors()

        for cur_inst in self.instances:
            distance = self.euclidean_distance(cur_inst, instance)
            if len(self.neighbors) < self.k or distance < self.max_neighbor_distance():
                new_neighbor = dict()
                new_neighbor['distance'] = distance
                new_neighbor['instance'] = cur_inst
                self.add_neighbor(new_neighbor)

        distribution = dict()
        for neighbor in self.neighbors:
            label = neighbor['instance'][-1]
            distribution[label] = distribution.get(label, 0) + 1

        max_count = 0
        value = ''

        for label, count in distribution.items():
            if count > max_count:
                max_count = count
                value = label

        if value == instance[-1]:
            self.hits += 1
            self.train(instance)

        if not self.attempts % 100:
            utils.print_results(self.attempts, self.hits)
def test_monza():
    print('MONZA TEST:')
    start = np.array([0.5, 1.0, 4.9])
    goal = np.array([3.8, 1.0, 0.1])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/monza.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/monza.txt')
def test_room():
    print('ROOM TEST:')
    start = np.array([1.0, 5.0, 1.5])
    goal = np.array([9.0, 7.0, 1.5])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/room.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/room.txt')
def test_tower():
    print('TOWER TEST:')
    start = np.array([2.5, 4.0, 0.5])
    goal = np.array([4.0, 2.5, 19.5])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/tower.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/tower.txt')
def test_window():
    print('WINDOW TEST:')
    start = np.array([0.2, -4.9, 0.2])
    goal = np.array([6.0, 18.0, 3.0])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/window.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/window.txt')
def test_maze():
    print('MAZE TEST:')
    start = np.array([0.0, 0.0, 1.0])
    goal = np.array([12.0, 12.0, 5.0])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/maze.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/maze.txt')
def test_single_cube():
    print('SINGLE_CUBE TEST:')
    start = np.array([2.3, 2.3, 1.3])
    goal = np.array([7.0, 7.0, 6.0])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/single_cube.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/single_cube.txt')
def test_flappy_bird():
    print('FLAPPY_BIRD TEST:')
    start = np.array([0.5, 2.5, 5.5])
    goal = np.array([19.0, 2.5, 5.5])
    success, numofmoves, distance, total_time, max_time = runtest(
        './maps/flappy_bird.txt', start, goal, True)
    print_results(success, numofmoves, distance, total_time, max_time,
                  './results/flappy_bird.txt')
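A hypothetical driver that runs all the map tests above in sequence; it only assumes the ./maps/*.txt inputs and a ./results/ directory exist in the working directory:

if __name__ == '__main__':
    for test in (test_single_cube, test_maze, test_window, test_tower,
                 test_room, test_monza, test_flappy_bird):
        test()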
Example #20
    def train_epoch(self, epoch):
        """
        Train the model for one epoch on the train set.
        """
        t1 = time()
        output = {
            'tp': [],
            'fp': [],
            'fn': [],
            'tn': [],
            'loss': [],
            'preds': []
        }
        train_info = []

        self.model = self.model.train()
        train_iter = self.iterator(self.data['train'],
                                   batch_size=self.params['batch'],
                                   shuffle_=self.params['shuffle_data'])
        self.optimizer.zero_grad()
        for batch_idx, batch in enumerate(train_iter):
            batch = self.convert_batch(batch)

            with autograd.detect_anomaly():
                loss, stats, predictions, select = self.model(batch)
                loss.backward()  # backward computation

            output['loss'] += [loss.item()]
            output['tp'] += [stats['tp'].to('cpu').data.numpy()]
            output['fp'] += [stats['fp'].to('cpu').data.numpy()]
            output['fn'] += [stats['fn'].to('cpu').data.numpy()]
            output['tn'] += [stats['tn'].to('cpu').data.numpy()]
            output['preds'] += [predictions.to('cpu').data.numpy()]
            train_info += [
                batch['info'][select[0].to('cpu').data.numpy(),
                              select[1].to('cpu').data.numpy(),
                              select[2].to('cpu').data.numpy()]
            ]
            # Accumulate gradients (by Yuwei Xu)
            if (batch_idx + 1) % self.accumulation_steps == 0:
                nn.utils.clip_grad_norm_(self.model.parameters(),
                                         self.gc)  # gradient clipping
                self.optimizer.step()
                self.optimizer.zero_grad()
        t2 = time()
        if self.window:
            total_loss, scores = self.subdocs_performance(
                output['loss'], output['preds'], train_info)
        else:
            total_loss, scores = self.performance(output)

        self.train_res['loss'] += [total_loss]
        self.train_res['score'] += [scores[self.primary_metric]]
        print('Epoch: {:02d} | TRAIN | LOSS = {:.05f}, '.format(
            epoch, total_loss),
              end="")
        print_results(scores, [], self.show_class, t2 - t1)
Example #21
    def print_results(self, dtc_accuracy, rfc_accuracy):
        text = 'Experiment #{}\n' \
               'Training size = {:0.0f}%\n' \
               'Decision Tree Classifier accuracy = {:0.2f}%\n' \
               'Random Forest Classifier accuracy = {:0.2f}%\n\n'.format(self.num,
                                                                         self.data.train_size * 100,
                                                                         dtc_accuracy * 100,
                                                                         rfc_accuracy * 100)
        print_results(text)
Example #22
def _run_ml(df, n_runs, mb, ml_keys, ml_score_keys, backend):
    ml_scores, ml_times = ml(
        df=df, n_runs=n_runs, mb=mb, ml_keys=ml_keys, ml_score_keys=ml_score_keys
    )
    print_results(results=ml_times, backend=backend, unit="s")
    ml_times["Backend"] = backend
    print_results(results=ml_scores, backend=backend)
    ml_scores["Backend"] = backend
    return ml_times
Example #23
    def testing_simple(self):

        X = np.array([[1, 3], [2, 3], [3, 2], [3, 1], [1, 1], [2, 2]])
        y = np.array([0, 0, 0, 0, 1, 1])
        kc = KClustering()
        kc.fit(X, y)

        utils.print_results(kc.predict(X), y)

        self.plot_results(X, y)
Example #24
def main(data_path="data/features/", out_path="data/models/svc/"):
    X_train, X_test, y_train, y_test = read_data(data_path)

    name = "LinearSVC"
    params = read_params("params.yaml", "svc")
    model = LinearSVC(**params)
    model.fit(X_train, y_train)

    accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test)
    print_results(accuracy, c_matrix, name)

    save_results(out_path, model, fig)
    log_experiment(out_path,
                   metrics=dict(accuracy=accuracy, confusion_matrics=c_matrix))
Example #25
def main(data_path='data/features/', out_path='data/models/logistic/'):
    X_train, X_test, y_train, y_test = read_data(data_path)

    name = 'LogisticRegression'
    params = read_params('params.yaml', 'logistic')
    model = LogisticRegression(**params)
    model.fit(X_train, y_train)

    accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test)
    print_results(accuracy, c_matrix, name)

    save_results(out_path, model, fig)
    log_experiment(out_path,
                   params=params,
                   metrics=dict(accuracy=accuracy, confusion_matrics=c_matrix))
Example #26
def run(name):
    global x_train
    global y_train
    global x_dev
    global y_dev
    x_train, y_train = shuffle(x_train, y_train, random_state=1)
    x_dev, y_dev = shuffle(x_dev, y_dev, random_state=1)
    weights = model.train(x_train, y_train)
    print("validating: " + name)
    model.validate(weights, x_dev, y_dev)
    print("writing test predictions for " + name)
    results = model.get_test_results(x_test, weights)

    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
    pred_path = OUTPUT_DIR + "/test." + name + ".pred"
    utils.print_results(results, pred_path)
Example #27
def main():
    raw_data = pnd.read_table('../data/segmentation.test',
                              sep=',',
                              header=None,
                              lineterminator='\n')
    data = Data(raw_data, 0.7)

    bayes_sklearn_accuracy, bayes_native_accuracy = test_bayes(data)
    sklearn_kn_clf_accuracy, my_kn_clf_accuracy = test_knn(data, 10)

    text = 'bayes,{:0.4f}%,{:0.4f}%\n' \
           'knn,{:0.4f}%,{:0.4f}%\n'.format(bayes_sklearn_accuracy,
                                            bayes_native_accuracy,
                                            sklearn_kn_clf_accuracy,
                                            my_kn_clf_accuracy)

    print_results(text)
Example #28
def main(data_path='data/features/',
         out_path='data/models/r_forrest/',
         n_estimators=10,
         max_samples=30):
    X_train, X_test, y_train, y_test = read_data(data_path)

    name = 'RandomForrest'
    params = read_params('params.yaml', 'forrest')
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test)
    print_results(accuracy, c_matrix, name)

    save_results(out_path, model, fig)
    log_experiment(out_path,
                   params=params,
                   metrics=dict(accuracy=accuracy, confusion_matrics=c_matrix))
Example #29
    def classify(self, instance):
        """Classify a record from a testing set and show results"""

        self.attempts += 1
        max_probability = 0
        class_value = ''
        for clazz in self.classes:
            probability = clazz.class_probability(instance[0:-1])
            # print('Probability for outcome ' + clazz.value + ' is ' + str(probability))
            if probability > max_probability:
                max_probability = probability
                class_value = clazz.value

        if class_value == instance[-1]:
            self.hits += 1

        if not self.attempts % 100:
            utils.print_results(self.attempts, self.hits)
Example #30
def main(data_path='data/features/',
         model_path='data/models/',
         out_path='data/models/ensemble/'):
    X_train, X_test, y_train, y_test = read_data(data_path)

    name = 'Ensemble'
    params = read_params('params.yaml', 'ensemble')
    cl1 = load_model(f'{model_path}/logistic/')
    cl2 = load_model(f'{model_path}/svc/')
    cl3 = load_model(f'{model_path}/r_forrest/')
    estimators = [('l_regression', cl1), ('l_svc', cl2), ('r_forrest', cl3)]

    model = VotingClassifier(estimators, **params)
    model.fit(X_train, y_train)

    accuracy, c_matrix, fig = evaluate_model(model, X_test, y_test)
    print_results(accuracy, c_matrix, name)

    save_results(out_path, model, fig)
    log_experiment(out_path,
                   metrics=dict(accuracy=accuracy, confusion_matrics=c_matrix))
def test():
    m1, m2, m3, mt = ([-4, 2], [0, 5], [4, 2], [0, 0])
    C = np.array([[1, 0], [0, 1]])

    n_samples = 1000

    X1 = np.random.multivariate_normal(m1, C, n_samples)
    X2 = np.random.multivariate_normal(m2, C, n_samples)
    X3 = np.random.multivariate_normal(m3, C, n_samples)
    Xt = np.random.multivariate_normal(mt, C, n_samples)

    X = np.vstack((X1, X2, X3, Xt))
    y = np.concatenate((np.zeros(3 * n_samples), np.ones(n_samples)))

    kc = KClustering()

    kc.fit(X, y, plot=True)

    f = kc.predict(X)
    print('## Prediction Results ##')
    utils.print_results(f, y)
def main(argv):
    filename = argv[0]
    t = float(argv[1])
    size = 0.33
    df = pd.read_csv('../../data/cache_selection/' + filename)
    labels = df['Label']
    df = df.drop(['Id', 'Label'], axis=1)
    #print(df.corr()['Label'].sort_values())
    X, X_test, y, y_test = train_test_split(df, labels, stratify=labels,
                                            test_size=size, random_state=1)
    X = X.drop(['TestViewCount', 'Query', '18', '100'], axis=1)
    vc = X_test['TestViewCount']
    test_queries = X_test['Query']
    q18 = X_test['18']
    q100 = X_test['100']
    X_test = X_test.drop(['TestViewCount', 'Query', '18', '100'], axis=1)
    ql = q18.copy()
    ql_pred = X_test['ql_t'] < X_test['ql_t.1']
    ql.loc[ql_pred == 1] = q100[ql_pred == 1]
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0], np.sum(y_test)))
    print("onez ratio in trian set =  %.2f" % (100 * np.sum(y) / y.shape[0]))
    print("onez ratio in test set =  %.2f" % (100 * np.sum(y_test) / y_test.shape[0]))
    # learn the model
    #sc = StandardScaler().fit(X)
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training LR..")
    lr = linear_model.LogisticRegression()
    lr.fit(X, y)
    # lr.fit(X, y, sample_weight=vcx)
    print("train/test mean accuracy = %.2f, %.2f" %
          (lr.score(X, y), lr.score(X_test, y_test)))
    y_pred = lr.predict(X_test)
    print_results(y_test, y_pred)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    #lr.fit(X, y, sample_weight=vcx)
    print("train/test mean accuracy = %.2f, %.2f" %
          (lr.score(X, y), lr.score(X_test, y_test)))
    #c = np.column_stack((df.columns.values[1:-1], np.round(lr.coef_.flatten(),2)))
    #print(c[c[:,1].argsort()])
    start = datetime.datetime.now()
    y_prob = lr.predict_proba(X_test)
    y_pred = y_prob[:, 1] > t
    y_pred = y_pred.astype('uint8')
    print(y_pred.shape)
    end = datetime.datetime.now()
    print('--- results:')
    print_results(y_test, y_pred)
    delta = end - start
    print('total time: %f' % delta.total_seconds())
    print('time per query: %f' % (delta.total_seconds() / len(y_pred)))
    print('test size (distinct): % d' % y_pred.size)
    print('test size (all): % d' % vc.sum())
    ones = vc * y_pred
    print('ones ratio: %.2f' % (ones.sum() / y_pred.size))
    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestViewCount'] = vc
    output['Label'] = y_test
    output['Pred'] = pd.Series(y_pred, index=output.index)
    output['18'] = q18
    output['100'] = q100
    output['ql'] = ql
    output['ml'] = output.apply(f, axis=1)
    output['best'] = output.apply(g, axis=1)
    r = np.random.randint(0, 2, q18.size)
    output['rand'] = q18.copy()
    output.loc[r == 1, 'rand'] = q100[r == 1]
    if argv[2]:
        output.to_csv('../../data/python_data/%s_result.csv' %
                      filename[:-4], index=False)
Example #33

    """
    # Use Pipl to match remainder
    for dev in [dev for dev in devs if dev['email']]:
        if dev['li_matches']:
            if dev['li_matches'][0]['score'] < 75:
                print("Trying piplsearch for %s..." % dev.get('name'))
                result = try_piplsearch(dev)
                if result:
                    dev['li_matches'].append(result)
                    # re-sort from highest scoring match to lowest
                    dev['li_matches'] = sorted(dev['li_matches'],
                                               key=lambda k: k['score'],
                                               reverse=True)
        else:
            print("Trying piplsearch for %s..." % dev.get('name'))
            result = try_piplsearch(dev)
            if result:
                dev['li_matches'].append(result)
                # re-sort from highest scoring match to lowest
                dev['li_matches'] = sorted(dev['li_matches'],
                                           key=lambda k: k['score'],
                                           reverse=True)
    """

    # See the results of running through both algos
    #devs = utils.load_json('scoredresults')
    utils.print_results(devs)

    #utils.sanity_check(devs)

Example #34
    else:
        offset = get_split_offset(args.percentage, training_set)
        testing_set = training_set[offset:]
        training_set = training_set[:offset]
        print('Testing set is derived from training set')
    print('Training set: %d instances, testing set: %d instances' %
          (len(training_set), len(testing_set)))

    # Choose classifier
    if args.classifier == 'bayes':
        classifier = bayes.bayes()
        for instance in cls:
            clazz_ = Clazz(len(training_set[0]) - 1, instance)
            classifier.add_class(clazz_)
    elif args.classifier == 'knn':
        classifier = knn.knn(args.kvalue)
    else:
        raise RuntimeError('unknown classifier: %s' % args.classifier)

    # Precompute data for normalization
    statistics = Clazz(len(training_set[0]) - 1, 'training_set')
    [statistics.add_match(instance) for instance in training_set]

    # Train and classify
    [classifier.train(instance) for instance in normalize(training_set)]
    [classifier.classify(normalize_instance(instance)) for instance in testing_set]
    utils.print_results(classifier.attempts, classifier.hits)
    print('Classification completed')



def main(argv):
    filename = argv[0]
    df = pd.read_csv('../../data/cache_selection/' + filename)
    t = float(argv[1])
    df = df.fillna(0)
    labels = df['label']
    size = 0.33
    X, X_test, y, y_test = train_test_split(df.drop(['label'], axis=1), labels, stratify=labels,
                                            test_size=size, random_state=1)
    X = X.drop(['query', 'TrainFreq', 'TestFreq', '2', '100'], axis=1)
    test_queries = X_test['query']
    subset_mrr = X_test['2']
    db_mrr = X_test['100']
    test_freq = X_test['TestFreq']
    X_test = X_test.drop(['TrainFreq', 'TestFreq', 'query', '2', '100'], axis=1)
    ql = subset_mrr.copy()
    ql_pred = X_test['ql_c'] < X_test['ql_c.1']
    ql.loc[ql_pred == 1] = db_mrr[ql_pred == 1]
    #print(df.corr()['label'].sort_values())
    print("train set size and ones: %d, %d" % (y.shape[0], np.sum(y)))
    print("test set size and ones: %d, %d" % (y_test.shape[0], np.sum(y_test)))
    print("onez ratio in trian set =  %.2f" % (100 * np.sum(y) / y.shape[0]))
    print("onez ratio in test set =  %.2f" % (100 * np.sum(y_test) / y_test.shape[0]))
    # learn the model
    #sc = StandardScaler().fit(X)
    sc = MinMaxScaler().fit(X)
    X = sc.transform(X)
    X_test = sc.transform(X_test)
    print("training balanced LR..")
    lr = linear_model.LogisticRegression(class_weight='balanced')
    lr.fit(X, y)
    print("training mean accuracy = %.2f" % lr.score(X, y))
    print("testing mean accuracy = %.2f" % lr.score(X_test, y_test))
    c = np.column_stack((df.columns.values[5:-1], np.round(lr.coef_.flatten(), 2)))
    print(c[c[:, 1].argsort()])
    y_prob = lr.predict_proba(X_test)
    y_pred = y_prob[:, 1] > t
    y_pred = y_pred.astype('uint8')
    print('--- t = %.2f results:' % t)
    print_results(y_test, y_pred)
    output = pd.DataFrame()
    output['Query'] = test_queries
    output['TestFreq'] = test_freq
    output['2'] = subset_mrr
    output['100'] = db_mrr
    output['Label'] = y_test
    output['ql'] = ql
    output['ql_label'] = ql
    ml = subset_mrr.copy()
    ml.loc[y_pred == 1] = db_mrr[y_pred == 1]
    output['ml'] = ml
    output['Pred'] = pd.Series(y_pred, index=output.index)
    best = subset_mrr.copy()
    print(best.mean())
    best[y_test == 1] = db_mrr[y_test == 1]
    print(best.mean())
    output['best'] = best
    r = np.random.randint(0, 2, output['2'].size)
    output['rand'] = output['2'].copy()
    output.loc[r == 1, 'rand'] = output['100'][r == 1]
    analyze(output, '2', '100', 'TestFreq')
    if argv[2]:
        output.to_csv('%s%s_result.csv' % ('../../data/python_data/',
                                           filename[:-4]), index=False)
Example #36
                        GOOD_SCRIPTS[netid] = script
                    except:
                        # AssertionError and other exceptions
                        print('-' * 80)
                        print("'", script, "'", sep='')
                        print('Output of %s' % netid)
                        print(output)
                        print('=' * 80)
                        print('Code of %s' % netid)
                        with open(script, 'r') as f:
                            print(f.read())
                        self.students[netid] = 0
                        BAD_SCRIPTS[netid] = script

    def tearDown(self):
        global STUDENTS
        STUDENTS = self.students


if __name__ == '__main__':
    suite = unittest.TestLoader().loadTestsFromTestCase(TestAssignment)
    runner = unittest.TextTestRunner()
    runner.run(suite)
    print_results(STUDENTS)
    check_scripts(GOOD_SCRIPTS)
    #for netid in OUTPUTS:
        #print('[netid]', netid)
        #print(OUTPUTS[netid])
    print('*' * 80)
    check_scripts(BAD_SCRIPTS)