Example #1
def show_training_set_histogram(path, config):
    """Plot a histogram of the training-set column named in config['histogram']['column'].

    Note: `trainingset` and `plot_hist` are project-local helpers; their imports
    are not shown in these snippets.
    """
    column_name = config['histogram']['column']
    with open(path) as fp:
        hist_values = []
        header = trainingset.normalize_header(next(fp))  # first line is the header
        for line in fp:
            row = trainingset.TrainingSetRow(trainingset.normalize_line(line),
                                             header)
            hist_values.append(float(row.get(column_name)))
        plot_hist(hist_values, config, column_name)
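A minimal way to call this, assuming a config whose 'histogram' section names a column that appears in the training-set header. The path and column name below are placeholders, and plot_hist may read further keys from the config that are not visible here:

config = {'histogram': {'column': 'bm25_score'}}
show_training_set_histogram('training_set.txt', config)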
Example #2
import matplotlib.pyplot as plt


def show_scatter_2d_plot(path, config):
    """Scatter-plot two training-set columns chosen in config['scatter']."""
    with open(path) as fp:
        header = trainingset.normalize_header(next(fp))
        x_key = config['scatter']['x_axis']
        y_key = config['scatter']['y_axis']
        x_data = []
        y_data = []
        for line in fp:
            row = trainingset.TrainingSetRow(trainingset.normalize_line(line),
                                             header)
            x_data.append(float(row.get(x_key)))
            y_data.append(float(row.get(y_key)))
        plt.plot(x_data, y_data, '.', label='observation')
        plt.xlabel(x_key)
        plt.ylabel(y_key)
        plt.legend()
        plt.show()
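Calling the 2-D variant follows the same pattern; a sketch, with column names borrowed from the header shown in Example #4 below (both columns must hold values that float() accepts):

config = {'scatter': {'x_axis': 'on_sale', 'y_axis': 'bm25_score'}}
show_scatter_2d_plot('training_set.txt', config)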
Example #3
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection on older Matplotlib)


def show_scatter_3d_plot(path, config):
    """Scatter-plot three training-set columns chosen in config['scatter']."""
    with open(path) as fp:
        header = trainingset.normalize_header(next(fp))
        x_key = config['scatter']['x_axis']
        y_key = config['scatter']['y_axis']
        z_key = config['scatter']['z_axis']
        x_data = []
        y_data = []
        z_data = []
        for line in fp:
            row = trainingset.TrainingSetRow(trainingset.normalize_line(line),
                                             header)
            x_data.append(float(row.get(x_key)))
            y_data.append(float(row.get(y_key)))
            z_data.append(float(row.get(z_key)))

        fig = plt.figure()
        # fig.gca(projection='3d') is no longer supported in recent Matplotlib;
        # add_subplot is the supported way to get a 3D axes.
        ax = fig.add_subplot(projection='3d')
        ax.scatter(x_data, y_data, z_data, label='observations')
        ax.set_xlabel(x_key)
        ax.set_ylabel(y_key)
        ax.set_zlabel(z_key)
        ax.legend()
        plt.show()
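The 3-D variant reads the same 'scatter' section plus a 'z_axis' key; the third column name here is purely hypothetical:

config = {'scatter': {'x_axis': 'on_sale', 'y_axis': 'bm25_score',
                      'z_axis': 'click_count'}}
show_scatter_3d_plot('training_set.txt', config)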
Example #4
def setUp(self):
    # Test fixture: builds a TrainingSetRow from one sample line and its header.
    super().setUp()
    header = '#0->on_sale 1->bm25_score'
    line = '2 qid:0 0:0 1:4.1 #red jeans->1120'
    self.test_row = trainingset.TrainingSetRow(line, header)
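The fixture above hints at the on-disk format: an SVMlight/LETOR-style line (relevance label, qid:..., index:value feature pairs, then a free-form comment) plus a header comment mapping feature indices to names. A hedged sketch of reading the row back, assuming get() resolves the names declared in the header (the exact return types of get() and get_as_dict() are not shown in these snippets):

row = trainingset.TrainingSetRow('2 qid:0 0:0 1:4.1 #red jeans->1120',
                                 '#0->on_sale 1->bm25_score')
bm25 = float(row.get('bm25_score'))  # would be 4.1 if names map to feature values
doc = row.get_as_dict()              # dict form, as indexed into Solr in Example #5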
Example #5
import json


if __name__ == "__main__":
    with open('config.json') as json_data_file:
        config = json.load(json_data_file)
        url = get_solr_url(config)
        batch_size = config['indexing']['batch_size']
        click_stream_file_path = config['file']['click_stream']
        training_set_file_path = config['file']['training_set']

        with open(training_set_file_path) as training_set_file:
            click_data_dict = read_click_data_dict(click_stream_file_path)
            header = trainingset.normalize_header(next(training_set_file))
            docs_batch = []
            indexed = 0
            batch_num = 0
            for line in training_set_file:
                row = trainingset.TrainingSetRow(
                    trainingset.normalize_line(line), header)
                doc_for_index = row.get_as_dict()
                qd_key = row.get_qd_pair()
                if qd_key in click_data_dict:
                    # Join click-stream features onto the training-set row.
                    doc_for_index.update(click_data_dict[qd_key])
                else:
                    print('Not joined by key for={}'.format(qd_key))
                    continue
                docs_batch.append(doc_for_index)
                indexed += 1
                if len(docs_batch) >= batch_size:
                    index_docs(docs_batch, url)
                    docs_batch.clear()
                    batch_num += 1
                    print('Batch #{}. Total indexed {} docs.'.format(
                        batch_num, indexed))
            # Flush any remaining docs that did not fill a whole batch.
            if docs_batch:
                index_docs(docs_batch, url)
                print('Final batch. Total indexed {} docs.'.format(indexed))
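The keys this script reads suggest a config.json shaped roughly like the dict below (all concrete values are placeholders, and whatever get_solr_url() looks up, for example a Solr host or collection name, is not visible in this snippet):

# Sketch of config.json contents, written as the dict json.load() would return.
config = {
    'indexing': {'batch_size': 500},         # placeholder batch size
    'file': {
        'click_stream': 'click_stream.csv',  # placeholder paths
        'training_set': 'training_set.txt',
    },
    # The 'histogram' and 'scatter' sections used in Examples #1-#3 would sit here too.
}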