Example 1
def upload_sequences():
    def allowed_file(filename):
        return '.' in filename and \
                filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

    query_file = check_exists(request.files.get('queryFile'), 'queryFile')
    has_name_col = request.form.get('hasNameCol', False, type=bool)
    if query_file.filename == '':
        raise FileError('No file selected')

    if query_file and allowed_file(query_file.filename):
        query_file.save(UPLOAD_PATH)
        # TODO: limit the size of the uploaded set
        load_details = pygenex.loadDataset('upload',
                                           UPLOAD_PATH,
                                           hasNameCol=has_name_col)
        pygenex.normalize('upload')
        allTimeSeries = get_names_lengths_thumbnails('upload',
                                                      load_details['count'])
        for i in range(load_details['count']):
            series = pygenex.getTimeSeries('upload', i)
            allTimeSeries[i]['raw'] = attach_index(series)

        pygenex.unloadDataset('upload')
        os.remove(UPLOAD_PATH)
        return jsonify(allTimeSeries)

    raise FileError('Invalid file type')
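
The handler relies on module-level helpers (check_exists, attach_index, get_names_lengths_thumbnails, FileError) and constants that are not shown here. A minimal sketch of how it might be wired into a Flask app; the route path and the ALLOWED_EXTENSIONS/UPLOAD_PATH values are illustrative assumptions, not taken from the original project:

import os
from flask import Flask

app = Flask(__name__)

# Hypothetical values; the original module defines its own.
ALLOWED_EXTENSIONS = {'csv', 'txt', 'tsv'}
UPLOAD_PATH = os.path.join('/tmp', 'genex_upload')

# The handler above also expects `from flask import request, jsonify`.
@app.route('/upload', methods=['POST'])
def upload_route():
    # Delegate to the handler; a FileError would be mapped to an error
    # response by an application-level error handler in the real app.
    return upload_sequences()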
Example 2
    def _prune(self):
        old_cache = self._cache.copy()
        super(GenexCache, self)._prune()
        for k in old_cache:
            if isinstance(k, tuple) and k not in self._cache:
                try:
                    pg.unloadDataset(make_name(*k))
                    logger.debug('Unloaded %s', k)
                except RuntimeError:
                    logger.debug('%s is not a loaded dataset', k)
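
This override assumes GenexCache keeps its entries in self._cache and that the parent class's _prune evicts stale entries, as in werkzeug's or cachelib's SimpleCache. A minimal usage sketch under those assumptions; make_name, the key layout, and the threshold argument (following SimpleCache's constructor) are hypothetical illustrations, not the original definitions:

import pygenex as pg

def make_name(*parts):
    # Hypothetical helper: rebuild the dataset alias that was used when the
    # dataset keyed by this tuple was loaded into pygenex.
    return '_'.join(str(p) for p in parts)

key = ('ItalyPowerDemand', 'euclidean', 0.1)
cache = GenexCache(threshold=8)  # small threshold so evictions happen early
info = pg.loadDataset(make_name(*key), '/path/to/ItalyPowerDemand_DATA', ',', -1, 1)
cache.set(key, info)
# When the cache later evicts this entry, the _prune above also unloads the
# corresponding pygenex dataset, keeping memory use bounded.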
Example 3
def group_dataset(name,
                  from_st,
                  to_st,
                  dist,
                  num_threads=15,
                  dry_run=False,
                  exclude_callback=None,
                  progress_callback=None):
    dataset_path = os.path.join(DATASET_ROOT, name + '_DATA')
    info = pg.loadDataset(name, dataset_path, ',', -1, 1)
    logging.info('Loaded dataset %s. Count = %d. Length = %d', name,
                 info['count'], info['length'])
    pg.normalize(name)
    logging.info('Normalized the dataset %s.', name)
    records = []
    timestamp = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
    records_name = name + '_records_' + timestamp + '.csv'
    records_path = os.path.join(GROUPS_ROOT, name, records_name)
    for d in dist:
        for st in np.arange(from_st, to_st, 0.1):
            st = round(st * 10) / 10
            if exclude_callback is not None and exclude_callback(name, d, st):
                logging.info('Ignore [%s, %s, %.1f]', name, d, st)
                continue

            logging.info('Grouping [%s, %s, %.1f] with %d threads', name, d,
                         st, num_threads)
            if dry_run:
                records.append({})

            if not dry_run:
                start = time.time()
                group_count = pg.group(name, st, d, num_threads)
                end = time.time()

                logging.info('Finished [%s, %s, %.1f] after %f seconds', name,
                             d, st, end - start)
                logging.info('[%s, %s, %.1f] generates %d groups', name, d, st,
                             group_count)

                save_dir = os.path.join(GROUPS_ROOT, name, d)
                if not os.path.exists(save_dir):
                    os.makedirs(save_dir)
                save_path = os.path.join(
                    save_dir, name + '_GROUPS_' + '{:.1f}'.format(st))
                logging.info('Saving groups [%s, %s, %.1f] to %s', name, d, st,
                             save_path)
                pg.saveGroups(name, save_path)

                size_save_path = os.path.join(
                    save_dir, name + '_GROUP_SIZES_' + '{:.1f}'.format(st))

                logging.info('Saving groups size [%s, %s, %.1f] to %s', name,
                             d, st, size_save_path)
                pg.saveGroupsSize(name, size_save_path)

                records.append({
                    'dist_name': d,
                    'st': st,
                    'group_count': group_count,
                    'path': save_path,
                    'size_path': size_save_path,
                    'duration': end - start
                })

                records_df = pd.DataFrame(records)
                records_df.to_csv(records_path, index=False)
                logging.info('Saved grouping record for %s to %s', name,
                             records_path)

                if progress_callback is not None:
                    progress_callback(name, d, st)

    pg.unloadDataset(name)
    logging.info('Unloaded %s', name)
    return records
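
A minimal call sketch, assuming DATASET_ROOT contains a file such as ItalyPowerDemand_DATA, GROUPS_ROOT comes from the original module, and the callbacks follow the (name, dist_name, st) signature used inside the loop; the dataset name and the skip rule are illustrative:

import os
import logging

logging.basicConfig(level=logging.INFO)

def skip_existing(name, dist_name, st):
    # Hypothetical exclusion rule: skip combinations whose group file already exists.
    group_path = os.path.join(GROUPS_ROOT, name, dist_name,
                              name + '_GROUPS_' + '{:.1f}'.format(st))
    return os.path.exists(group_path)

records = group_dataset('ItalyPowerDemand',
                        from_st=0.1,
                        to_st=0.6,  # covers st = 0.1 ... 0.5
                        dist=['euclidean', 'manhattan'],
                        num_threads=8,
                        exclude_callback=skip_existing)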
Example 4
def run_paa(name, dist, k, queries_df, dry_run=False):
    '''Run PAA experiment.

    For each distance, this method iterates over the queries. It checks whether
    the current query has already been run and recorded in the JSON result file.
    If it has not, it runs the PAA method with that query.

    JSON result structure:
    {
        'euclidean': [{'query': [index, start, end, outside],
                       'result_paa': [{'data': {'index': ..., 'start': ..., 'end': ...}, 'dist': ...}, ...],
                       'time_paa': ...},
                      ...],
        'manhattan': ...
    }
    '''
    name, name_out = load_and_normalize(name)

    pg.preparePAA(name, 3)
    logging.info('Generated PAA with block size 3 for dataset %s.', name)

    results, experiment_path = get_results_object(name)

    for d in dist:
        if d not in results:
            results[d] = []

        for i in range(queries_df.shape[0]):
            query = {
                'index': queries_df['index'][i],
                'start': queries_df.start[i],
                'end': queries_df.end[i],
                'outside': queries_df.outside[i]
            }

            # Materialize the filter so len() also works on Python 3
            find_query = list(filter(
                lambda o: 'result_paa' in o and o['query'] == query,
                results[d]))
            if len(find_query) == 0:
                logging.info('Running %s %s...(%d/%d)', name,
                             query_description('PAA', k, query, d), i,
                             queries_df.shape[0])
                if not dry_run:
                    # Run the query and measure response time
                    start = time.time()
                    if query['outside'] == 0:  # is inside
                        result_paa = pg.ksimpaa(k, name, name, query['index'],
                                                query['start'], query['end'],
                                                d)
                    else:
                        result_paa = pg.ksimpaa(k, name, name_out,
                                                query['index'], query['start'],
                                                query['end'], d)
                    end = time.time()

                    time_paa = end - start

                    results[d].append({
                        'query': query,
                        'result_paa': result_paa,
                        'time_paa': time_paa
                    })

                    # Dump result to file immediately
                    with open(experiment_path, 'w') as f:
                        json.dump(results, f)

                    logging.info('Finished %s after %.1f seconds',
                                 query_description('PAA', k, query, d),
                                 end - start)
            else:
                logging.info('Query %s is already run',
                             query_description('PAA', k, query, d))

    pg.unloadDataset(name)
    logging.info('Unloaded %s', name)

    pg.unloadDataset(name_out)
    logging.info('Unloaded %s', name_out)
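
A minimal call sketch, assuming the query CSV has the columns read above (index, start, end, outside) and that load_and_normalize and get_results_object are available from the original module; the file name and distance list are illustrative:

import pandas as pd

# Columns expected by run_paa: index, start, end, outside.
queries_df = pd.read_csv('ItalyPowerDemand_queries.csv')

run_paa('ItalyPowerDemand',
        dist=['euclidean', 'manhattan'],
        k=15,
        queries_df=queries_df,
        dry_run=True)  # a dry run only logs which queries would be executed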
Example 5
from __future__ import print_function
import os
import pygenex as pg
import pandas as pd

from common import GROUPING_RECORDS, DATASET_ROOT

all_datasets = [n for n in os.listdir(DATASET_ROOT) if n.endswith('DATA')]

dataset_info = {}

for i, ds in enumerate(all_datasets):
    name = ds[:len(ds) - len('_DATA')]
    print('(%d/%d) Processing...%s' % (i + 1, len(all_datasets), name))
    info = pg.loadDataset(name, os.path.join(DATASET_ROOT, ds), ',', -1, 1)
    dataset_info[name] = info
    pg.unloadDataset(name)

    query_info = pg.loadDataset(name,
                                os.path.join(DATASET_ROOT, name + '_QUERY'),
                                ',', -1, 1)
    dataset_info[name + '_out'] = query_info
    pg.unloadDataset(name)  # release the query dataset as well

df = pd.DataFrame.from_dict(dataset_info, orient='index')
df['subsequence'] = (df['count'] * df['length'] * (df['length'] - 1) / 2).astype('int')
df.to_json(GROUPING_RECORDS, orient='index')
print('Preview the first few datasets')
print(df.head())
print('Saved info to', GROUPING_RECORDS)
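
The saved JSON can be read back with the same orientation; a short sketch, assuming GROUPING_RECORDS points at the file written above:

import pandas as pd
from common import GROUPING_RECORDS

info_df = pd.read_json(GROUPING_RECORDS, orient='index')
# Datasets with the fewest subsequences are the cheapest to group first.
print(info_df.sort_values('subsequence').head())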
Example 6
def run_genex(name, dist, k, queries_df, num_subseq, dry_run=False):
    '''Run GENEX experiment.

    For each distance, this method iterates over the queries. It checks whether
    the current query has already been run and recorded in the JSON result file.
    If it has not, it runs the GENEX method with that query.

    For the GENEX method, 1-NN and 15-NN are run differently. 1-NN is run the
    same way as brute-force and PAA. For 15-NN, we also find the smallest value
    of the parameter k_e such that we reach 100% accuracy for each query.

    JSON result structure (1-NN, in the same file as BF and PAA):
    {
        'euclidean': [{'query': [index, start, end, outside],
                       'result_genex_0.1': [{'data': {'index': ..., 'start': ..., 'end': ...}, 'dist': ...}, ...],
                       'time_genex_0.1': ...},
                      {'query': [index, start, end, outside],
                       'result_genex_0.1': [{'data': {'index': ..., 'start': ..., 'end': ...}, 'dist': ...}, ...],
                       'time_genex_0.1': ...},
                      ...],
        'manhattan': ...
    }

    JSON result structure (15-NN, in a different file):
    {
        'euclidean': [{'query': [index, start, end, outside],
                       'result_genex_0.1': [<accuracy_0.1>, <accuracy_0.2>, ..., <accuracy_5.0>],
                       'time_genex_0.1': [<time_0.1>, <time_0.2>, ..., <time_5.0>]},
                      {'query': [index, start, end, outside],
                       'result_genex_0.2': [<accuracy_0.1>, <accuracy_0.2>, ..., <accuracy_5.0>],
                       'time_genex_0.2': [<time_0.1>, <time_0.2>, ..., <time_5.0>]},
                      ...]
    }
    accuracy_x means that x percent of the dataset has been explored.
    '''
    name, name_out = load_and_normalize(name)

    results, experiment_path = get_results_object(name)
    results_15nn, experiment_path_15nn = get_results_object(name + '_15NN')

    for d in dist:

        if d not in results:
            results[d] = []

        if d not in results_15nn:
            results_15nn[d] = []

        for st in [0.1, 0.2, 0.3, 0.4, 0.5]:
            group_file_name = '{}_GROUPS_{}'.format(name, str(st))
            group_file_path = os.path.join(GROUPS_ROOT, name, d,
                                           group_file_name)

            if not os.path.exists(group_file_path):
                logging.info('Group %s not found. Moving on.',
                             group_file_name)
                continue

            logging.info('Loading group file %s', group_file_name)
            number_of_groups = pg.loadGroups(name, group_file_path)
            logging.info('Loaded %s with %d groups', group_file_name,
                         number_of_groups)

            method_key = 'genex_' + str(st)
            for i in range(queries_df.shape[0]):
                query = {
                    'index': queries_df['index'][i],
                    'start': queries_df.start[i],
                    'end': queries_df.end[i],
                    'outside': queries_df.outside[i]
                }

                #####################################
                ##	      1-NN experiment          ##
                #####################################
                logging.info('1-NN experiment for %s',
                             query_description('GENEX', 1, query, d))
                # Materialize the filter so len() also works on Python 3
                find_query = list(filter(
                    lambda o: 'result_' + method_key in o and o['query'] == query,
                    results[d]))
                if len(find_query) == 0:
                    logging.info('Running %s %s...(%d/%d)', name,
                                 query_description('GENEX', 1, query, d),
                                 i + 1, queries_df.shape[0])
                    if not dry_run:
                        # Run the query and measure response time
                        start = time.time()
                        query_name = name if query['outside'] == 0 else name_out
                        result_genex = pg.sim(name, query_name, query['index'],
                                              query['start'], query['end'])
                        end = time.time()

                        time_genex = end - start

                        # Append new result to the result array
                        results[d].append({
                            'query': query,
                            'result_' + method_key: result_genex,
                            'time_' + method_key: time_genex
                        })

                        # Dump result to file immediately
                        with open(experiment_path, 'w') as f:
                            json.dump(results, f)

                        logging.info('Finished %s after %.1f seconds',
                                     query_description('GENEX', 1, query, d),
                                     end - start)
                else:
                    logging.info('Query %s is already run',
                                 query_description('GENEX', 1, query, d))

                #####################################
                ##		  15-NN experiment         ##
                #####################################
                logging.info('15-NN experiment for %s',
                             query_description('GENEX', k, query, d))
                result_bf = list(filter(
                    lambda o: 'result_bf' in o and o['query'] == query,
                    results[d]))
                find_query = list(filter(
                    lambda o: 'result_' + method_key in o and o['query'] == query,
                    results_15nn[d]))
                if len(find_query) == 0 and len(result_bf) > 0:
                    logging.info('Running %s %s...(%d/%d)', name,
                                 query_description('GENEX', k, query, d),
                                 i + 1, queries_df.shape[0])

                    if not dry_run:
                        dist_bf = [
                            r['dist'] for r in result_bf[0]['result_bf']
                        ]
                        all_err = []
                        all_time = []
                        counter = 0
                        for ke_ratio in np.arange(0.1, 100.1, 0.1):
                            ke = int(round(ke_ratio / 100 * num_subseq))
                            if counter % 10 == 0:
                                logging.info('ke_ratio = %f. ke = %d',
                                             ke_ratio, ke)
                            counter += 1
                            start = time.time()

                            query_name = name if query[
                                'outside'] == 0 else name_out
                            result_genex = pg.ksim(k, ke, name, query_name,
                                                   query['index'],
                                                   query['start'],
                                                   query['end'])
                            end = time.time()
                            dist_genex = [r['dist'] for r in result_genex]

                            err = compute_rel_error(dist_genex, dist_bf)
                            all_err.append(err)
                            all_time.append(end - start)
                            if abs(err) < 1e-9:
                                break

                        results_15nn[d].append({
                            'query': query,
                            'result_' + method_key: all_err,
                            'time_' + method_key: all_time
                        })

                        with open(experiment_path_15nn, 'w') as f:
                            json.dump(results_15nn, f)

                        logging.info('Finished %s after %.1f seconds',
                                     query_description('GENEX', k, query, d),
                                     end - start)
                else:
                    logging.info(
                        'Query %s is already run or its bf result does not exist',
                        query_description('GENEX', k, query, d))

    pg.unloadDataset(name)
    logging.info('Unloaded %s', name)

    pg.unloadDataset(name_out)
    logging.info('Unloaded %s', name_out)
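
A minimal call sketch, assuming the group files produced by group_dataset already exist under GROUPS_ROOT and the brute-force results (result_bf) were previously recorded in the 1-NN results file; num_subseq should be the dataset's subsequence count (the 'subsequence' column computed in Example 5), and the values below are placeholders:

import pandas as pd

queries_df = pd.read_csv('ItalyPowerDemand_queries.csv')  # columns: index, start, end, outside

run_genex('ItalyPowerDemand',
          dist=['euclidean', 'manhattan'],
          k=15,
          queries_df=queries_df,
          num_subseq=1000000,  # placeholder; use the real subsequence count
          dry_run=False)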