def join_datasets(project_id, left_dataset_id, right_dataset_id, on, left_on, right_on, how, left_suffix, right_suffix, new_dataset_name_prefix):
    left_df = get_data(project_id=project_id, dataset_id=left_dataset_id)
    right_df = get_data(project_id=project_id, dataset_id=right_dataset_id)
    project = db_access.get_project(project_id)
    original_left_dataset = db_access.get_dataset(project_id, left_dataset_id)
    original_right_dataset = db_access.get_dataset(project_id, right_dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_left_dataset_title = original_left_dataset['title']
    original_right_dataset_title = original_right_dataset['title']
    fallback_title = original_left_dataset_title[:20] + original_right_dataset_title[:20]
    original_dataset_title = original_left_dataset_title + original_right_dataset_title
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    left_columns = left_df.columns.values
    right_columns = right_df.columns.values
    on = list_elements_from_indices(left_columns, on)

    # Not using left_on or right_on for now
    df_joined = left_df.merge(right_df, how=how, on=on, suffixes=[left_suffix, right_suffix])

    return df_joined, new_dataset_title, new_dataset_name, new_dataset_path
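# Illustrative sketch (not part of the original module): a minimal, self-contained
# example of the pandas merge used in join_datasets, showing how `how`, `on`, and
# `suffixes` interact. The frame contents and suffix strings are assumptions for
# demonstration only.
def _demo_join_with_suffixes():
    import pandas as pd
    left = pd.DataFrame({'id': [1, 2, 3], 'value': ['a', 'b', 'c']})
    right = pd.DataFrame({'id': [2, 3, 4], 'value': ['x', 'y', 'z']})
    # Overlapping non-key columns are disambiguated with the suffixes,
    # producing 'value_left' and 'value_right' in the result.
    return left.merge(right, how='inner', on=['id'], suffixes=['_left', '_right'])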
def unpivot_dataset(project_id, dataset_id, pivot_fields, variable_name, value_name, new_dataset_name_prefix):
    ''' Returns unpivoted dataframe '''
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    columns = df.columns.values
    pivot_fields = list_elements_from_indices(columns, pivot_fields)
    preserved_fields = difference_of_lists(columns, pivot_fields)
    df_unpivoted = pd.melt(df, id_vars=preserved_fields, value_vars=pivot_fields, var_name=variable_name, value_name=value_name)

    return df_unpivoted, new_dataset_title, new_dataset_name, new_dataset_path
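# Illustrative sketch (not part of the original module): a minimal pd.melt call
# mirroring the unpivot above. Column names and values are assumptions for
# demonstration only.
def _demo_unpivot():
    import pandas as pd
    df = pd.DataFrame({'country': ['US', 'CA'], '2019': [1.0, 2.0], '2020': [3.0, 4.0]})
    # 'country' is preserved; the year columns are melted into variable/value pairs.
    return pd.melt(df, id_vars=['country'], value_vars=['2019', '2020'],
                   var_name='year', value_name='amount')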
def get_data(project_id=None, dataset_id=None, nrows=None, field_properties=None):
    if IMD.hasData(dataset_id):
        logger.debug('Accessing from IMD, project_id: %s, dataset_id: %s', project_id, dataset_id)
        df = IMD.getData(dataset_id)
        return df

    dataset = db_access.get_dataset(project_id, dataset_id)
    dialect = dataset['dialect']
    encoding = dataset.get('encoding', 'utf-8')

    if dataset['storage_type'] == 's3':
        if dataset['preloaded']:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="-1/%s" % dataset['file_name']
            )
        else:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="%s/%s" % (str(project_id), dataset['file_name'])
            )
        accessor = file_obj['Body']
    if dataset['storage_type'] == 'file':
        accessor = dataset['path']

    if not field_properties:
        field_properties = db_access.get_field_properties(project_id, dataset_id)

    df = pd.read_table(
        accessor,
        error_bad_lines = False,
        encoding = encoding,
        skiprows = dataset['offset'],
        sep = dialect['delimiter'],
        engine = 'c',
        # dtype = field_to_type_mapping,
        escapechar = dialect['escapechar'],
        doublequote = dialect['doublequote'],
        quotechar = dialect['quotechar'],
        parse_dates = True,
        nrows = nrows,
        thousands = ','
    )

    sanitized_df = sanitize_df(df)
    coerced_df = coerce_types(sanitized_df, field_properties)
    IMD.insertData(dataset_id, coerced_df)
    return coerced_df
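# Illustrative sketch (not part of the original module): how the stored dialect
# fields above map onto a pandas read call. The file path and dialect values are
# assumptions for demonstration; pd.read_csv is used here to keep the example
# self-contained.
def _demo_read_with_dialect(path='example.tsv'):
    import pandas as pd
    dialect = {'delimiter': '\t', 'escapechar': '\\', 'doublequote': True, 'quotechar': '"'}
    return pd.read_csv(
        path,
        sep=dialect['delimiter'],
        escapechar=dialect['escapechar'],
        doublequote=dialect['doublequote'],
        quotechar=dialect['quotechar'],
        parse_dates=True,
        thousands=','
    )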
def get(self, dataset_id):
    args = datasetGetParser.parse_args()
    project_id = args.get('project_id')

    dataset = db_access.get_dataset(project_id, dataset_id)
    sample = get_dataset_sample(dataset_id, project_id)

    response = {
        'id': dataset_id,
        'title': dataset.get('title'),
        'preloaded': dataset.get('preloaded'),
        'details': sample
    }
    return jsonify(response)
def compute_dataset_properties(dataset_id, project_id, path=None):
    ''' Compute and return dictionary containing whole-dataset properties '''
    logger.debug('compute_dataset_properties, path: %s', path)
    if not path:
        dataset = db_access.get_dataset(project_id, dataset_id)
        path = dataset['path']

    df = get_data(project_id=project_id, dataset_id=dataset_id)
    n_rows, n_cols = df.shape
    field_names = df.columns.values.tolist()

    # field_types = []
    # for (i, field_name) in enumerate(df):
    #     logger.debug('Calculating types for field %s', field_name)
    #     field_values = df[field_name]
    #     field_type, field_type_scores = calculate_field_type(field_name, field_values, i, n_cols)
    #     field_types.append(field_type)

    # Forgoing time series detection for now (expensive)
    # time_series = detect_time_series(df, field_types)
    # if time_series:
    #     time_series = True
    time_series = False

    structure = 'wide' if time_series else 'long'

    properties = {
        'n_rows': n_rows,
        'n_cols': n_cols,
        'field_names': field_names,
        # 'field_types': field_types,
        'field_accessors': [i for i in range(0, n_cols)],
        'structure': structure,
        'is_time_series': time_series,
    }

    return {
        'desc': 'Done computing dataset properties',
        'result': properties,
    }
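# Illustrative sketch (not part of the original module): the shape and column
# introspection used in compute_dataset_properties, on a throwaway frame built
# purely for demonstration.
def _demo_dataset_shape():
    import pandas as pd
    df = pd.DataFrame({'x': [1, 2, 3], 'y': ['a', 'b', 'c']})
    n_rows, n_cols = df.shape                  # (3, 2)
    field_names = df.columns.values.tolist()   # ['x', 'y']
    return n_rows, n_cols, field_names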
def reduce_dataset(project_id, dataset_id, column_ids_to_keep, new_dataset_name_prefix):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    df_reduced = df.iloc[:, column_ids_to_keep]

    return df_reduced, new_dataset_title, new_dataset_name, new_dataset_path
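# Illustrative sketch (not part of the original module): positional column
# selection as used in reduce_dataset. The frame and indices are assumptions
# for demonstration only.
def _demo_reduce_columns():
    import pandas as pd
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4], 'c': [5, 6]})
    # Keep only the first and third columns by integer position.
    return df.iloc[:, [0, 2]]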