Example 1
def join_datasets(project_id, left_dataset_id, right_dataset_id, on, left_on, right_on, how, left_suffix, right_suffix, new_dataset_name_prefix):
    left_df = get_data(project_id=project_id, dataset_id=left_dataset_id)
    right_df = get_data(project_id=project_id, dataset_id=right_dataset_id)

    project = db_access.get_project(project_id)
    original_left_dataset = db_access.get_dataset(project_id, left_dataset_id)
    original_right_dataset = db_access.get_dataset(project_id, right_dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_left_dataset_title = original_left_dataset['title']
    original_right_dataset_title = original_right_dataset['title']

    fallback_title = original_left_dataset_title[:20] + original_right_dataset_title[:20]
    original_dataset_title = original_left_dataset_title + original_right_dataset_title
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    left_columns = left_df.columns.values
    right_columns = right_df.columns.values
    on = list_elements_from_indices(left_columns, on)

    # Not using left_on or right_on for now
    df_joined = left_df.merge(right_df, how=how, on=on, suffixes=[left_suffix, right_suffix])

    return df_joined, new_dataset_title, new_dataset_name, new_dataset_path
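For orientation, a hypothetical call is sketched below; the IDs, suffixes, and prefix are made up, and note that on is a list of positional indices into the left dataframe, resolved to column names before the merge.

# Hypothetical usage (IDs and names are illustrative, not from the source):
df_joined, title, name, path = join_datasets(
    project_id=1,
    left_dataset_id=3,
    right_dataset_id=4,
    on=[0],                        # positional indices into the left columns
    left_on=None,
    right_on=None,                 # left_on/right_on are currently ignored
    how='left',
    left_suffix='_left',
    right_suffix='_right',
    new_dataset_name_prefix='joined_',
)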
Example 2
def unpivot_dataset(project_id, dataset_id, pivot_fields, variable_name,
                    value_name, new_dataset_name_prefix):
    '''
    Returns unpivoted dataframe
    '''
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'],
                                   project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'],
                                   str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    columns = df.columns.values
    pivot_fields = list_elements_from_indices(columns, pivot_fields)
    preserved_fields = difference_of_lists(columns, pivot_fields)
    df_unpivoted = pd.melt(df,
                           id_vars=preserved_fields,
                           value_vars=pivot_fields,
                           var_name=variable_name,
                           value_name=value_name)

    return df_unpivoted, new_dataset_title, new_dataset_name, new_dataset_path
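The core of the function is the pd.melt call; a self-contained illustration of its effect:

import pandas as pd

# Wide frame: one column per year.
df = pd.DataFrame({'id': [1, 2], '2019': [10, 20], '2020': [30, 40]})
# Long frame: one row per (id, year) pair, with columns id / year / count.
long_df = pd.melt(df, id_vars=['id'], value_vars=['2019', '2020'],
                  var_name='year', value_name='count')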
Example 3
def unpivot_dataset(project_id, dataset_id, pivot_fields, variable_name, value_name, new_dataset_name_prefix):
    '''
    Returns unpivoted dataframe
    '''
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    columns = df.columns.values
    pivot_fields = list_elements_from_indices(columns, pivot_fields)
    preserved_fields = difference_of_lists(columns, pivot_fields)
    df_unpivoted = pd.melt(df, id_vars=preserved_fields, value_vars=pivot_fields, var_name=variable_name, value_name=value_name)

    return df_unpivoted, new_dataset_title, new_dataset_name, new_dataset_path
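Both unpivot variants rely on two helpers whose definitions are not included in these examples. A plausible sketch, assuming they resolve positional indices to names and take an order-preserving difference:

def list_elements_from_indices(elements, indices):
    # Assumed helper: map positional indices to the values at those positions.
    return [elements[i] for i in indices]

def difference_of_lists(a, b):
    # Assumed helper: elements of a not present in b, preserving order.
    return [x for x in a if x not in b]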
Example 4
def get_data(project_id=None, dataset_id=None, nrows=None, field_properties=None):
    if IMD.hasData(dataset_id):
        logger.debug('Accessing from IMD, project_id: %s, dataset_id: %s', project_id, dataset_id)
        df = IMD.getData(dataset_id)
        return df

    dataset = db_access.get_dataset(project_id, dataset_id)
    dialect = dataset['dialect']
    encoding = dataset.get('encoding', 'utf-8')

    if dataset['storage_type'] == 's3':
        if dataset['preloaded']:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="-1/%s" % dataset['file_name']
            )
        else:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="%s/%s" % (str(project_id), dataset['file_name'])
            )
        accessor = file_obj['Body']

    if dataset['storage_type'] == 'file':
        accessor = dataset['path']

    if not field_properties:
        field_properties = db_access.get_field_properties(project_id, dataset_id)

    df = pd.read_table(
        accessor,
        error_bad_lines=False,
        encoding=encoding,
        skiprows=dataset['offset'],
        sep=dialect['delimiter'],
        engine='c',
        # dtype=field_to_type_mapping,
        escapechar=dialect['escapechar'],
        doublequote=dialect['doublequote'],
        quotechar=dialect['quotechar'],
        parse_dates=True,
        nrows=nrows,
        thousands=','
    )
    sanitized_df = sanitize_df(df)
    coerced_df = coerce_types(sanitized_df, field_properties)

    IMD.insertData(dataset_id, coerced_df)
    return coerced_df
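IMD serves as an in-memory cache keyed by dataset_id, so repeated calls skip S3 or disk entirely. Its implementation is not shown in these examples; a minimal stand-in exposing the same three methods could look like this:

class InMemoryData:
    # Minimal sketch of the hasData/getData/insertData interface assumed above.
    def __init__(self):
        self._frames = {}

    def hasData(self, dataset_id):
        return dataset_id in self._frames

    def getData(self, dataset_id):
        return self._frames[dataset_id]

    def insertData(self, dataset_id, df):
        self._frames[dataset_id] = df

IMD = InMemoryData()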
Example 5
def join_datasets(project_id, left_dataset_id, right_dataset_id, on, left_on,
                  right_on, how, left_suffix, right_suffix,
                  new_dataset_name_prefix):
    left_df = get_data(project_id=project_id, dataset_id=left_dataset_id)
    right_df = get_data(project_id=project_id, dataset_id=right_dataset_id)

    project = db_access.get_project(project_id)
    original_left_dataset = db_access.get_dataset(project_id, left_dataset_id)
    original_right_dataset = db_access.get_dataset(project_id,
                                                   right_dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'],
                                   project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'],
                                   str(project_id))

    original_left_dataset_title = original_left_dataset['title']
    original_right_dataset_title = original_right_dataset['title']

    fallback_title = original_left_dataset_title[:20] + original_right_dataset_title[:20]
    original_dataset_title = original_left_dataset_title + original_right_dataset_title
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    left_columns = left_df.columns.values
    right_columns = right_df.columns.values
    on = list_elements_from_indices(left_columns, on)

    # Not using left_on or right_on for now
    df_joined = left_df.merge(right_df,
                              how=how,
                              on=on,
                              suffixes=[left_suffix, right_suffix])

    return df_joined, new_dataset_title, new_dataset_name, new_dataset_path
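One pandas detail worth noting: the suffixes only apply to overlapping columns that are not join keys. A self-contained illustration:

import pandas as pd

left = pd.DataFrame({'key': [1, 2], 'value': ['a', 'b']})
right = pd.DataFrame({'key': [1, 2], 'value': ['x', 'y']})
# 'key' is the join key and keeps its name; 'value' gets disambiguated.
merged = left.merge(right, how='inner', on=['key'], suffixes=['_l', '_r'])
# merged columns: key, value_l, value_r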
Example 6
    def get(self, dataset_id):
        args = datasetGetParser.parse_args()
        project_id = args.get('project_id')

        dataset = db_access.get_dataset(project_id, dataset_id)
        sample = get_dataset_sample(dataset_id, project_id)

        response = {
            'id': dataset_id,
            'title': dataset.get('title'),
            'preloaded': dataset.get('preloaded'),
            'details': sample
        }
        return jsonify(response)
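datasetGetParser is a Flask-RESTful request parser defined elsewhere; a plausible definition, assuming project_id arrives as a request argument, would be:

from flask_restful import reqparse

# Assumed definition of the parser used in get() above.
datasetGetParser = reqparse.RequestParser()
datasetGetParser.add_argument('project_id', type=str, required=True)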
Example 7
def get_data(project_id=None, dataset_id=None, nrows=None, field_properties=None):
    if IMD.hasData(dataset_id):
        logger.debug('Accessing from IMD, project_id: %s, dataset_id: %s', project_id, dataset_id)
        df = IMD.getData(dataset_id)
        return df

    dataset = db_access.get_dataset(project_id, dataset_id)
    dialect = dataset['dialect']
    encoding = dataset.get('encoding', 'utf-8')

    if dataset['storage_type'] == 's3':
        if dataset['preloaded']:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="-1/%s" % dataset['file_name']
            )
        else:
            file_obj = s3_client.get_object(
                Bucket=current_app.config['AWS_DATA_BUCKET'],
                Key="%s/%s" % (str(project_id), dataset['file_name'])
            )
        accessor = file_obj['Body']

    if dataset['storage_type'] == 'file':
        accessor = dataset['path']

    if not field_properties:
        field_properties = db_access.get_field_properties(project_id, dataset_id)

    df = pd.read_table(
        accessor,
        error_bad_lines=False,
        encoding=encoding,
        skiprows=dataset['offset'],
        sep=dialect['delimiter'],
        engine='c',
        # dtype=field_to_type_mapping,
        escapechar=dialect['escapechar'],
        doublequote=dialect['doublequote'],
        quotechar=dialect['quotechar'],
        parse_dates=True,
        nrows=nrows,
        thousands=','
    )
    sanitized_df = sanitize_df(df)
    coerced_df = coerce_types(sanitized_df, field_properties)

    IMD.insertData(dataset_id, coerced_df)
    return coerced_df
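The dialect record drives most of the pd.read_table keyword arguments. Judging from the keys accessed, it is a dict captured at upload time; an illustrative (not source-confirmed) example for a TSV upload:

# Illustrative shape of the dialect record consumed above.
dialect = {
    'delimiter': '\t',
    'escapechar': '\\',
    'doublequote': False,
    'quotechar': '"',
}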
Example 8
def compute_dataset_properties(dataset_id, project_id, path=None):
    ''' Compute and return dictionary containing whole-dataset properties '''

    if not path:
        dataset = db_access.get_dataset(project_id, dataset_id)
        path = dataset['path']
    df = get_data(project_id=project_id, dataset_id=dataset_id)

    n_rows, n_cols = df.shape
    field_names = df.columns.values.tolist()

    # field_types = []
    # for (i, field_name) in enumerate(df):
    #     logger.debug('Calculating types for field %s', field_name)
    #     field_values = df[field_name]
    #     field_type, field_type_scores = calculate_field_type(field_name, field_values, i, n_cols)
    #     field_types.append(field_type)

    # Forgoing time series detection for now (expensive)
    # time_series = detect_time_series(df, field_types)
    # if time_series:
    #     time_series = True
    time_series = False

    structure = 'wide' if time_series else 'long'

    properties = {
        'n_rows': n_rows,
        'n_cols': n_cols,
        'field_names': field_names,
        # 'field_types': field_types,
        'field_accessors': [i for i in range(0, n_cols)],
        'structure': structure,
        'is_time_series': time_series,
    }

    return {
        'desc': 'Done computing dataset properties',
        'result': properties,
    }
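The return value wraps the properties in a desc/result envelope, so a caller unpacks it like this (IDs are hypothetical):

# Hypothetical usage of the envelope returned above.
out = compute_dataset_properties(dataset_id=3, project_id=1)
props = out['result']
print(props['n_rows'], props['n_cols'], props['field_names'])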
Example 9
def reduce_dataset(project_id, dataset_id, column_ids_to_keep, new_dataset_name_prefix):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    project = db_access.get_project(project_id)
    original_dataset = db_access.get_dataset(project_id, dataset_id)

    preloaded_project = project.get('preloaded', False)
    if preloaded_project:
        project_dir = os.path.join(task_app.config['PRELOADED_PATH'], project['directory'])
    else:
        project_dir = os.path.join(task_app.config['STORAGE_PATH'], str(project_id))

    original_dataset_title = original_dataset['title']
    fallback_title = original_dataset_title[:20]
    dataset_type = '.tsv'
    new_dataset_title, new_dataset_name, new_dataset_path = \
        get_transformed_file_name(project_dir, new_dataset_name_prefix, fallback_title, original_dataset_title, dataset_type)

    df_reduced = df.iloc[:, column_ids_to_keep]

    return df_reduced, new_dataset_title, new_dataset_name, new_dataset_path
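The reduction itself is plain positional column selection via .iloc; a self-contained illustration:

import pandas as pd

df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]})
kept = df.iloc[:, [0, 2]]  # keep columns by position -> columns 'a' and 'c'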
Example 10
    def get(self, dataset_id):
        args = datasetGetParser.parse_args()
        project_id = args.get('project_id')

        dataset = db_access.get_dataset(project_id, dataset_id)
        sample = get_dataset_sample(dataset_id, project_id)

        response = {
            'id': dataset_id,
            'title': dataset.get('title'),
            'preloaded': dataset.get('preloaded'),
            'details': sample
        }

        return jsonify(response)