Example #1
    def construct_folders(self):
        """Create the folder structure"""
        """
           new_dataset_id
           └── TRAIN
               ├── dataset_TRAIN
               │ ├── datasetDoc.json
               │ └── tables XXX
               │    └── learningData.csv
               └── problem_TRAIN XXX
                   ├── dataSplits.csv
                   └── problemDoc.json
        """
        if self.has_error():
            return False

        d3m_config = self.user_workspace.d3m_config  # get_latest_d3m_config()
        if not d3m_config:
            user_msg = 'Latest D3M configuration not found. (construct_folders)'
            self.send_websocket_err_msg(user_msg)
            return False

        if (not d3m_config.additional_inputs) or \
            (not isdir(d3m_config.additional_inputs)):
            user_msg = ('Additional inputs folder does not exist! %s') % \
                        (d3m_config.additional_inputs,)
            self.send_websocket_err_msg(user_msg)
            return False

        # ---------------------------------------
        # Create the problem_TRAIN
        # ---------------------------------------
        self.dataset_root_dir = join(d3m_config.additional_inputs,
                                     self.dataset_id)
        self.problem_dir = join(self.dataset_root_dir, 'TRAIN',
                                'problem_TRAIN')

        LOGGER.info('       - dataset_root_dir: %s', self.dataset_root_dir)
        LOGGER.info('       - problem_dir: %s', self.problem_dir)
        dir_info = create_directory(self.problem_dir)
        if not dir_info.success:
            self.send_websocket_err_msg(dir_info.err_msg)
            return False

        # ---------------------------------------
        # Create the tables dir
        # ---------------------------------------
        self.tables_dir = join(self.dataset_root_dir, 'TRAIN', 'dataset_TRAIN',
                               'tables')
        LOGGER.info('       - tables_dir: %s', self.tables_dir)

        dir_info = create_directory(self.tables_dir)
        if not dir_info.success:
            self.send_websocket_err_msg(dir_info.err_msg)
            return False
        self.dataset_dir = dirname(self.tables_dir)
        return True
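
All of these examples check the object returned by create_directory for .success, .err_msg, and .result_obj. That helper's implementation isn't shown on this page; the sketch below is only an assumption of the contract implied by the call sites (the DirResult name is hypothetical), built on os.makedirs:

import os
from collections import namedtuple

# Hypothetical result type mirroring the .success / .err_msg / .result_obj
# attributes checked throughout these examples.
DirResult = namedtuple('DirResult', ['success', 'err_msg', 'result_obj'])

def create_directory(dir_path):
    """Create dir_path (and any parents); OK if it already exists."""
    try:
        os.makedirs(dir_path, exist_ok=True)
        return DirResult(success=True, err_msg=None, result_obj=dir_path)
    except OSError as err_obj:
        return DirResult(success=False,
                         err_msg=f'Failed to create directory: {err_obj}',
                         result_obj=None)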
Example #2
    def construct_folders(self):
        """Create the folder structure + D3MConfig object"""
        """
           new_dataset_id
           └── TRAIN
               ├── dataset_TRAIN
               │ ├── datasetDoc.json
               │ └── tables XXX
               │    └── learningData.csv
               └── problem_TRAIN XXX
                   ├── dataSplits.csv
                   └── problemDoc.json
        """
        if self.has_error():
            return False

        if not isdir(self.writable_output_dir):
            user_msg = f'UserDatasetUtil. This directory does not exist: {self.writable_output_dir}'
            self.send_websocket_err_msg(user_msg)
            return False

        self.dataset_id = slugify(self.dataset_name[:15] + '-' +
                                  get_alpha_string(4))

        self.dataset_root_dir = join(self.writable_output_dir, self.dataset_id)

        # ---------------------------------------
        # Create the problem_TRAIN directory
        # ---------------------------------------
        self.problem_dir = join(self.dataset_root_dir, 'TRAIN',
                                'problem_TRAIN')

        LOGGER.info('       - dataset_root_dir: %s', self.dataset_root_dir)
        LOGGER.info('       - problem_dir: %s', self.problem_dir)
        dir_info = create_directory(self.problem_dir)
        if not dir_info.success:
            self.send_websocket_err_msg(dir_info.err_msg)
            return False

        # ---------------------------------------
        # Create the tables dir
        # ---------------------------------------
        self.dataset_tables_dir = join(self.dataset_root_dir, 'TRAIN',
                                       'dataset_TRAIN', 'tables')

        dir_info = create_directory(self.dataset_tables_dir)
        if not dir_info.success:
            self.send_websocket_err_msg(dir_info.err_msg)
            return False
        self.dataset_dir = dirname(self.dataset_tables_dir)
        return True
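
Example #2 derives dataset_id from the first 15 characters of the dataset name plus a short random suffix. Assuming slugify is Django's django.utils.text.slugify and get_alpha_string returns random lowercase letters (both assumptions; the helper below is hypothetical), the id generation behaves roughly like this:

import random
import string

from django.utils.text import slugify  # assumed source of slugify

def get_alpha_string(num_chars):
    """Hypothetical stand-in: random lowercase letters."""
    return ''.join(random.choices(string.ascii_lowercase, k=num_chars))

dataset_name = 'My Survey Data 2020'
dataset_id = slugify(dataset_name[:15] + '-' + get_alpha_string(4))
print(dataset_id)  # e.g. "my-survey-data-xkqz"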
Example #3
def check_build_output_directories(d3m_config):
    """Used when setting a new a d3m_config:
        - check if the output directories exist
        - build them if they don't"""
    if not isinstance(d3m_config, D3MConfiguration):
        return err_resp('d3m_config must be a D3MConfiguration object')

    temp_path = None
    output_path = d3m_config.env_values.get(d3m_static.KEY_D3MOUTPUTDIR)
    if output_path:
        temp_path = join(output_path, 'temp')

    paths_to_check = [output_path,
                      temp_path,
                      d3m_config.env_values.get(d3m_static.KEY_D3MLOCALDIR),
                      d3m_config.env_values.get(d3m_static.KEY_D3MSTATICDIR)]

    paths_to_build = [x for x in paths_to_check
                      if x and not isdir(x)]

    fail_info = []
    for build_path in paths_to_build:
        path_info = create_directory(build_path)
        if path_info.success:
            print('directory created: ', build_path)
        else:
            err_msg = 'Failed to build directory: %s' % (path_info.err_msg)
            fail_info.append(err_msg)
    if fail_info:
        return err_resp('\n'.join(fail_info))

    return ok_resp('looks good')
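
Example #3 (and several of the examples below) returns results through ok_resp / err_resp wrappers that aren't defined on this page. A minimal sketch of the assumed convention, matching how callers read .success, .result_obj, and .err_msg:

class BasicResponse:
    """Assumed shape of the ok_resp / err_resp return value."""
    def __init__(self, success, result_obj=None, err_msg=None):
        self.success = success
        self.result_obj = result_obj
        self.err_msg = err_msg

def ok_resp(result_obj):
    return BasicResponse(True, result_obj=result_obj)

def err_resp(err_msg):
    return BasicResponse(False, err_msg=err_msg)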
Example #4
    def save_datamart_file(data_foldername, file_data, **kwargs):
        """Save materialize response as a file.  This should be a .zip
        containing both a datafile and a datasetDoc.json"""
        if not file_data:
            return err_resp('"file_data" must be specified')

        # create directory if it doesn't exist
        #       (Ok if the directory already exists)
        #
        dir_info = create_directory(data_foldername)
        if not dir_info.success:
            return err_resp(dir_info.err_msg)

        try:
            with zipfile.ZipFile(BytesIO(file_data.content), 'r') as data_zip:
                data_zip.extractall(data_foldername)
        except (zipfile.BadZipFile, RuntimeError) as err_obj:
            user_msg = (f'Failed to extract zip to "{data_foldername}".'
                        f' Error: {err_obj}')
            return err_resp(user_msg)

        # Make sure that learningData.csv exists
        #
        data_filepath = join(data_foldername, 'tables', 'learningData.csv')
        if not isfile(data_filepath):
            user_msg = ('File "learningData.csv" not found in expected'
                        'place: %s') % data_filepath
            return err_resp(user_msg)

        # Make sure that the datasetDoc.json exists
        #
        datasetdoc_path = join(data_foldername, 'datasetDoc.json')
        if not isfile(datasetdoc_path):
            user_msg = ('File datasetDoc.json not found in'
                        ' expected place: %s') % datasetdoc_path
            return err_resp(user_msg)

        expected_filepath = kwargs.get('expected_filepath', None)
        if expected_filepath:
            if expected_filepath != data_filepath:
                user_msg = 'File not found on expected path: %s' % expected_filepath
                return err_resp(user_msg)

        return ok_resp({
            dm_static.KEY_DATA_PATH: data_filepath,
            dm_static.KEY_DATASET_DOC_PATH: datasetdoc_path
        })
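
The file_data argument in Example #4 exposes .content, which suggests an HTTP response object (e.g., from requests). A hedged call-site sketch, treating save_datamart_file as a standalone function and using placeholder URL and paths:

import requests
from os.path import join

# Placeholder URL and folder; the real Datamart endpoint isn't shown here.
materialize_url = 'https://datamart.example.org/materialize/123'
file_data = requests.get(materialize_url)  # response whose .content holds the zip bytes

data_foldername = '/tmp/datamart_123'
save_info = save_datamart_file(
    data_foldername,
    file_data,
    expected_filepath=join(data_foldername, 'tables', 'learningData.csv'))

if save_info.success:
    print('paths:', save_info.result_obj)
else:
    print('error:', save_info.err_msg)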
Example #5
def create_destination_directory(user_workspace, name):
    """Used to add a write directory for the partials app"""
    if not isinstance(user_workspace, UserWorkspace):
        return err_resp(
            'Error: "user_workspace" must be a UserWorkspace object.')

    # build destination path for partials app
    dest_dir_path = os.path.join(user_workspace.d3m_config.additional_inputs,
                                 name, f'ws_{user_workspace.id}',
                                 get_timestamp_string())

    new_dir_info = create_directory(dest_dir_path)
    if not new_dir_info.success:
        return {
            KEY_SUCCESS: False,
            KEY_DATA: f' {new_dir_info.err_msg} ({dest_dir_path})'
        }

    return {KEY_SUCCESS: True, KEY_DATA: dest_dir_path}
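
Examples #5, #6, and #8 use timestamp helpers (get_timestamp_string, create_directory_add_timestamp) to keep write directories unique. Their implementations aren't shown here; a hypothetical sketch of a filesystem-safe timestamp helper could look like this:

from datetime import datetime

def get_timestamp_string():
    """Hypothetical sketch: filesystem-safe timestamp, e.g. '2024-01-31_14-05-59'."""
    return datetime.now().strftime('%Y-%m-%d_%H-%M-%S')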
Example #6
def create_image_output_dir(user_workspace=None):
    """Create an image output dir based on a user workspace
    For DEMO: TEMP write this to staticfiles
    """
    if not isinstance(user_workspace, UserWorkspace):
        user_workspace_id = random_info.get_digits_string(3)
        #return {KEY_SUCCESS: False,
        #        KEY_SUCCESS: 'user_workspace is not a "UserWorkspace" object'}
    else:
        user_workspace_id = user_workspace.id

    output_path = join(\
            get_output_path_base(),
            im_static.IMAGE_MARKUP_DIR_NAME,
            f'{user_workspace_id}-{random_info.get_alphanumeric_lowercase(4)}',
            random_info.get_timestamp_string())

    dir_info = create_directory(output_path)

    if not dir_info.success:
        return {KEY_SUCCESS: False, KEY_MESSAGE: dir_info.err_msg}

    return {KEY_SUCCESS: True, KEY_DATA: dir_info.result_obj}
Example #7
    def get_write_directory(self, kwargs_write_dir):
        """Determine the write directory"""
        if self.has_error():
            return
        # Was it sent as a kwarg?
        if kwargs_write_dir and isdir(kwargs_write_dir):
            return kwargs_write_dir

        # Use the d3m_config connected to the user workspace
        #
        if self.user_workspace:
            output_dir = self.user_workspace.d3m_config.root_output_directory
            output_dir = join(output_dir, 'problems')
        else:
            # Use the default/hard-coded directory
            #
            output_dir = OUTPUT_PROBLEMS_DIR

        dir_info = create_directory(output_dir)
        if dir_info.success:
            return dir_info.result_obj

        self.add_err_msg(dir_info.err_msg)
        return None
Example #8
def view_upload_dataset(request):
    """Upload dataset and metadata"""
    print('FILE_UPLOAD_MAX_MEMORY_SIZE:', settings.FILE_UPLOAD_MAX_MEMORY_SIZE)

    user_workspace_info = get_latest_user_workspace(request)
    if not user_workspace_info.success:
        return JsonResponse(get_json_error(user_workspace_info.err_msg))
    user_workspace = user_workspace_info.result_obj

    # Destination directory for learningData.csv, learningData#.csv, etc.
    #   and about.json
    #
    dest_dir_info = create_directory_add_timestamp(\
                        join(settings.TWORAVENS_USER_DATASETS_DIR,
                             f'uploads_{user_workspace.user.id}',
                             get_alpha_string(6)))

    if not dest_dir_info.success:
        return JsonResponse(get_json_error(dest_dir_info.err_msg))
    dest_directory = dest_dir_info.result_obj

    print('view_upload_dataset. dest_directory', dest_directory)

    # Save the about.json
    #
    json_info = json_loads(request.POST.get('metadata'))
    if not json_info.success:
        return JsonResponse(get_json_error(json_info.err_msg))

    # save json data
    dataset_name = None
    if dp_static.DATASET_NAME_FROM_UI in json_info.result_obj:
        dataset_name = json_info.result_obj[dp_static.DATASET_NAME_FROM_UI]

    #with open(os.path.join(dest_directory, 'about.json'), 'w') as metadata_file:
    #    json.dump(json_info.result_obj, metadata_file)

    # Save data files.  They don't have to be .csv; that's handled later,
    #     e.g. converting from .tab, .tsv, .xls, etc.
    #
    for idx, file in enumerate(request.FILES.getlist('files')):
        print(file.name)
        _fname, fext = splitext(file.name)
        if fext.lower() not in dp_static.VALID_EXTENSIONS:
            # extension not recognized; we won't be able to open this file
            user_msg = (
                f'The extension for this file was not recognized: "{file.name}".'
                f' Valid extensions: {", ".join(dp_static.VALID_EXTENSIONS)}.')

            return JsonResponse(get_json_error(user_msg))

        new_filename = join(
            dest_directory,
            f'learningData{idx + 1 if idx else ""}{fext.lower()}')
        with open(new_filename, 'wb+') as outfile:
            for chunk in file.chunks():
                outfile.write(chunk)

    print('dest_directory', dest_directory)

    # Create new dataset folders/etc
    #
    additional_inputs_dir = user_workspace.d3m_config.additional_inputs
    created = create_directory(additional_inputs_dir)
    if not created.success:
        return JsonResponse(get_json_error(created.err_msg))

    new_dataset_info = UserDatasetUtil.make_new_dataset(\
                            user_workspace.user.id,
                            dest_directory,
                            settings.TWORAVENS_USER_DATASETS_DIR,
                            **{dp_static.DATASET_NAME: dataset_name})

    if not new_dataset_info.success:
        return JsonResponse(get_json_error(new_dataset_info.err_msg))
    #udu = UserDatasetUtil(1, input_files, output_dir)

    return JsonResponse(get_json_success('file upload completed successfully'))
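
Example #8 reads metadata from request.POST and the uploads from request.FILES.getlist('files'). One way to exercise the view during development is Django's test client, assuming an authenticated session with an existing user workspace; the URL and metadata key below are placeholders (the view looks up dp_static.DATASET_NAME_FROM_UI), and passing a list of open file handles under one key is how multiple files end up in getlist:

import json
from django.test import Client

client = Client()
# client.login(...) would be needed first for the workspace lookup to succeed.
with open('data_part1.csv', 'rb') as file1, open('data_part2.csv', 'rb') as file2:
    response = client.post(
        '/upload-dataset/',   # placeholder URL for view_upload_dataset
        {'metadata': json.dumps({'datasetName': 'My Uploaded Data'}),  # key assumed
         'files': [file1, file2]})
print(response.json())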
Example #9
    def make_doc(self):
        """Create the docs"""
        if self.has_error():
            return

        dataset_id = self.about['datasetName'].replace(' ', '_')

        # construct a mapping to output paths
        inout_data_paths = OrderedDict()

        print('-- Iterate through input files --')
        for src_data_path in self.input_data_paths:
            offset = 1
            print('src_data_path', src_data_path)

            file_ext = splitext(src_data_path)[1].lower()
            if file_ext not in dp_static.VALID_EXTENSIONS:
                print('  -> Invalid extension, skipping: ', file_ext)
                continue

            # Set the output file name: learningData.csv, learningData_01.csv, etc.
            filename = 'learningData'
            candidate_name = join('tables', filename + '.csv')
            while candidate_name in inout_data_paths.values():
                offset += 1
                offset_str = f'_{str(offset).zfill(2)}'
                #_name, extension = os.path.splitext(os.path.basename(src_data_path))
                candidate_name = join('tables', f'{filename}{offset_str}.csv')

            inout_data_paths[src_data_path] = candidate_name
            print(' -> post-conversion name:', candidate_name)

        print('inout_data_paths', inout_data_paths)

        def infer_roles(column_name):
            """Infer column role"""
            roles = []
            if column_name == 'd3mIndex':
                roles.append('index')
            elif column_name in self.targets:
                roles.append('suggestedTarget')
            else:
                roles.append('attribute')

            if column_name in self.problem.get('time', []):
                roles.append('timeIndicator')
            return roles

        target_configs = []
        # individually load, index, analyze, and save each dataset
        resource_configs = []

        # Iterate through input files / proposed output files
        #   - Open the input file and write it as a .csv
        #   - From each input file, gather information for the dataset doc
        #
        for input_path, output_data_path in inout_data_paths.items():
            #print('Doc Maker 3: Attempt to read:', input_path)
            data_info = self.d3m_load_resource(input_path)
            if not data_info.success:
                self.add_err_msg(data_info.err_msg)
                return
            data = data_info.result_obj

            if not isinstance(data, pd.DataFrame):
                user_msg = (f'Failed to load the file into a'
                            f' data frame: {input_path}')
                self.add_err_msg(user_msg)
                return

            resourceID = splitext(basename(input_path))[0]

            columnConfigs = []
            for colIndex, (colName, colType) in enumerate(
                    zip(data.columns.values, data.dtypes)):
                columnConfig = {
                    'colIndex': colIndex,
                    'colName': colName,
                    'colType': (dp_static.DTYPES.get(str(colType), None)
                                or 'unknown'),
                    'role': infer_roles(colName)
                }
                columnConfigs.append(columnConfig)
                if columnConfig['role'][0] == 'suggestedTarget':
                    target_configs.append({
                        'resID': resourceID,
                        'colIndex': colIndex,
                        'colName': colName
                    })

            # output_data_path = join('tables', 'learningData.csv')

            resource_configs.append({
                'resID': resourceID,
                'resPath': output_data_path,
                'resType': 'table',
                'resFormat': {'text/csv': ['csv']},
                'isCollection': False,
                'columns': columnConfigs
            })

            final_data_file_path = join(self.dataset_output_dir,
                                        output_data_path)

            dir_info = create_directory(dirname(final_data_file_path))

            if not dir_info.success:
                self.add_err_msg(dir_info.err_msg)
                return

            data.to_csv(final_data_file_path, index=False)

        # write dataset config
        self.dataset_doc_path = join(self.dataset_output_dir,
                                     'datasetDoc.json')
        with open(self.dataset_doc_path, 'w') as dataset_doc:
            dataset_doc.write(json.dumps(
                {
                    'about': {
                        **{
                            'datasetID': dataset_id,
                            'datasetSchemaVersion': dp_static.DATASET_SCHEMA_VERSION,
                            'redacted': True,
                            'digest': hashlib.sha256(
                                self.about['datasetName'].encode()).hexdigest()
                        },
                        **self.about
                    },
                    'dataResources': resource_configs
                },
                indent=4))
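
Once make_doc finishes, dataset_output_dir holds the converted .csv files plus the datasetDoc.json written above. A small sanity-check sketch (the path below is illustrative) that reloads the doc and lists each resource with its suggested target columns:

import json

doc_path = '/path/to/new_dataset_id/TRAIN/dataset_TRAIN/datasetDoc.json'  # illustrative
with open(doc_path) as doc_file:
    dataset_doc = json.load(doc_file)

print('datasetID:', dataset_doc['about']['datasetID'])
for resource in dataset_doc['dataResources']:
    target_cols = [col['colName'] for col in resource['columns']
                   if 'suggestedTarget' in col['role']]
    print(resource['resID'], '->', resource['resPath'], '| targets:', target_cols)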