Example 1
    def store(self, resource_instance):
        '''
        Handles moving the file described by the `resource_instance`
        arg to its final location.
        '''
        relative_path = self.construct_relative_path(resource_instance)

        # where all user files are kept locally:
        base_storage_dir = os.path.join(settings.DATA_DIR, STORAGE_DIRNAME)

        # the final location of this file on our local storage:
        destination = os.path.join(base_storage_dir, relative_path)

        storage_dir = os.path.dirname(destination)
        if not os.path.exists(storage_dir):

            # this function can raise an exception which will get
            # pushed up to the caller
            make_local_directory(storage_dir)

        # the storage directory exists at this point. Move the file:
        source = resource_instance.path

        if os.path.exists(source):  # if on the local filesystem
            move_resource(source, destination)
            return destination
        else:
            # NOT on the local filesystem. go get it.
            return localize_remote_resource(resource_instance)
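The `move_resource` helper called above is not shown in these examples. A minimal sketch, assuming it is simply a logged wrapper around `shutil.move` (the body here is an assumption, not the project's actual implementation):

import logging
import shutil

logger = logging.getLogger(__name__)

def move_resource(source, destination):
    # Hypothetical helper: move a file on the local filesystem to its final
    # location and return that destination path.
    logger.info('Moving %s to %s', source, destination)
    shutil.move(source, destination)
    return destination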
Example 2
    def run(self, executed_op, op_data, validated_inputs):
        logger.info('Running in remote Cromwell mode.')
        logger.info('Executed op type: %s' % type(executed_op))
        logger.info('Executed op ID: %s' % str(executed_op.id))
        logger.info('Op data: %s' % op_data)
        logger.info(validated_inputs)

        # the UUID identifying the execution of this operation:
        execution_uuid = str(executed_op.id)

        # get the operation dir so we can look at which converters to use:
        op_dir = os.path.join(settings.OPERATION_LIBRARY_DIR,
                              str(op_data['id']))

        # create a sandbox directory where we will store the files:
        staging_dir = os.path.join(settings.OPERATION_EXECUTION_DIR,
                                   execution_uuid)
        make_local_directory(staging_dir)

        # create the Cromwell-compatible inputs.json from the user inputs
        self._create_inputs_json(op_dir, validated_inputs, staging_dir)

        # copy over the workflow contents:
        self._copy_workflow_contents(op_dir, staging_dir)

        # construct the request to the Cromwell server:
        self.send_job(staging_dir, executed_op)
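The `_create_inputs_json` method is not shown in this excerpt. Cromwell reads its inputs from a JSON file keyed by fully-qualified workflow input names; a minimal sketch under that assumption (the workflow name `MainWorkflow` and the pass-through mapping are illustrative, not the project's actual converter logic):

import json
import os

def _create_inputs_json(self, op_dir, validated_inputs, staging_dir):
    # Hypothetical sketch: Cromwell expects keys like "MainWorkflow.some_input".
    # The real method presumably applies per-input converters found in op_dir;
    # that logic is omitted here.
    inputs = {'MainWorkflow.{name}'.format(name=k): v
              for k, v in validated_inputs.items()}
    with open(os.path.join(staging_dir, 'inputs.json'), 'w') as fout:
        json.dump(inputs, fout, indent=2)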
Example 3
    def __init__(self):
        logger.debug('In __init__ with self.ROOT_DIR=%s', self.ROOT_DIR)
        if not os.path.exists(self.ROOT_DIR):
            logger.info(
                'When instantiating an instance of GtexRnaseqDataSource, the'
                ' expected directory did not exist. Creating it...')
            make_local_directory(self.ROOT_DIR)
        self.date_str = datetime.datetime.now().strftime('%m%d%Y')
Example 4
    def _create_tmp_dir(self):
        '''
        Create a temporary directory where we munge all the files
        '''
        folder_name = 'tmp-{u}'.format(u=uuid.uuid4())
        tmp_dir = os.path.join(settings.DATA_DIR, folder_name)
        if not os.path.exists(tmp_dir):
            make_local_directory(tmp_dir)
        return tmp_dir
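Every example in this listing calls `make_local_directory`, which is not reproduced here. A minimal sketch, assuming it wraps `os.makedirs` and lets failures propagate to the caller (as the comment in the first example describes):

import logging
import os

logger = logging.getLogger(__name__)

def make_local_directory(directory_path):
    # Hypothetical sketch: create the directory and any missing parents,
    # letting OSError propagate so callers can handle or report the failure.
    logger.info('Creating local directory at %s', directory_path)
    os.makedirs(directory_path, exist_ok=True)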
Example 5
    def get_local_resource_path(self, resource_instance):
        '''
        Returns the path to the file resource on the local machine.
        
        For this case of Google bucket-based storage, we download
        the blob to the local cache dir if it does not already exist
        there. 
        '''
        logger.info('Pulling Resource ({pk}) from google storage'
                    ' to local cache.'.format(pk=resource_instance.pk))
        # the path relative to the "root" of the storage backend
        relative_path = self.construct_relative_path(resource_instance)

        local_cache_location = os.path.join(
            settings.RESOURCE_CACHE_DIR,
            relative_path
        )

        # need to check that the cache directory for this user exists:
        user_cache_dir = os.path.dirname(local_cache_location)
        if not os.path.exists(user_cache_dir):
            logger.info('User cache dir did not exist.  Create it.')
            make_local_directory(user_cache_dir)

        # if the file doesn't exist in our cache, go get it
        if not os.path.exists(local_cache_location):
            logger.info('Did not locate file in local cache. Download it.')
            blob = self.get_blob(resource_instance.path)
            if blob is None:
                raise Exception('The object located at {p} did not exist.'.format(
                    p=resource_instance.path))
            try:
                self.download_blob(blob, local_cache_location)
            except Exception as ex:
                logger.error('Could not complete download to local cache.'
                    ' Requested file was at {blob}.'.format(
                        blob=blob
                    )
                )
                raise ex
        else:
            logger.info('Resource was already located in the local cache.')
        return local_cache_location 
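The `get_blob` and `download_blob` helpers are not shown. Assuming the standard `google-cloud-storage` client and a `gs://bucket/object` path convention (both assumptions), minimal sketches consistent with the calls above might be:

from google.cloud import storage

def get_blob(self, path):
    # Hypothetical sketch assuming `path` looks like "gs://bucket-name/object/key".
    client = storage.Client()
    bucket_name, object_name = path[len('gs://'):].split('/', 1)
    # Bucket.get_blob returns None when the object does not exist, which
    # matches the `if blob is None` check above.
    return client.bucket(bucket_name).get_blob(object_name)

def download_blob(self, blob, local_path):
    # Hypothetical sketch: write the object's contents to the local cache path.
    blob.download_to_filename(local_path)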
Example 6
    def setup_files(self, dataset_id):
        '''
        This creates the solr core directory for the repository 
        and adds the proper files
        '''
        solr_home_path = os.path.join(os.path.dirname(settings.BASE_DIR),
                                      'solr')
        core_dir_path = os.path.join(solr_home_path, dataset_id)
        make_local_directory(core_dir_path)

        # copy the solrconfig template. This solrconfig prevents certain
        # default behaviors that could cause problems, such as adding new
        # fields to the existing schema, which we don't want.
        basic_config_path = os.path.join(solr_home_path,
                                         'basic_solrconfig.xml')
        dest = os.path.join(core_dir_path, 'solrconfig.xml')
        copy_local_resource(basic_config_path, dest)

        return core_dir_path
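`copy_local_resource` is likewise not shown; a minimal sketch, assuming a thin wrapper over `shutil.copyfile` (an assumption, not the project's actual helper):

import shutil

def copy_local_resource(source, destination):
    # Hypothetical sketch: copy a local file, leaving the source in place.
    shutil.copyfile(source, destination)
    return destination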
Example 7
    def create_from_query(self, database_record, query_filter, output_name=''):
        '''
        subsets the dataset based on the query_filter.
        Returns a 3-tuple of lists:
        - a list of paths
        - a list of names for the files 
        - a list of resource types
        '''
        # Look at the database object to get the path for the count matrix
        file_mapping = database_record.file_mapping
        count_matrix_path = file_mapping[self.COUNTS_FILE_KEY][0]
        if not os.path.exists(count_matrix_path):
            #TODO: better error handling here.
            logger.info('Could not find the count matrix')
            raise Exception('Failed to find the proper data for this'
                            ' request. An administrator has been notified')

        # if the query_filter was None (indicating no filtering was desired)
        # then we reject-- this dataset is too big to store as a single
        # dataframe
        if query_filter is None:
            raise Exception(
                'The {name} dataset is too large to request without'
                ' any filters. Please try again and request a'
                ' subset of the data.'.format(name=self.PUBLIC_NAME))

        # to properly filter our full HDF5 matrix, we expect a data structure that
        # looks like:
        #
        # {'tissue A': [<sample ID>, <sample ID>], 'tissue B': [<sample ID>]}
        #
        # The top level contains identifiers which we use to select
        # the groups within the HDF5 file. Then, we use the sample IDs to filter the
        # dataframes
        final_df = pd.DataFrame()
        with pd.HDFStore(count_matrix_path, 'r') as hdf:
            for ct in query_filter.keys():
                if not isinstance(query_filter[ct], list):
                    raise Exception(
                        'Problem encountered with the filter'
                        ' provided. We expect each cancer type to address'
                        ' a list of sample identifiers, such as: {j}'.format(
                            j=json.dumps(self.EXAMPLE_PAYLOAD)))
                group_id = RnaSeqMixin.create_python_compatible_id(ct) + '/ds'
                try:
                    df = hdf.get(group_id)
                except KeyError as ex:
                    raise Exception(
                        'The requested project'
                        ' {ct} was not found in the dataset. Ensure your'
                        ' request was correctly formatted.'.format(ct=ct))
                try:
                    df = df[query_filter[ct]]
                except KeyError as ex:
                    message = (
                        'The subset of the count matrix failed since'
                        ' one or more requested samples were missing: {s}'.
                        format(s=str(ex)))
                    raise Exception(message)

                final_df = pd.concat([final_df, df], axis=1)

        if final_df.shape[1] == 0:
            raise Exception('The resulting matrix was empty. No'
                            ' data was created.')

        # write the file to a temp location:
        filename = '{u}.tsv'.format(u=str(uuid.uuid4()))
        dest_dir = os.path.join(settings.DATA_DIR, 'tmp')
        if not os.path.exists(dest_dir):
            make_local_directory(dest_dir)
        count_filepath = os.path.join(dest_dir, filename)
        try:
            final_df.to_csv(count_filepath, sep='\t')
        except Exception as ex:
            logger.info('Failed to write the subset of GDC RNA-seq.'
                        ' Exception was: {ex}'.format(ex=ex))
            raise Exception('Failed when writing the filtered data.')

        # now create the annotation file:
        full_uuid_list = []
        for k in query_filter:
            full_uuid_list.extend(query_filter[k])
        ann_path = file_mapping[self.ANNOTATION_FILE_KEY][0]
        if not os.path.exists(ann_path):
            #TODO: better error handling here.
            logger.info('Could not find the annotation matrix')
            raise Exception('Failed to find the proper data for this'
                            ' request. An administrator has been notified')
        ann_df = pd.read_csv(ann_path, index_col=0)
        subset_ann = ann_df.loc[full_uuid_list]

        # drop columns which are completely empty:
        subset_ann = subset_ann.dropna(axis=1, how='all')

        filename = '{u}.tsv'.format(u=str(uuid.uuid4()))

        ann_filepath = os.path.join(dest_dir, filename)
        try:
            subset_ann.to_csv(ann_filepath, sep='\t')
        except Exception as ex:
            logger.info(
                'Failed to write the subset of the annotation dataframe.'
                ' Exception was: {ex}'.format(ex=ex))
            raise Exception(
                'Failed when writing the filtered annotation data.')

        # finally make some names for these files, which we return
        if output_name == '':
            u = str(uuid.uuid4())
            count_matrix_name = self.TAG + '_counts.' + u + '.tsv'
            ann_name = self.TAG + '_ann.' + u + '.tsv'
        else:
            count_matrix_name = output_name + '_counts.' + self.TAG + '.tsv'
            ann_name = output_name + '_ann.' + self.TAG + '.tsv'
        return [count_filepath, ann_filepath], \
                [count_matrix_name, ann_name], \
                ['RNASEQ_COUNT_MTX', 'ANN']
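A hypothetical call, using the payload shape documented in the comments above (the dataset handle, database record, and identifiers are purely illustrative):

query_filter = {
    'TCGA-LUAD': ['sample-uuid-1', 'sample-uuid-2'],
    'TCGA-BRCA': ['sample-uuid-3']
}
paths, names, resource_types = dataset.create_from_query(
    database_record, query_filter, output_name='my_subset')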
Example 8
    def __init__(self):
        if not os.path.exists(self.ROOT_DIR):
            logger.info('When instantiating an instance of TCGADataSource, the'
                        ' expected directory did not exist. Creating it...')
            make_local_directory(self.ROOT_DIR)
Example 9
    def run(self, executed_op, op_data, validated_inputs):
        logger.info('Running in local Docker mode.')
        logger.info('Executed op type: %s' % type(executed_op))
        logger.info('Executed op ID: %s' % str(executed_op.id))
        logger.info('Op data: %s' % op_data)
        logger.info(validated_inputs)

        # the UUID identifying the execution of this operation:
        execution_uuid = str(executed_op.id)

        # get the operation dir so we can look at which converters and command to use:
        op_dir = os.path.join(settings.OPERATION_LIBRARY_DIR,
                              str(op_data['id']))

        # To avoid conflicts or corruption of user data, we run each operation in its
        # own sandbox. We must first copy over their files to that sandbox dir.
        execution_dir = os.path.join(settings.OPERATION_EXECUTION_DIR,
                                     execution_uuid)
        make_local_directory(execution_dir)

        # convert the user inputs into args compatible with command-line usage.
        # For instance, a differential gene expression analysis requires one to
        # specify the samples in each group; to do this, the Operation requires
        # that two ObservationSet instances be submitted as arguments. The
        # "translator" takes those ObservationSet data structures and turns them
        # into something the call will use, e.g. a CSV list submitted as one of
        # the args, like:
        # docker run <image> run_something.R -a sampleA,sampleB -b sampleC,sampleD
        arg_dict = self._map_inputs(op_dir, validated_inputs, execution_dir)

        logger.info('After mapping the user inputs, we have the'
                    ' following structure: {d}'.format(d=arg_dict))

        # Construct the command that will be run in the container:
        entrypoint_file_path = os.path.join(op_dir, self.ENTRYPOINT_FILE)
        if not os.path.exists(entrypoint_file_path):
            logger.error(
                'Could not find the required entrypoint file at {p}.'
                ' Something must have corrupted the operation directory.'.
                format(p=entrypoint_file_path))
            raise Exception('The repository must have been corrupted.'
                            ' Failed to find the entrypoint file.'
                            ' Check dir at: {d}'.format(d=op_dir))
        entrypoint_cmd = self._get_entrypoint_command(entrypoint_file_path,
                                                      arg_dict)

        image_str = get_image_name_and_tag(op_data['repo_name'],
                                           op_data['git_hash'])

        cmd = self.DOCKER_RUN_CMD.format(
            container_name=execution_uuid,
            execution_mount=settings.OPERATION_EXECUTION_DIR,
            work_dir=settings.OPERATION_EXECUTION_DIR,
            job_dir=execution_dir,
            docker_image=image_str,
            cmd=entrypoint_cmd)
        try:
            run_shell_command(cmd)
            executed_op.job_id = execution_uuid
            executed_op.save()
        except Exception as ex:
            logger.info('Failed when running shell command: {c}'.format(c=cmd))
            logger.info('Exception was: {ex}'.format(ex=ex))
            # if an exception is raised when issuing the Docker run
            # command, then the job has failed. This is likely not due to
            # user error, but rather to a problem with the issued command
            # or with allocating appropriate Docker resources.
            executed_op.job_failed = True
            executed_op.execution_stop_datetime = datetime.datetime.now()
            executed_op.status = ExecutedOperation.ADMIN_NOTIFIED
            executed_op.save()
            alert_admins(str(ex))
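`DOCKER_RUN_CMD` is a class-level template that does not appear in this excerpt. A plausible shape, given the keyword arguments passed to `format` above, might look like the following; the flags and layout are illustrative assumptions, not the project's actual template:

# Hypothetical template; the real one presumably also uses {work_dir}.
# Unused keyword arguments to str.format are simply ignored, and the flags
# shown (-d, --name, -v, -w) are standard docker CLI options.
DOCKER_RUN_CMD = ('docker run -d --name {container_name}'
                  ' -v {execution_mount}:{execution_mount}'
                  ' -w {job_dir}'
                  ' {docker_image} {cmd}')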