Example #1
class genome_wide_association_studies:
    '''
    Module Name:
    genome_wide_association_studies

    Module Description:
    A KBase module: genome_wide_association_studies
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "[email protected]:pranjan77/genome_wide_association_studies"
    GIT_COMMIT_HASH = "fb9bc0c5d00a0314e9f832d30a996c7448e61db9"

    #BEGIN_CLASS_HEADER
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = os.path.abspath(config['scratch'])
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        # self.shared_folder = os.path.abspath(config['scratch'])
        self.dfu = DataFileUtil(self.callbackURL)

        #END_CONSTRUCTOR
        pass

    def import_gwas_data(self, ctx, import_gwas_data_params):
        """
        :param import_gwas_data_params: instance of type
           "import_gwas_data_params" (Insert your typespec information here.)
           -> structure: parameter "input_shock_id" of String, parameter
           "input_file_path" of String
        :returns: instance of type "Run_import_gwas_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_gwas_data

        # Download file from staging area
        params = import_gwas_data_params
        print(params)

        # download_staging_file_params = {
        #     'staging_file_subdir_path': params.get('staging_file_subdir_path')
        # }
        # try:
        #     scratch_file_path = self.dfu.download_staging_file(
        #         download_staging_file_params).get('copy_file_path')
        # except Exception:
        #     raise ValueError('error downloading staging file')
        gwas_utils = gwas_results_utils.gwas_results_utils(self.config)

        returnVal = gwas_utils.run_import_gwas_results(params)

        #END import_gwas_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_gwas_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_snp_data(self, ctx, import_snp_data_params):
        """
        :param import_snp_data_params: instance of type
           "import_snp_data_params" -> structure: parameter "input_shock_id"
           of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_snp_data_result" -> structure:
           parameter "report_ref" of String, parameter "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_snp_data
        params = import_snp_data_params
        print(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        try:
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
        except Exception:
            raise ValueError('error downloading staging file')

        print(scratch_file_path)
        # TODO: parse the downloaded SNP file; return an empty result for
        # now so the type check below passes.
        returnVal = {}

        #END import_snp_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_snp_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_trait_data(self, ctx, import_trait_data_params):
        """
        :param import_trait_data_params: instance of type
           "import_trait_data_params" -> structure: parameter
           "input_shock_id" of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_trait_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_trait_data
        params = import_trait_data_params
        print(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        try:
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
        except Exception:
            raise ValueError('error downloading staging file')
        # TODO: process the downloaded trait file at scratch_file_path;
        # return an empty result for now so the type check below passes.
        returnVal = {}
        #END import_trait_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_trait_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_network_data(self, ctx, import_network_data_params):
        """
        :param import_network_data_params: instance of type
           "import_network_data_params" -> structure: parameter
           "input_shock_id" of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_network_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_network_data
        params = import_network_data_params
        print(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        try:
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
        except Exception:
            raise ValueError('error downloading staging file')
        # TODO: process the downloaded network file at scratch_file_path;
        # return an empty result for now so the type check below passes.
        returnVal = {}
        #END import_network_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_network_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def import_motif_data(self, ctx, import_motif_data_params):
        """
        :param import_motif_data_params: instance of type
           "import_motif_data_params" -> structure: parameter
           "input_shock_id" of String, parameter "input_file_path" of String
        :returns: instance of type "Run_import_motif_data_result" ->
           structure: parameter "report_ref" of String, parameter
           "report_name" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN import_motif_data
        # TODO: not yet implemented; return an empty result so the type
        # check below passes.
        returnVal = {}
        #END import_motif_data

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method import_motif_data return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
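
# A minimal, self-contained sketch of the generated SDK method pattern used
# above: user code between #BEGIN/#END must bind returnVal to a dict, and the
# generated wrapper type-checks it and returns it in a single-element list.
# The report values here are hypothetical.
def sdk_method(params):
    returnVal = {'report_name': 'r', 'report_ref': '1/2/3'}  # hypothetical
    if not isinstance(returnVal, dict):
        raise ValueError('returnVal is not type dict as required.')
    return [returnVal]

assert sdk_method({})[0]['report_ref'] == '1/2/3'
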
class ImportMediaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_media_from_staging(self, params):
        '''
          import_media_from_staging: wrapper method for
                                    FBAFileUtil.tsv_file_to_media
                                    and
                                    FBAFileUtil.excel_file_to_media

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          media_name - output Media file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''
        log('--->\nrunning ImportMediaUtil.import_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')
        file = {
            'path': scratch_file_path
        }
        import_media_params = params
        import_media_params['media_file'] = file

        try:
            ref = self.fba.tsv_file_to_media(import_media_params)
        except Exception:
            try:
                ref = self.fba.excel_file_to_media(import_media_params)
            except Exception:
                raise ValueError('"{}" is not a valid Excel or TSV file'.format(
                                                params.get('staging_file_subdir_path')))
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), ref.get('ref'))

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def import_tsv_as_media_from_staging(self, params):
        '''
          import_tsv_as_media_from_staging: wrapper method for
                                    FBAFileUtil.tsv_file_to_media

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          media_name - output Media file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportMediaUtil.import_tsv_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        file = {
            'path': scratch_file_path
        }

        import_media_params = params
        import_media_params['media_file'] = file

        ref = self.fba.tsv_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def import_excel_as_media_from_staging(self, params):
        '''
          import_excel_as_media_from_staging: wrapper method for
                                    FBAFileUtil.excel_file_to_media

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          media_name - output Media file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportMediaUtil.import_excel_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        file = {
            'path': scratch_file_path
        }

        import_media_params = params
        import_media_params['media_file'] = file

        ref = self.fba.excel_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def validate_import_media_from_staging_params(self, params):
        """
        validate_import_media_from_staging_params:
                    validates params passed to import_excel(tsv)_as_media_from_staging method

        """

        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'media_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                        import_excel(tsv)_as_media_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the object will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Media Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(
                              params.get('staging_file_subdir_path'))

        report_params = {
              'message': upload_message,
              'workspace_name': params.get('workspace_name'),
              'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
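
# The staging_file_subdir_path convention documented in the docstrings above
# (strip the '/data/bulk/<user_name>/' prefix) expressed as a small helper.
# This is an illustrative sketch with hypothetical paths, not a function
# these utilities provide.
def to_staging_subdir_path(abs_path, user_name):
    """Return a staging-area path relative to the user's staging root."""
    root = '/data/bulk/{}/'.format(user_name)
    if not abs_path.startswith(root):
        raise ValueError('not a staging-area path: {}'.format(abs_path))
    return abs_path[len(root):]

assert to_staging_subdir_path('/data/bulk/user_name/file_name',
                              'user_name') == 'file_name'
assert to_staging_subdir_path('/data/bulk/user_name/subdir_1/subdir_2/file_name',
                              'user_name') == 'subdir_1/subdir_2/file_name'
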
Example #3
class ImportGFFFastaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = kb_gffupload(self.callback_url, service_ver='dev')

    def import_gff_fasta_from_staging(self, params):
        '''
          import_gff_fasta_from_staging: wrapper method for kb_gffupload.fasta_gff_to_genome

          required params:
          fasta_file: fasta file from user's staging area
          gff_file: gff file from user's staging area
          genome_name: output genome object name
          workspace_name: workspace name that genome will be stored to

          file paths for both fasta and gff files must be subdirectory file path in staging area
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name

          return:
          genome_ref: return object reference
          report_name: name of generated report (if any)
          report_ref: report reference (if any)
        '''

        log('--->\nrunning ImportGFFFastaUtil.import_gff_fasta_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_gff_fasta_from_staging_params(params)

        #If not testing, fetch from staging
        if ('test' not in params or params['test'] != 1):
            for key in ('fasta_file', 'gff_file'):
                file_path = params[key]
                file = os.path.basename(file_path)

                download_staging_file_params = {
                    'staging_file_subdir_path': file
                }
                dfu_returnVal = self.dfu.download_staging_file(
                    download_staging_file_params)
                params[key] = dfu_returnVal['copy_file_path']

        print(params)
        returnVal = self.gfu.fasta_gff_to_genome(params)
        return returnVal

    def validate_import_gff_fasta_from_staging_params(self, params):
        """
        validate_import_gff_fasta_from_staging_params: validates params passed to fasta_gff_to_genome method

        """

        # check for required parameters
        for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        # for now must use workspace name, but no ws_id_to_name() function available
        if str(params["workspace_name"]).isdigit():
            raise ValueError(
                '"' + str(params["workspace_name"]) +
                '" looks like a workspace id; a workspace name is required')

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method

        """

        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'genome_name', 'workspace_name',
                'source'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')
Example #4
class ImportExpressionMatrixUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_tsv_as_expression_matrix_from_staging(self, params):
        '''
        import_tsv_as_expression_matrix_from_staging: wrapper method for
                                    KBaseFeatureValues.tsv_file_to_matrix

        required params:
            staging_file_subdir_path: subdirectory file path
              e.g.
                for file: /data/bulk/user_name/file_name
                staging_file_subdir_path is file_name
                for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
                staging_file_subdir_path is subdir_1/subdir_2/file_name
            matrix_name: output Expression Matrix object name
            workspace_name: workspace name/ID of the object

        optional params:
            genome_ref: optional reference to a Genome object that will be
                  used for mapping feature IDs
            fill_missing_values: optional flag for filling in missing
                    values in the matrix (default value is false)
            data_type: optional field; value is one of 'untransformed',
                    'log2_level', 'log10_level', 'log2_ratio', 'log10_ratio' or
                    'unknown' (the default)
            data_scale: optional parameter (default value is '1.0')

        return:
            obj_ref: return object reference
        '''

        log('--->\nrunning ImportExpressionMatrixUtil.import_tsv_as_expression_matrix_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_tsv_as_expression_matrix_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        import_matrix_params = params
        import_matrix_params['input_file_path'] = scratch_file_path
        import_matrix_params['output_ws_name'] = params.get('workspace_name')
        import_matrix_params['output_obj_name'] = params.get('matrix_name')

        ref = self.fv.tsv_file_to_matrix(import_matrix_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'),
                                                   ref.get('output_matrix_ref'))
        returnVal = {'obj_ref': ref.get('output_matrix_ref')}

        return returnVal

    def validate_import_tsv_as_expression_matrix_from_staging_params(self, params):
        """
        validate_import_tsv_as_expression_matrix_from_staging_params:
                    validates params passed to import_tsv_as_expression_matrix_from_staging method

        """

        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'matrix_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_tsv_as_expression_matrix_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the object will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Expression Matrix Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported TSV File: {}\n'.format(
                              params.get('staging_file_subdir_path'))

        report_params = {
              'message': upload_message,
              'workspace_name': params.get('workspace_name'),
              'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
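
# Note: import_matrix_params = params above binds a second name to the same
# dict, so the caller's params are mutated in place. A shallow copy avoids
# that; a minimal sketch with hypothetical values:
params = {'staging_file_subdir_path': 'matrix.tsv',
          'workspace_name': 'my_workspace',
          'matrix_name': 'my_matrix'}
import_matrix_params = dict(params)  # shallow copy; params stays untouched
import_matrix_params['output_ws_name'] = params['workspace_name']
import_matrix_params['output_obj_name'] = params['matrix_name']
assert 'output_ws_name' not in params
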
class ImportFBAModelUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fbamodel_from_staging(self, params):

        log('--->\nrunning {}.{}\n params:\n{}'.format(
            self.__class__.__name__,
            sys._getframe().f_code.co_name, json.dumps(params, indent=1)))

        self._check_param(params, [
            'model_file', 'file_type', 'workspace_name', 'model_name',
            'biomass'
        ], ['genome', 'compounds_file'])
        if params['file_type'] == 'tsv' and not params.get(
                'compounds_file', None):
            raise ValueError('A compound file is required for tsv upload.')

        fba_tools_params = params.copy()
        for infile in ['model_file', 'compounds_file']:
            if not params.get(infile, None):
                continue
            download_staging_file_params = {
                'staging_file_subdir_path': params[infile]
            }
            scratch_file_path = self.dfu.download_staging_file(
                download_staging_file_params).get('copy_file_path')
            fba_tools_params[infile] = {'path': scratch_file_path}

        if params['file_type'] == 'sbml':
            res = self.fba.sbml_file_to_model(fba_tools_params)
        elif params['file_type'] == 'excel':
            res = self.fba.excel_file_to_model(fba_tools_params)
        elif params['file_type'] == 'tsv':
            res = self.fba.tsv_file_to_model(fba_tools_params)
        else:
            raise ValueError('"{}" is not a valid import file_type'.format(
                params['file_type']))
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            download_staging_file_params.get('staging_file_subdir_path'),
            res['ref'])
        return {'obj_ref': res['ref']}

    @staticmethod
    def _check_param(in_params, req_param, opt_param=list()):
        """
        Check if each of the params in the list are in the input params
        """
        for param in req_param:
            if param not in in_params:
                raise ValueError(
                    'Required parameter "{}" is missing'.format(param))
        defined_param = set(req_param + opt_param)
        for param in in_params:
            if param not in defined_param:
                print('WARNING: received unexpected parameter "{}"'.format(
                    param))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                        import_excel(tsv)_as_media_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the object will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        upload_message += "FBAModel Object Name: "
        upload_message += params['model_name'] + '\n'
        upload_message += 'Imported File: {}\n'.format(
            params.get('model_file'))

        report_params = {
            'message':
            upload_message,
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Imported FBAModel'
            }],
            'workspace_name':
            params.get('workspace_name'),
            'report_object_name':
            'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
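
# The if/elif chain in import_fbamodel_from_staging maps file_type to an
# fba_tools converter; the same dispatch can be written as a table. Plain
# lambdas stand in for the fba_tools client methods (hypothetical stand-ins).
CONVERTERS = {
    'sbml': lambda p: {'ref': '1/2/3'},   # stands in for fba.sbml_file_to_model
    'excel': lambda p: {'ref': '1/2/3'},  # stands in for fba.excel_file_to_model
    'tsv': lambda p: {'ref': '1/2/3'},    # stands in for fba.tsv_file_to_model
}

def convert(file_type, fba_tools_params):
    if file_type not in CONVERTERS:
        raise ValueError('"{}" is not a valid import file_type'.format(file_type))
    return CONVERTERS[file_type](fba_tools_params)

assert convert('sbml', {})['ref'] == '1/2/3'
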
Example #6
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''
        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        result_file_path = os.path.join(self.scratch, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in assembly_overview_data.iteritems():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template_assembly.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', overview_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the object will be stored to
        
        """
        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)
        objects_created = [{
            'ref': obj_ref,
            'description': 'Imported Assembly'
        }]

        output_html_files = self.generate_html_report(obj_ref, object_data,
                                                      params)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 270,
            'report_object_name': 'kb_upload_assembly_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
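
# The overview table in generate_html_report is built by iterating an
# OrderedDict; a standalone sketch of the same HTML assembly with
# hypothetical values. (iteritems() above is Python 2; items() is the
# portable spelling.)
import collections

overview = collections.OrderedDict([('Name', 'my_assembly (1/2/3)'),
                                    ('DNA Size', 4641652)])
content = '<br/><table>\n'
for key, val in overview.items():
    content += '<tr><td><b>{}</b></td><td>{}</td></tr>\n'.format(key, val)
content += '</table>'
assert '<td><b>DNA Size</b></td>' in content
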
class ImportGenbankUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)

    def import_genbank_from_staging(self, params):
        '''
          import_genbank_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          genome_name - becomes the name of the object
          workspace_name - the name of the workspace it gets saved to.
          source - Source of the file, typically something like RefSeq or Ensembl

          optional params:
          release - Release or version number of the data;
              for example, Ensembl has numbered releases of all their data: Release 31
          generate_ids_if_needed - If the field used for feature IDs is missing,
              generate IDs (default behavior is to raise an exception)
          genetic_code - Genetic code of the organism. Overrides the genetic
              code determined from the taxon object
          type - Reference, Representative or User upload

          return:
          genome_ref: return object reference
        '''

        log('--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        file = {
            'path': scratch_file_path
        }

        import_genbank_params = params
        import_genbank_params['file'] = file
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)

        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method

        """

        # check for required parameters
        for p in ['staging_file_subdir_path', 'genome_name', 'workspace_name', 'source']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')
class ImportGenbankUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'], 'import_GenBank_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
        self.uploader_utils = UploaderUtil(config)

    def import_genbank_from_staging(self, params):
        '''
          import_genbank_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          genome_name - becomes the name of the object
          workspace_name - the name of the workspace it gets saved to.
          source - Source of the file, typically something like RefSeq or Ensembl

          optional params:
          release - Release or version number of the data;
              for example, Ensembl has numbered releases of all their data: Release 31
          generate_ids_if_needed - If the field used for feature IDs is missing,
              generate IDs (default behavior is to raise an exception)
          genetic_code - Genetic code of the organism. Overrides the genetic
              code determined from the taxon object
          type - Reference, Representative or User upload

          return:
          genome_ref: return object reference
        '''

        log('--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                                 download_staging_file_params).get('copy_file_path')
        file = {
            'path': scratch_file_path
        }
        import_genbank_params = params
        import_genbank_params['file'] = file
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)

        """
        Update the workspace object related meta-data for staged file
        """
        #self.uploader_utils.update_staging_service(
        #    download_staging_file_params.get('staging_file_subdir_path'),
        #    returnVal['genome_ref'])
        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'genome_name', 'workspace_name', 'source']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_html_report(self, genome_ref, params):
        """
        _generate_html_report: generate html summary report
        """
        log('start generating html report')
        genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})
        html_report = list()
        result_file_path = os.path.join(self.scratch, 'report.html')

        genome_name = str(genome_obj.get('data')[0].get('info')[1])
        genome_file = params.get('staging_file_subdir_path')

        genome_data = genome_obj.get('data')[0].get('data')
        genome_info = genome_obj.get('data')[0].get('info')
        source = genome_info[10].get('Source')
        num_contigs = genome_info[10].get('Number contigs')
        size = genome_info[10].get('Size')
        gc_content = genome_info[10].get('GC content')
        warnings = genome_data.get('warnings', [])
        feature_counts = sorted(list(genome_data.get('feature_counts', {})
                                     .items()))

        genome_overview_data = collections.OrderedDict()

        genome_overview_data['Name'] = '{} ({})'.format(genome_name, genome_ref)
        #genome_overview_data['Uploaded File'] = genome_file
        genome_overview_data['Date Uploaded'] = time.strftime("%c")
        genome_overview_data['Source'] = source
        genome_overview_data['Number of Contigs'] = num_contigs
        genome_overview_data['Size'] = size
        genome_overview_data['GC Content'] = gc_content
        genome_overview_data['Warnings'] = "\n".join(warnings)
        genome_overview_data.update(feature_counts)

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in genome_overview_data.iteritems():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        feature_content = str([[str(k), v] for k, v in
                               genome_data.get('feature_counts', {}).items()
                               if k != 'gene'])
        contig_content = str([[str(c), l] for c, l in
                              zip(genome_data.get('contig_ids', []),
                                  genome_data.get('contig_lengths', []))])
        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template_genome.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          overview_content)
                report_template = report_template.replace('*FEATURE_DATA*',
                                                          feature_content)
                report_template = report_template.replace('*CONTIG_DATA*',
                                                          contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': self.scratch,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for imported Genome'})
        return html_report

    def generate_report(self, genome_ref, params):
        """
        :param genome_ref: return value from GenomeFileUtil for the uploaded
                           genome; report warnings and message come from it.
        :return: report_output dict with 'report_name' and 'report_ref'
        """
        uuid_string = str(uuid.uuid4())

        objects_created = [{'ref': genome_ref,
                            'description': 'Imported Genome'}]

        output_html_files = self.generate_html_report(genome_ref, params)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 300,
            'report_object_name': 'kb_genome_upload_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        file = {'path': scratch_file_path}

        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)

        returnVal = {'obj_ref': ref}

        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method

        """

        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the object will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)
        base_count = object_data.get('data')[0].get('data').get('base_counts')
        dna_size = object_data.get('data')[0].get('data').get('dna_size')

        upload_message += "Assembly Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported Fasta File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        if isinstance(dna_size, (int, long)):
            upload_message += 'DNA Size: {:,}\n'.format(dna_size)

        if isinstance(base_count, dict):
            upload_message += 'Base Count:\n{}\n'.format(
                json.dumps(base_count, indent=1)[2:-2])

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
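
# The json.dumps(...)[2:-2] slice used in generate_report above trims the
# opening '{\n' and closing '\n}' from the pretty-printed dict, leaving only
# the indented key/value lines; a standalone illustration with hypothetical
# base counts:
import json

base_count = {'A': 10, 'C': 12, 'G': 12, 'T': 11}
body = json.dumps(base_count, indent=1, sort_keys=True)[2:-2]
print('Base Count:\n{}\n'.format(body))
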
Example #10
class CompoundSetUtils:
    '''
    Module Name:
    CompoundSetUtils

    Module Description:
    A KBase module: CompoundSetUtils
Contains tools for import & export of compound sets
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/CompoundSetUtils.git"
    GIT_COMMIT_HASH = "53bac077a8efaaea9ead90d5557b1af1c0b23394"

    #BEGIN_CLASS_HEADER
    @staticmethod
    def _check_required_param(in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if param not in in_params or not in_params[param]:
                raise ValueError('{} parameter is required'.format(param))

    def _save_to_ws_and_report(self, ctx, method, workspace, source, compoundset):
        """Save compound set to the workspace and make report"""
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        if 'model' in method:
            # input_ws_objects expects a list of workspace object references
            provenance[0]['input_ws_objects'] = [workspace + '/' + source]
        provenance[0]['service'] = 'CompoundSetUtils'
        provenance[0]['method'] = method
        info = self.ws_client.save_objects(
            {'workspace': workspace,
             "objects": [{
                 "type": "KBaseBiochem.CompoundSet",
                 "data": compoundset,
                 "name": compoundset['name']
             }]})[0]
        compoundset_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        report_params = {
            'objects_created': [{'ref': compoundset_ref,
                                 'description': 'Compound Set'}],
            'message': 'Imported %s as %s' % (source, compoundset_ref),
            'workspace_name': workspace,
            'report_object_name': 'compound_set_creation_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  'compoundset_ref': compoundset_ref}
        return output
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.ws_client = Workspace(self.ws_url)
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass


    def compound_set_from_file(self, ctx, params):
        """
        CompoundSetFromFile
        string staging_file_path
        :param params: instance of type "compoundset_upload_params" ->
           structure: parameter "workspace_name" of String, parameter
           "staging_file_path" of String, parameter "compound_set_name" of
           String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_file
        self._check_required_param(params, ['workspace_name',
                                            'staging_file_path',
                                            'compound_set_name'])
        scratch_file_path = self.dfu.download_staging_file(
            {'staging_file_subdir_path': params['staging_file_path']}
        ).get('copy_file_path')
        # I probably should be uploading the raw files to shock

        ext = os.path.splitext(scratch_file_path)[1]
        file_name = os.path.basename(scratch_file_path)
        if ext == '.sdf':
            compounds = parse.read_sdf(scratch_file_path)
        elif ext == '.tsv':
            compounds = parse.read_tsv(scratch_file_path)
        else:
            raise ValueError('Invalid input file type. Expects .tsv or .sdf')

        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s' % file_name,
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(ctx, 'compound_set_from_file',
                                             params['workspace_name'],
                                             params['staging_file_path'],
                                             compoundset)
        #END compound_set_from_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
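
    # Hypothetical invocation through the SDK server (all values invented):
    #   compound_set_from_file(ctx, {
    #       'workspace_name': 'my_workspace',
    #       'staging_file_path': 'compounds.tsv',
    #       'compound_set_name': 'my_compounds'})
    # The .tsv extension routes to parse.read_tsv above; .sdf routes to
    # parse.read_sdf.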

    def compound_set_to_file(self, ctx, params):
        """
        CompoundSetToFile
        string compound_set_name
        string output_format
        :param params: instance of type "compoundset_download_params" ->
           structure: parameter "workspace_name" of String, parameter
           "compound_set_name" of String, parameter "output_format" of String
        :returns: instance of type "compoundset_download_results" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_to_file
        self._check_required_param(params, ['workspace_name', 'compound_set_name',
                                            'output_format'])
        compoundset = self.ws_client.get_objects2({'objects': [
            {'workspace': params['workspace_name'],
             'name': params['compound_set_name']}]})['data'][0]['data']
        ext = params['output_format']
        out = "%s/%s.%s" % (self.scratch, compoundset['name'], ext)
        if ext == 'sdf':
            outfile_path = parse.write_sdf(compoundset, out)
        elif ext == 'tsv':
            outfile_path = parse.write_tsv(compoundset, out)
        else:
            raise ValueError('Invalid output file type. Expects tsv or sdf')

        report_files = [{'path': outfile_path,
                         'name': os.path.basename(outfile_path),
                         'label': os.path.basename(outfile_path),
                         'description': 'A compound set in %s format' % ext}]

        report_params = {
            'objects_created': [],
            'message': 'Converted %s compound set to %s format.' % (
                params['compound_set_name'], params['output_format']),
            'file_links': report_files,
            'workspace_name': params['workspace_name'],
            'report_object_name': 'compound_set_download_report'
        }

        # Construct the output to send back
        report_client = KBaseReport(self.callback_url)
        report_info = report_client.create_extended_report(report_params)
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  }
        #END compound_set_to_file

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_to_file return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
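
    # The download direction, sketched with the same hypothetical `impl`/`ctx`;
    # 'output_format' selects between parse.write_sdf and parse.write_tsv:
    #
    #   output, = impl.compound_set_to_file(ctx, {
    #       'workspace_name': 'my_workspace',
    #       'compound_set_name': 'my_compound_set',
    #       'output_format': 'sdf',  # or 'tsv'
    #   })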

    def compound_set_from_model(self, ctx, params):
        """
        CompoundSetFromModel
        required:
        string workspace_name
        string model_name
        string compound_set_name
        :param params: instance of type "compoundset_from_model_params" ->
           structure: parameter "workspace_name" of String, parameter
           "model_name" of String, parameter "compound_set_name" of String
        :returns: instance of type "compoundset_upload_results" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "compoundset_ref" of type "obj_ref"
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN compound_set_from_model
        self._check_required_param(params, ['workspace_name', 'model_name',
                                            'compound_set_name'])
        model = self.ws_client.get_objects2({'objects': [
            {'workspace': params['workspace_name'],
             'name': params['model_name']}]})['data'][0]['data']
        compounds, undef = parse.parse_model(model)
        compoundset = {
            'id': params['compound_set_name'],
            'name': params['compound_set_name'],
            'description': 'Compound Set produced from %s, a metabolic model'
                           % model['id'],
            'compounds': compounds,
        }

        output = self._save_to_ws_and_report(ctx, 'compound_set_from_model',
                                             params['workspace_name'],
                                             params['model_name'], compoundset)
        #END compound_set_from_model

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method compound_set_from_model return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]


class ImportGFFFastaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_gff_fasta_from_staging(self, params):
        """
        import_gff_fasta_from_staging: wrapper method for GenomeFileUtil.fasta_gff_to_genome

        required params:
        fasta_file: fasta file from user's staging area
        gff_file: gff file from user's staging area
        genome_name: output genome object name
        workspace_name: workspace name that genome will be stored to

        file paths for both the fasta and gff files must be subdirectory file paths in the staging area
        e.g.
        for file: /data/bulk/user_name/file_name
        staging_file_subdir_path is file_name
        for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
        staging_file_subdir_path is subdir_1/subdir_2/file_name

        optional params:
        scientific_name: proper name for the species; key for taxonomy lookup. Defaults to 'unknown_taxon'
        source: source of the GenBank file. Defaults to 'User'
        taxon_wsname: workspace where the reference taxons are stored. Defaults to 'ReferenceTaxons'
        taxon_reference: if defined, will try to link the Genome to the specified taxonomy object
        release: release or version of the source data
        genetic_code: genetic code for the organism
        type: 'Reference', 'User upload' or 'Representative'

        return:
        genome_ref: return object reference
        report_name: name of generated report (if any)
        report_ref: report reference (if any)
        """

        log('--->\nrunning ImportGFFFastaUtil.import_gff_fasta_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_gff_fasta_from_staging_params(params)

        for key in ('fasta_file', 'gff_file'):
            file_path = params[key]
            download_staging_file_params = {
                'staging_file_subdir_path': file_path
            }
            dfu_returnVal = self.dfu.download_staging_file(
                download_staging_file_params)
            params[key] = {'path': dfu_returnVal['copy_file_path']}

        returnVal = self.gfu.fasta_gff_to_genome(params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            download_staging_file_params.get('staging_file_subdir_path'),
            returnVal['genome_ref'])
        return returnVal

    def validate_import_gff_fasta_from_staging_params(self, params):
        """
        validate_import_gff_fasta_from_staging_params:
                    validates params passed to import_gff_fasta_from_staging method
        """
        # check for required parameters
        for p in ['genome_name', 'workspace_name', 'fasta_file', 'gff_file']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        # for now a workspace name must be used; no ws_id_to_name() function is available
        if str(params["workspace_name"]).isdigit():
            error_msg = ('"{}" parameter is a workspace id; a workspace '
                         'name is required').format(params["workspace_name"])
            raise ValueError(error_msg)
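
    # A minimal usage sketch (illustrative config and paths, not from the
    # source); both file params follow the staging-area subdirectory
    # convention described in the docstring above:
    #
    #   util = ImportGFFFastaUtil(config)  # config with 'SDK_CALLBACK_URL' etc.
    #   result = util.import_gff_fasta_from_staging({
    #       'fasta_file': 'subdir_1/my_assembly.fasta',
    #       'gff_file': 'subdir_1/my_features.gff',
    #       'genome_name': 'my_genome',
    #       'workspace_name': 'my_workspace',
    #   })
    #   print(result['genome_ref'])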
Example #12
0
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            self._validate_paired_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            self._validate_single_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file, 'rev_file': rev_file}
        return fastq_file_path
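
    # For reference, the on-disk layout that fastq-dump --split-3 -T is
    # assumed to produce here (this is what _check_fastq_dump_result and the
    # renames above rely on):
    #
    #   <tmp_dir>/<sra_name>/1/fastq   forward reads (paired end)
    #   <tmp_dir>/<sra_name>/2/fastq   reverse reads (paired end)
    #   <tmp_dir>/<sra_name>/fastq     reads (single end)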

    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for single end reads
        """
        if (params.get('insert_size_mean') or params.get('insert_size_std_dev')
                or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" and '
            error_msg += '"Reads Orientation Outward" are specific to Paired End Reads'
            raise ValueError(error_msg)

        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for paired end reads

        """
        sequencing_tech = params.get('sequencing_tech')

        if sequencing_tech in ['PacBio CCS', 'PacBio CLR']:
            error_msg = 'Sequencing Technology "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is specific to Single End Reads'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self,
                                                   staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validates file availability in user's staging area

        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #                                         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'],
                                    'import_SRA_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_sra_from_staging(self, params):
        '''
          import_sra_from_staging: converts an SRA file from the user's staging area to FASTQ and uploads the reads via ReadsUtils.upload_reads

          required params:
          staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          sequencing_tech: sequencing technology
          name: output reads file name
          workspace_name: workspace name/ID of the object

          Optional Params:
          single_genome: whether the reads are from a single genome or a metagenome.
          insert_size_mean: mean (average) insert length
          insert_size_std_dev: standard deviation of insert lengths
          read_orientation_outward: whether reads in a pair point outward

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), returnVal['obj_ref'])
        return returnVal
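
    # A sketch of the params dict import_sra_from_staging expects (values are
    # illustrative, not from the source):
    #
    #   params = {
    #       'staging_file_subdir_path': 'subdir_1/reads.sra',
    #       'sequencing_tech': 'Illumina',
    #       'name': 'my_reads',
    #       'workspace_name': 'my_workspace',  # a numeric id is routed to 'wsid'
    #       'insert_size_mean': 450,           # paired end reads only
    #   }
    #   result = ImportSRAUtil(config).import_sra_from_staging(params)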

    def import_sra_from_web(self, params):
        '''
        import_sra_from_web: downloads SRA files from a web source, converts them to FASTQ and uploads the reads via ReadsUtils.upload_reads

        required params:
        download_type: download type for web source fastq file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object

        sra_urls_to_add: dict of SRA file URLs
            required params:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

            Optional Params:
            single_genome: whether the reads are from a single genome or a metagenome.
            insert_size_mean: mean (average) insert length
            insert_size_std_dev: standard deviation of insert lengths
            read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')

        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            download_web_file_params = {
                'download_type': download_type,
                'file_url': sra_url_to_add.get('file_url')
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path,
                                                 sra_url_to_add)

            import_sra_reads_params = sra_url_to_add
            import_sra_reads_params.update(fastq_file_path)

            workspace_name_or_id = workspace_name
            if str(workspace_name_or_id).isdigit():
                import_sra_reads_params['wsid'] = int(workspace_name_or_id)
            else:
                import_sra_reads_params['wsname'] = str(workspace_name_or_id)

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                json.dumps(import_sra_reads_params, indent=1)))

            obj_ref = self.ru.upload_reads(import_sra_reads_params).get(
                'obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(sra_url_to_add.get('file_url'))

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}
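
    # Shape of the params expected by import_sra_from_web, sketched with
    # illustrative values:
    #
    #   params = {
    #       'download_type': 'Direct Download',
    #       'workspace_name': 'my_workspace',
    #       'sra_urls_to_add': [{
    #           'file_url': 'https://example.org/reads.sra',
    #           'sequencing_tech': 'Illumina',
    #           'name': 'my_reads',
    #       }],
    #   }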

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params:
                    validates params passed to import_sra_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'sequencing_tech', 'name',
                'workspace_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(
            params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_sra_from_web_params:
                    validates params passed to import_sra_from_web method
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add is not type list as required')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError(
                        '"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate summary report

        obj_refs_list: generated workspace object references (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }
            objects_data.append(self.dfu.get_objects(get_objects_params))

            objects_created.append({
                'ref': obj_ref,
                'description': 'Imported Reads'
            })

        output_html_files = self.generate_html_report(objects_data, params,
                                                      uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate html summary report
        """
        log('Start generating html report')
        pprint(params)

        result_file_path = os.path.join(self.scratch, 'report.html')
        html_report = list()
        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):

            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(
                reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template_sra/table_panel.html'),
                    'r') as object_content_file:
                report_template = object_content_file.read()
                report_template = report_template.replace('_NUM', str(idx))
                report_template = report_template.replace(
                    'OBJECT_NAME', reads_obj_name)
                if index == 0:
                    report_template = report_template.replace(
                        'panel-collapse collapse',
                        'panel-collapse collapse in')

            objects_content += report_template
            base_percentages = ''
            for key, val in reads_data.get('base_percentages').iteritems():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()

            reads_overview_data['Name'] = '{} ({})'.format(
                reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get(
                'uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(
                reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'

            reads_overview_data['Platform'] = reads_data.get(
                'sequencing_tech', 'Unknown')

            reads_single_genome = str(
                reads_data.get('single_genome', 'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev',
                                             'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(
                    insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(
                reads_data.get('read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()

            reads_stats_data['Number of Reads'] = '{:,}'.format(
                reads_data.get('read_count'))
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(
                reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(
                reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(
                reads_data.get('read_length_stdev'))
            dup_reads_percent = '{:.2f}'.format(
                float(reads_data.get('number_of_duplicates') * 100) /
                reads_data.get('read_count'))
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)'.format(
                str(reads_data.get('number_of_duplicates')), dup_reads_percent)
            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(
                reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(
                str(reads_data.get('qual_min')),
                str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(
                round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.iteritems():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.iteritems():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            objects_content = objects_content.replace('###OVERVIEW_CONTENT###',
                                                      overview_content)
            objects_content = objects_content.replace('###STATS_CONTENT###',
                                                      stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template_sra/report_head.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '###TABLE_PANELS_CONTENT###', objects_content)
                result_file.write(report_template)

        shutil.copytree(
            os.path.join(os.path.dirname(__file__),
                         'report_template_sra/bootstrap-3.3.7'),
            os.path.join(self.scratch, 'bootstrap-3.3.7'))
        shutil.copy(
            os.path.join(os.path.dirname(__file__),
                         'report_template_sra/jquery-3.2.1.min.js'),
            os.path.join(self.scratch, 'jquery-3.2.1.min.js'))

        matched_files = []
        for root, dirnames, filenames in os.walk(self.scratch):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))

        for gz_file in matched_files:
            print('Removing ' + gz_file)
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']
        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Imported Reads'
        })
        return html_report


class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file, 'rev_file': rev_file}
        return fastq_file_path

    def _validate_upload_staging_file_availability(self,
                                                   staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validates file availability in user's staging area

        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #                                         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)

    def import_sra_from_staging(self, params):
        '''
          import_sra_from_staging: converts an SRA file from the user's staging area to FASTQ and uploads the reads via ReadsUtils.upload_reads

          required params:
          staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          sequencing_tech: sequencing technology
          name: output reads file name
          workspace_name: workspace name/ID of the object

          Optional Params:
          single_genome: whether the reads are from a single genome or a metagenome.
          insert_size_mean: mean (average) insert length
          insert_size_std_dev: standard deviation of insert lengths
          read_orientation_outward: whether reads in a pair point outward

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        return returnVal

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params:
                    validates params passed to import_sra_from_staging method

        """

        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'sequencing_tech', 'name',
                'workspace_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(
            params.get('staging_file_subdir_path'))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report


        obj_ref: generated workspace object reference (return of import_sra_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)
        number_of_reads = object_data.get('data')[0].get('data').get(
            'read_count')

        upload_message += "Reads Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported Reads File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        if isinstance(number_of_reads, (int, long)):
            upload_message += 'Number of Reads: {:,}\n'.format(number_of_reads)

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
Example #14
0
class variation_importer_utils:
    def __init__(self, utility_params):
        self.params = utility_params
        # self.scratch = utility_params['scratch']
        self.scratch = os.path.join(utility_params['scratch'],
                                    'variation_importer_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.service_wiz_url = utility_params['srv-wiz-url']
        self.callback_url = utility_params['callback_url']

        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url,
                               token=utility_params['token'])

    def _create_fake_location_data(self):
        location = {
            'lat':
            random.uniform(-90, 90),
            'lon':
            random.uniform(-180, 180),
            'elevation':
            random.uniform(0, 100),
            'description':
            "".join([random.choice(string.ascii_letters) for n in xrange(20)])
        }
        return location

    def _create_fake_straininfo(self, genotype_id):
        straininfo = {
            'source_id': genotype_id,
            'location_info': self._create_fake_location_data()
        }
        return straininfo

    def _create_fake_population(self, genotypes):
        population = {'description': 'Fake population data.', 'strains': []}
        for genome in genotypes:
            population['strains'].append(self._create_fake_straininfo(genome))
        return population

    def _create_fake_kinship_matrix(self):
        kinship = {
            'row_ids': ['one', 'two'],
            'col_ids': ['one', 'two'],
            'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]]
        }
        return kinship

    def _compare(self, s, t):
        return Counter(s) == Counter(t)
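
    # NOTE: _compare treats its arguments as multisets, so ordering is
    # ignored but duplicates matter (illustrative values):
    #
    #   _compare(['a', 'b', 'b'], ['b', 'a', 'b'])  # True
    #   _compare(['a', 'b'], ['a', 'b', 'b'])       # False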

    def pretend_download_staging_file(self, vcf_filename, scratch):
        vcf_filepath = os.path.join(scratch, vcf_filename)
        shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath)
        return {'copy_file_path': vcf_filepath}

    def _generate_population(self,
                             location_filepath,
                             genotypes,
                             population_description="None Provided"):
        locations = pd.read_csv(location_filepath, delimiter='\t')

        # Drop any missing data from id, latitude, or longitude.
        locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True)

        # Compare the location IDs with the genotype IDs
        if not self._compare(locations.iloc[:, 0].astype(str).tolist(),
                             genotypes):
            error_msg = "Location IDs do not match Sample IDs in Variation file!"
            log(error_msg)
            raise ValueError(error_msg)

        col_names = [x.lower() for x in locations.columns.values]
        expected_columns = ['id', 'latitude', 'longitude']
        optional_columns = ['elevation', 'description']

        # Check that the first three columns match the expected columns.
        if not (self._compare(col_names[0:3], expected_columns)):
            raise ValueError("Missing or unexpected column names in {}".format(
                location_filepath))

        # If optional columns are not present, give default value for each.
        for col in optional_columns:
            if col not in col_names:
                if col == 'elevation':
                    locations[col] = 0.0
                else:
                    locations[col] = "None provided."

        population = {'description': population_description, 'strains': []}
        for idx, row in locations.iterrows():
            population['strains'].append({
                'source_id': str(row['id']),
                'location_info': {
                    'lat': row['latitude'],
                    'lon': row['longitude'],
                    'elevation': row['elevation'],
                    'description': row['description']
                }
            })

        return population
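
    # A sketch of the tab-delimited locations file _generate_population
    # expects (header names per expected_columns/optional_columns above;
    # rows are made up):
    #
    #   id      latitude    longitude    elevation    description
    #   G001    38.54       -121.74      16.0         Davis, CA
    #   G002    42.36       -71.09       5.0          Cambridge, MA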

    def _validate_vcf(self, vcf_filepath, vcf_version):
        validation_output_dir = os.path.join(self.scratch,
                                             'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)
        ## TODO: Make this choice more robust.
        ## Attempt conversion to 4.1?
        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-o")
            validator_cmd.append(validation_output_dir)
        else:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version below 4.1.  No validation logging.")

        print("Validator command: {}".format(validator_cmd))
        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            validator_output.append(line)

        p.wait()

        validation_output_filenames = [
            f for f in os.listdir(validation_output_dir) if f.endswith('.txt')
        ]
        if not validation_output_filenames:
            raise Exception("Validator did not generate a log file.")
        validation_output_filepath = os.path.join(
            validation_output_dir, validation_output_filenames[0])

        log("Validator output filepath: {}".format(validation_output_filepath))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filepath, p.returncode

    # Retrieve contigs from assembly file.
    def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'):
        try:
            assembly_data = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]['data']
        except Exception as e:
            print("Unable to retrieve Assembly reference: {}".format(
                assembly_ref))
            raise ValueError(e)
        raw_contigs = assembly_data['contigs']
        contigs = {}

        # NOTE: this builds a key -> contig_id mapping, but the method
        # currently returns the raw contigs dict, so the mapping is unused.
        for key, value in raw_contigs.iteritems():
            contigs[str(key)] = value['contig_id']
        return raw_contigs

    def _get_version_contigs_genotypes(self, vcf_filepath):
        contigs = []
        genotypes = []
        version = ''
        with (gzip.open if vcf_filepath.endswith('.gz') else open)(
                vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')

            if not tokens[0].startswith('##fileformat'):
                error_msg = ("Invalid VCF. ##fileformat line in meta is "
                             "improperly formatted.")
                log(error_msg)
                raise ValueError(error_msg)
            version = float(tokens[1][-4:].rstrip())
            log("VCF version: {}".format(version))
            for line in vcf:
                if line.startswith("#CHROM"):
                    log("#CHROM encountered, exiting loop.")
                    genotypes = line.split()[9:]
                    log("Number Genotypes in vcf: {}".format(len(genotypes)))
                    break
                tokens = line.split("=")

                if tokens[0].startswith('##contig'):
                    contigs.append(tokens[2][:-2])
        return version, contigs, genotypes
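
    # For reference, the VCF header lines this method parses look like the
    # following (illustrative excerpt; contig lines are assumed to carry a
    # bare ID, matching the tokens[2][:-2] slice above):
    #
    #   ##fileformat=VCFv4.2
    #   ##contig=<ID=Chr1>
    #   #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  G001  G002 ...
    #
    # Genotype IDs are the sample column names from position 10 onward.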

    # Arabidopsis ref: 18590/2/8
    def _get_assembly_ref_from_genome(self, genome_ref):
        ga = GenomeAnnotationAPI(self.service_wiz_url)
        inputs_get_assembly = {'ref': genome_ref}
        try:
            assembly_object_ref = ga.get_assembly(inputs_get_assembly)
        except Exception as e:
            print(
                "Unable to retrieve Assembly reference ID from Genome ref_id: {}"
                .format(genome_ref))
            raise Exception(e)

        return assembly_object_ref

    def _generate_output_file_list(self):
        log('Start packing result files')
        output_files = list()

        result_file = os.path.join(self.scratch,
                                   'variation_importer_results.zip')
        excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html', '.DS_Store']
        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(self.scratch):
                for file in files:
                    if not file.endswith(tuple(excluded_extensions)):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({
            'path':
            result_file,
            'name':
            os.path.basename(result_file),
            'label':
            os.path.basename(result_file),
            'description':
            'File(s) generated by Variation Importer'
        })
        log("Importer output generated: {}".format(output_files))

        return output_files

    def _generate_report(self, params, variation_results, variation_file_path):

        stats_results = self._generate_variation_stats(None,
                                                       variation_file_path)

        html_report = self._generate_html_report(variation_results,
                                                 stats_results)

        file_links = self._generate_output_file_list()
        objects = []
        if (variation_results['valid_variation_file']):
            objects = [{
                'ref':
                variation_results['variation_obj_ref'],
                'description':
                'Variation Object created by VCF Importer'
            }]

        report_params = {
            'objects_created': objects,
            'message': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_report,
            'html_window_height': 330,
            'workspace_name': params['workspace_name'],
            'report_object_name':
            'variation_importer_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
            'variation_ref': variation_results['variation_obj_ref']
        }
        log("Returning from _generate_report!")
        return report_output

    def _generate_html_report(self, variation_results, stats_output=None):
        """
            _generate_html_report: generate html report from output files
        """
        html_report = list()
        print("Validation output filepath passed to html report: {}".format(
            variation_results['validation_output_filepath']))
        try:
            report_dir = os.path.join(self.scratch, 'html')
            os.mkdir(report_dir)

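            # NOTE: template_dir is assumed to be defined at module scope
            # (path to the HTML report template); it is not set in this class.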
            with open(template_dir, 'r') as html, open(
                    variation_results['validation_output_filepath'],
                    'r') as validation:

                validation_content = '<p><h4>{} '.format(
                    variation_results['variation_filename'])
                if variation_results.get('valid_variation_file'):
                    validation_content += '<em><i>is</i> a valid </em> variation file.'
                else:
                    validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.'
                validation_content += '</h4></p>'

                report = html.read()

                # Discard the first line of the validation file.  It is irrelevant.
                validation.readline()

                validation_content += '<p><h4>Errors and warnings generated by VCF validator:</h4></p>'
                validation_content += '<ul>'
                for line in validation.readlines():
                    validation_content += '<li>{}</li>'.format(line)
                validation_content += '</ul>'

                if variation_results.get('invalid_contigs'):
                    validation_content += (
                        '<h4>The following contigs were not found in the '
                        'reference genome {}. The possible contigs have been '
                        'written to the file {}. Please see the associated '
                        'links to download.</h4>'.format(
                            variation_results.get('genome_ref'),
                            'valid_contigs.txt'))
                    validation_content += '<ul>'
                    for contig in variation_results.get('invalid_contigs'):
                        validation_content += '<li>{}</li>'.format(contig)
                    validation_content += '</ul>'

                # if not variation_results.get('contigs'):
                #     validation_content += '<h4>No contig information was included in the VCF file header!  Please recreate the VCF file with each contig described in the meta description </h4>'
                report = report.replace('Validation_Results',
                                        validation_content)

                if stats_output and stats_output.get('stats_file_dir'):
                    summary_results = '<p><h4>Summary Statistics</h4></p>'
                    summary_results += '''
                                        <table>
                                            <tr>
                                                <th>Number of SNPs</th>
                                                <th>Number of Genotypes </th>
                                            </tr>
                                        '''
                    summary_results += '<tr>'
                    summary_results += '<td>{}</td><td>{}</td>'.format(
                        'To be added later',
                        variation_results['num_genotypes'])
                    summary_results += '</tr></table>'
                    report = report.replace('Variation_Statistics',
                                            summary_results)

                # visualization
                image_content = ''
                if stats_output and stats_output.get('stats_img_dir'):
                    image_dir = stats_output.get('stats_img_dir')

                    for file in glob.glob(os.path.join(image_dir, '*.png')):
                        shutil.move(file, report_dir)

                    for image in glob.glob(report_dir + "/*.png"):
                        image = os.path.basename(image)
                        caption = image.replace('.png', '')
                        image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                            '></a><a target="_blank"><br>' \
                            '<p align="center">{}</p></p>'.format(image, caption)

                else:
                    image_content += 'No visualizations generated.'

                report = report.replace("Visualization_Results", image_content)
        except Exception:
            print("Error generating HTML report.")
            raise

        report_file_path = os.path.join(report_dir, 'index.html')
        with open(report_file_path, 'w') as output:
            output.write(report)
        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': report_file_path,
                'make_handle': 0,
                'pack': 'zip'
            })
            log("Variation HTML report to shock ref: {}".format(
                html_upload_ret))
        except Exception:
            raise ValueError('Error uploading HTML to shock')

        html_report.append({
            'shock_id': html_upload_ret['shock_id'],
            'name': os.path.basename(report_file_path),
            'label': os.path.basename(report_file_path),
            'description': 'HTML report for Variation Importer'
        })

        return html_report

    def _generate_variation_stats(self, cmd_line_args, variation_filepath):
        """
            :param commments go here
        """
        file_output_directory = os.path.join(self.scratch,
                                             'stats_' + str(uuid.uuid4()))
        os.mkdir(file_output_directory)

        image_output_directory = os.path.join(
            self.scratch, 'stats_images_' + str(uuid.uuid4()))
        os.mkdir(image_output_directory)

        # TODO: Validate user supplied params and build PLINK command
        plink_cmd = ["plink"]
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)
        if (cmd_line_args is not None):
            cmds = cmd_line_args.split(';')
            for cmd in cmds:
                plink_cmd.append(cmd)
        # plink_cmd.append('--recode12')
        # plink_cmd.append('transpose')
        # plink_cmd.append('--output-missing-genotype')
        # plink_cmd.append("0")
        plink_cmd.append('--freq')
        plink_cmd.append('--hardy')
        # plink_cmd.append('gz')
        plink_cmd.append('--out')
        plink_cmd.append(variation_filepath)

        print("PLINK arguments: {}".format(plink_cmd))

        plink_output = {
            "errors": [],
            "warnings": []
            # "notes" : []
        }
        p = subprocess.Popen(plink_cmd,
                             cwd=file_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        while True:
            line = p.stdout.readline()
            if not line:
                break
            # log(line)
            tokens = line.split(':')
            if (tokens[0] == 'Error'):
                plink_output['errors'].append(line)
                raise ValueError('PLINK 1.9 error: ' + line)
            elif (tokens[0] == 'Warning'):
                plink_output['warnings'].append(line)
                print(line)
            # elif(tokens[0] == 'Note'):
            #     plink_output['notes'].append(line)
            #     print(line)

        p.stdout.close()
        p.wait()
        plink_output_filepath = os.path.join(file_output_directory,
                                             'plink_cli_output.txt')
        with open(plink_output_filepath, 'w') as plink:
            for data in plink_output:
                plink.write("{}: {}\n".format(data, plink_output[data]))

        plink_output_files = [
            f for f in os.listdir(self.scratch)
            if f.startswith(os.path.basename(variation_filepath) + '.')
        ]

        for file in plink_output_files:
            shutil.move(os.path.join(self.scratch, file),
                        file_output_directory)

        if p.returncode != 0:
            log("PLINK encountered an error during runtime. "
                "Please see the log file.")

        variation_filename = os.path.basename(variation_filepath)
        base_filepath = os.path.join(file_output_directory, variation_filename)
        freq_filepath = base_filepath + '.frq'

        maf_script_filepath = '/kb/module/lib/kb_variation_importer/Utils/MAF_check.R'
        hwe_script_filepath = '/kb/module/lib/kb_variation_importer/Utils/HWE.R'
        log("Frequency filepath: {}".format(freq_filepath))
        # TODO: make a function to do Rscript calls; a hedged _run_rscript
        # sketch appears after this method.
        # generate visualizations and store in directory
        maf_command = ['Rscript']
        maf_command.append('--no-save')
        maf_command.append('--vanilla')
        maf_command.append(maf_script_filepath)
        maf_command.append(freq_filepath)
        maf_command.append("Minor Allele Frequencies.png")
        print("MAF command: {}".format(maf_command))
        r = subprocess.Popen(maf_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        r.wait()

        if r.returncode != 0:
            log("Error creating MAF histogram in R")

        hwe_filepath = base_filepath + '.hwe'
        zoom_filepath = hwe_filepath + '.zoom'
        log("HWE filepath: {}".format(hwe_filepath))
        zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format(
            hwe_filepath, zoom_filepath)
        log("Zoom cmd: {}".format(zoom_command))

        try:
            z = subprocess.Popen(zoom_command,
                                 cwd=file_output_directory,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
            z.wait()

            if z.returncode != 0:
                log("Error creating HWE zoom file.")

        except Exception as e:
            log("Error creating zoom HWE file: {}".format(e))

        hwe_command = ['Rscript']
        hwe_command.append('--no-save')
        hwe_command.append('--vanilla')
        hwe_command.append(hwe_script_filepath)
        hwe_command.append(hwe_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium.png")
        hwe_command.append(zoom_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png")
        print("MAF command: {}".format(hwe_command))
        h = subprocess.Popen(hwe_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        h.wait()

        if h.returncode != 0:
            log("Error generating HWE plots in R")

        return {
            'stats_file_dir': file_output_directory,
            'stats_img_dir': image_output_directory
        }
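
    def _run_rscript(self, script_filepath, args, workdir):
        """
            Hedged sketch for the Rscript TODO in _generate_variation_stats:
            a generic wrapper for the repeated Rscript invocations above.
            Not yet wired into the callers; the signature is an assumption,
            not an established module API.
        """
        rscript_cmd = ['Rscript', '--no-save', '--vanilla',
                       script_filepath] + list(args)
        proc = subprocess.Popen(rscript_cmd,
                                cwd=workdir,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                shell=False)
        proc.wait()
        if proc.returncode != 0:
            log("Rscript call failed for {}".format(script_filepath))
        return proc.returncode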

    def _save_variation_to_ws(self, workspace_name, variation_object_name,
                              variation_obj, variation_filepath,
                              kinship_matrix):
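        """
            Upload the VCF file to shock and save a KBaseGwasData.Variations
            object to the workspace, returning its object reference. Note
            that kinship_matrix is unused here; the caller already embeds
            the kinship data in variation_obj.
        """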
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        try:
            vcf_shock_return = self.dfu.file_to_shock({
                'file_path': variation_filepath,
                'make_handle': 1,
                'pack': 'gzip'
            })
        except Exception as e:
            print("Error uploading file to shock!")
            raise ValueError(e)

        variation_obj['variation_file_reference'] = vcf_shock_return.get(
            'shock_id')

        info = self.dfu.save_objects({
            'id': ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_obj,
                'name': variation_object_name
            }]
        })[0]

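        # object_info indices: 6 = workspace id, 0 = object id, 4 = version.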
        variation_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        log("Variation reference created: {}".format(variation_ref))
        return variation_ref

    def validate_vcf(self, params):
        """
            :param params: dict containing all input parameters.
        """

        returnVal = {}
        valid_vcf_file = True

        try:
            # vcf_filepath = self.pretend_download_staging_file(
            #     params['variation_file_subdir_path'], self.scratch).get('copy_file_path')

            vcf_filepath = self.dfu.download_staging_file({
                'staging_file_subdir_path': params['variation_file_subdir_path']
            }).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area: {}".format(
                params['variation_file_subdir_path'], e))

        try:
            location_filepath = self.pretend_download_staging_file(
                params['variation_attributes_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area: {}".format(
                params['variation_attributes_subdir_path'], e))

        # Check file size
        log("{} file size: {}".format(vcf_filepath,
                                      os.path.getsize(vcf_filepath)))
        log('\nValidating {}...'.format(vcf_filepath))

        vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes(
            vcf_filepath)

        if not vcf_contigs:
            log("No contig data in {} header.".format(vcf_filepath))
            raise ValueError(
                "No contig data in {} header.".format(vcf_filepath))

        if vcf_version < 4.1:
            log("VCF file is version {}. Must be at least version 4.1".format(
                vcf_version))
            raise ValueError(
                "VCF file is version {}. Must be at least version 4.1".format(
                    vcf_version))

        # Generate population object
        population = self._generate_population(location_filepath,
                                               vcf_genotypes)

        # Retrieve Assembly object reference associated with genome.
        try:
            assembly_ref = self._get_assembly_ref_from_genome(
                params['genome_ref'])
        except Exception as e:
            print("Unable to retrieve {}".format(params['genome_ref']))
            raise ValueError(e)

        # Retrieve contig list from Assembly object.
        try:
            assembly_contigs = self._get_contigs_from_assembly(assembly_ref)
        except Exception as e:
            print("Unable to retrieve contigs from Assembly ref: {}".format(
                assembly_ref))
            raise ValueError(e)

        log("Length of assembly contigs: {}".format(len(assembly_contigs)))
        # Compare contig IDs from VCF to those in the Assembly object
        invalid_contigs = []
        for contig in vcf_contigs:
            if contig not in assembly_contigs:
                invalid_contigs.append(contig)

        if invalid_contigs:
            log("Invalid contig IDs found in {}".format(vcf_filepath))
            valid_contig_filepath = os.path.join(self.scratch,
                                                 'valid_contigs.txt')
            log("Writing valid contigs to file: {}".format(
                valid_contig_filepath))
            with open(valid_contig_filepath, 'w') as contig_file:
                for contig in assembly_contigs:
                    contig_file.write(contig + '\n')
            valid_vcf_file = False

        validation_output_filepath, returncode = self._validate_vcf(
            vcf_filepath, vcf_version)

        if returncode != 0:
            valid_vcf_file = False

        kinship_matrix = self._create_fake_kinship_matrix()
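        # The kinship matrix is currently a stand-in, as the method name
        # indicates; presumably a real computation will replace it.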

        variation_obj_ref = ''
        if valid_vcf_file:
            variation_object = {
                "genome": params['genome_ref'],
                "population": population,
                "contigs": vcf_contigs,
                "comment": "Comments go here",
                "assay": "Assay data goes gere.",
                "originator": "PI/Lab info goes here",
                "pubmed_id": "PubMed ID goes here",
                "kinship_info": kinship_matrix
            }

            variation_obj_ref = self._save_variation_to_ws(
                params['workspace_name'], params['variation_object_name'],
                variation_object, vcf_filepath, kinship_matrix)

        log("Variation object reference: {}".format(variation_obj_ref))
        variation_report_metadata = {
            'valid_variation_file': valid_vcf_file,
            'variation_obj_ref': variation_obj_ref,
            'variation_filename': os.path.basename(vcf_filepath),
            'validation_output_filepath': validation_output_filepath,
            'vcf_version': vcf_version,
            'num_genotypes': len(vcf_genotypes),
            'num_contigs': len(vcf_contigs),
            'invalid_contigs': invalid_contigs
        }

        returnVal = self._generate_report(params, variation_report_metadata,
                                          vcf_filepath)

        return returnVal