Ejemplo n.º 1
0
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run a shell command and log its outcome

        command: complete command line; it is executed through the shell

        The child's stderr is merged into the captured stdout so that the
        diagnostics printed by a failing tool show up in the logged output
        and in the raised error message.

        raises ValueError (carrying the command, the exit code and the
        captured output) when the command exits with a non-zero status
        """

        log('Start executing command:\n{}'.format(command))
        # NOTE(review): shell=True means callers must quote any untrusted
        # value they interpolate into `command`.
        pipe = subprocess.Popen(command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if exitCode != 0:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

        log('Executed command:\n{}\n'.format(command) +
            'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)

        scratch_sra_file_path: path of the SRA file to convert
        params: advanced params; they are validated against the layout
                (paired end / single end) detected after the conversion

        return: dict with keys 'fwd_file' and 'rev_file' holding the paths of
                the generated FASTQ files ('rev_file' is None for single end)

        raises ValueError when the conversion command fails or when the
        advanced params do not fit the detected layout
        """
        import shlex  # local import: only needed to quote the shell arguments

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        # The command is run through a shell and the SRA file name is
        # presumably derived from a user supplied name/URL, so both paths are
        # quoted. Paths made of safe characters only are left unchanged.
        command = ' '.join([self.SRA_TOOLKIT_PATH, '--split-3', '-T', '-O',
                            shlex.quote(tmp_dir),
                            shlex.quote(scratch_sra_file_path)])

        self._run_command(command)

        def add_fastq_suffix(dumped_file):
            # the dumped files are named 'fastq'; rename to '<name>.fastq'
            renamed_file = dumped_file + '.fastq'
            os.rename(dumped_file, renamed_file)
            return renamed_file

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            self._validate_paired_end_advanced_params(params)
            fwd_file = add_fastq_suffix(os.path.join(tmp_dir, sra_name, '1', 'fastq'))
            rev_file = add_fastq_suffix(os.path.join(tmp_dir, sra_name, '2', 'fastq'))
        else:
            self._validate_single_end_advanced_params(params)
            fwd_file = add_fastq_suffix(os.path.join(tmp_dir, sra_name, 'fastq'))
            rev_file = None

        return {
            'fwd_file': fwd_file,
            'rev_file': rev_file
        }


    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for single end reads
        """
        if (params.get('insert_size_mean')
           or params.get('insert_size_std_dev')
           or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" or '
            error_msg += '"Reads Orientation Outward" is Paried End Reads specific'
            raise ValueError(error_msg)

        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for paired end reads

        """
        sequencing_tech = params.get('sequencing_tech')

        if sequencing_tech in ['PacBio CCS', 'PacBio CLR']:
            error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is Single End Reads specific'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self, staging_file_subdir_path):
        """
        _validate_upload_file_path_availability: validates file availability in user's staging area

        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #                                         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        """
        Set up service clients and a private scratch directory.

        config: mapping providing 'SDK_CALLBACK_URL', 'KB_AUTH_TOKEN' and
                'scratch'; it is also handed to UploaderUtil unchanged
        """
        callback_url = config['SDK_CALLBACK_URL']
        self.callback_url = callback_url
        self.token = config['KB_AUTH_TOKEN']

        # every instance works in its own 'import_SRA_<uuid>' sub directory
        scratch_root = config['scratch']
        scratch_subdir = 'import_SRA_' + str(uuid.uuid4())
        self.scratch = os.path.join(scratch_root, scratch_subdir)
        handler_utils._mkdir_p(self.scratch)

        self.dfu = DataFileUtil(callback_url)
        self.ru = ReadsUtils(callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_sra_from_staging(self, params):
        """
        import_sra_from_staging: download an SRA file from the staging area,
        convert it to FASTQ and upload the reads via ReadsUtils.upload_reads

        required params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        sequencing_tech: sequencing technology
        name: output reads file name
        workspace_name: workspace name/ID of the object

        Optional Params:
        single_genome: whether the reads are from a single genome or a metagenome.
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward

        return:
        the result of ReadsUtils.upload_reads; its 'obj_ref' entry is the
        object reference of the uploaded reads

        note:
        the upload specific keys (fwd_file, rev_file, wsid/wsname) are added to
        a copy, not to the caller's params. The single end validation run
        during the conversion may still remove 'interleaved' from params.
        """

        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        staging_file_subdir_path = params.get('staging_file_subdir_path')
        download_staging_file_params = {
            'staging_file_subdir_path': staging_file_subdir_path
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        # shallow copy: keep the caller's dict free of upload only keys
        import_sra_reads_params = dict(params)
        import_sra_reads_params.update(fastq_file_path)

        # an all-digit value is treated as a workspace ID, anything else as a name
        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                                            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        # Update the workspace object related meta-data for staged file
        self.uploader_utils.update_staging_service(staging_file_subdir_path,
                                                   returnVal['obj_ref'])
        return returnVal

    def import_sra_from_web(self, params):
        """
        import_sra_from_web: download SRA file(s) from the web, convert each
        to FASTQ and upload the reads via ReadsUtils.upload_reads

        required params:
        download_type: download type for web source fastq file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object

        sra_urls_to_add: list of dicts, one per SRA file
            required params:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

            Optional Params:
            single_genome: whether the reads are from a single genome or a metagenome.
            insert_size_mean: mean (average) insert length
            insert_size_std_dev: standard deviation of insert lengths
            read_orientation_outward: whether reads in a pair point outward

        return:
        dict with
        obj_refs: object references of the uploaded reads, in input order
        uploaded_files: the matching file URLs, in input order

        note:
        the upload specific keys (fwd_file, rev_file, wsid/wsname) are added to
        a copy of each sra_urls_to_add entry, not to the caller's dicts. The
        single end validation run during the conversion may still remove
        'interleaved' from an entry.
        """

        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')

        # an all-digit value is treated as a workspace ID, anything else as a
        # name; the decision is the same for every file, so take it once
        if str(workspace_name).isdigit():
            workspace_key, workspace_value = 'wsid', int(workspace_name)
        else:
            workspace_key, workspace_value = 'wsname', str(workspace_name)

        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            file_url = sra_url_to_add.get('file_url')
            download_web_file_params = {
                'download_type': download_type,
                'file_url': file_url
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                        download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, sra_url_to_add)

            # shallow copy: keep the caller's entry free of upload only keys
            import_sra_reads_params = dict(sra_url_to_add)
            import_sra_reads_params.update(fastq_file_path)
            import_sra_reads_params[workspace_key] = workspace_value

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                                            json.dumps(import_sra_reads_params, indent=1)))

            obj_ref = self.ru.upload_reads(import_sra_reads_params).get('obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(file_url)

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'sequencing_tech', 'name', 'workspace_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add is not type list as required')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate summary report

        obj_refs_list: generated workspace object references
                       (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        return: dict with 'report_name' and 'report_ref' of the created report
        """
        uuid_string = str(uuid.uuid4())

        objects_data = []
        objects_created = []
        for obj_ref in obj_refs_list:
            # fetch every object on its own; errors are not ignored
            fetched = self.dfu.get_objects({'object_refs': [obj_ref],
                                            'ignore_errors': False})
            objects_data.append(fetched)
            objects_created.append({'ref': obj_ref,
                                    'description': 'Imported Reads'})

        output_html_files = self.generate_html_report(objects_data, params, uuid_string)

        report_params = dict()
        report_params['message'] = ''
        report_params['workspace_name'] = params.get('workspace_name')
        report_params['objects_created'] = objects_created
        report_params['html_links'] = output_html_files
        report_params['direct_html_link_index'] = 0
        report_params['html_window_height'] = 460
        report_params['report_object_name'] = 'kb_sra_upload_report_' + uuid_string

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {'report_name': output['name'], 'report_ref': output['ref']}

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate html summary report

        reads_objs: list of DataFileUtil.get_objects results, one per reads object
        params: read for 'uploaded_files' (indexed in step with reads_objs),
                'insert_size_mean' and 'insert_size_std_dev'
        uuid_string: name of the report directory created under self.scratch

        return: one element list describing the zipped report directory that
                was uploaded to shock (keys: shock_id, name, label, description)
        """
        log('Start generating html report')
        pprint(params)

        template_dir = os.path.join(os.path.dirname(__file__), 'report_template_sra')

        tmp_dir = os.path.join(self.scratch, uuid_string)
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')

        # the panel template is the same for every object: read it once
        with open(os.path.join(template_dir, 'table_panel.html'), 'r') as panel_template_file:
            panel_template = panel_template_file.read()

        def flag_label(value):
            # 0/1 style flags rendered as No/Yes, everything else as Unknown
            text = str(value)
            if '0' in text:
                return 'No'
            if '1' in text:
                return 'Yes'
            return 'Unknown'

        def param_or_not_specified(key):
            value = params.get(key)
            return 'Not Specified' if value is None else str(value)

        def table_rows(table):
            return ''.join('<tr><td><b>{}</b></td><td>{}</td></tr>'.format(key, val)
                           for key, val in table.items())

        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):

            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            panel = panel_template.replace('_NUM', str(index))
            panel = panel.replace('OBJECT_NAME', reads_obj_name)
            if index == 0:
                # only the first panel starts expanded
                panel = panel.replace('panel-collapse collapse', 'panel-collapse collapse in')
            objects_content += panel

            base_percentages = ''.join(
                '{}({}%) '.format(key, val)
                for key, val in reads_data.get('base_percentages').items())

            read_count = reads_data.get('read_count')
            number_of_duplicates = reads_data.get('number_of_duplicates')

            reads_overview_data = collections.OrderedDict()

            reads_overview_data['Name'] = '{} ({})'.format(reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get('uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(read_count)

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'

            reads_overview_data['Platform'] = reads_data.get('sequencing_tech', 'Unknown')
            reads_overview_data['Single Genome'] = flag_label(
                reads_data.get('single_genome', 'Unknown'))
            reads_overview_data['Insert Size Mean'] = param_or_not_specified('insert_size_mean')
            reads_overview_data['Insert Size Std Dev'] = param_or_not_specified(
                'insert_size_std_dev')
            reads_overview_data['Outward Read Orientation'] = flag_label(
                reads_data.get('read_orientation_outward', 'Unknown'))

            reads_stats_data = collections.OrderedDict()

            reads_stats_data['Number of Reads'] = '{:,}'.format(read_count)
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(reads_data.get('read_length_stdev'))
            if read_count:
                dup_reads_percent = '{:.2f}'.format(
                    float(number_of_duplicates * 100) / read_count)
            else:
                # empty reads object: avoid dividing by zero
                dup_reads_percent = '0.00'
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)'.format(
                str(number_of_duplicates), dup_reads_percent)
            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(
                str(reads_data.get('qual_min')), str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            # only the panel appended in this iteration still has placeholders
            objects_content = objects_content.replace('###OVERVIEW_CONTENT###',
                                                      table_rows(reads_overview_data))
            objects_content = objects_content.replace('###STATS_CONTENT###',
                                                      table_rows(reads_stats_data))

        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(template_dir, 'report_head.html'), 'r') as report_template_file:
                report_template = report_template_file.read()
            result_file.write(report_template.replace('###TABLE_PANELS_CONTENT###',
                                                      objects_content))

        shutil.copytree(os.path.join(template_dir, 'bootstrap-3.3.7'),
                        os.path.join(tmp_dir, 'bootstrap-3.3.7'))
        shutil.copy(os.path.join(template_dir, 'jquery-3.2.1.min.js'),
                    os.path.join(tmp_dir, 'jquery-3.2.1.min.js'))

        # drop any '*.gz' file from the report directory before zipping it
        matched_files = [os.path.join(root, filename)
                         for root, _dirnames, filenames in os.walk(tmp_dir)
                         for filename in fnmatch.filter(filenames, '*.gz')]

        for gz_file in matched_files:
            print(('Removing ' + gz_file))
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({'file_path': tmp_dir,
                                                  'pack': 'zip'})['shock_id']
        report_file_name = os.path.basename(result_file_path)
        return [{'shock_id': report_shock_id,
                 'name': report_file_name,
                 'label': report_file_name,
                 'description': 'HTML summary report for Imported Assembly'}]
Ejemplo n.º 2
0
class UnpackFileUtil:
    def _staging_service_host(self):
        """
        _staging_service_host: build the staging service URL

        Reads 'kbase-endpoint' from the 'kb_uploadmethods' section of the
        config file named by the KB_DEPLOYMENT_CONFIG environment variable and
        appends '/staging_service'.
        """
        deployment_config_path = os.environ["KB_DEPLOYMENT_CONFIG"]

        config_parser = SafeConfigParser()
        config_parser.read(deployment_config_path)

        return config_parser.get('kb_uploadmethods', 'kbase-endpoint') + '/staging_service'

    def _file_to_staging(self, file_path_list, subdir_folder=None):
        """
        _file_to_staging: upload file(s) to staging area

        file_path_list: local paths of the files to upload, one request each
        subdir_folder: destination folder in the staging area; the staging
                       root ('/') is used when it is empty or None

        raises ValueError when the staging service answers a request with a
        status code other than 200
        """
        subdir_folder_str = '/' if not subdir_folder else '/{}'.format(
            subdir_folder)
        staging_service_host = self._staging_service_host()
        end_point = staging_service_host + '/upload'
        headers = {'Authorization': self.token}

        for file_path in file_path_list:
            # close each handle as soon as its upload finished instead of
            # leaking one open file per uploaded file
            with open(file_path, 'rb') as upload_handle:
                files = {
                    'destPath': subdir_folder_str,
                    'uploads': (os.path.basename(file_path), upload_handle)
                }
                resp = _requests.post(end_point, headers=headers, files=files)

            if resp.status_code != 200:
                raise ValueError(
                    'Upload file {} failed.\nError Code: {}\n{}\n'.format(
                        file_path, resp.status_code, resp.text))

            log("return message from server:\n{}\n".format(resp.text))

    def _remove_irrelevant_files(self, file_path):
        """
        _remove_irrelevant_files: remove irrelevant files
        """
        target_name = os.path.basename(file_path)
        file_dir = os.path.dirname(file_path)
        for dirpath, dirnames, filenames in os.walk(file_dir):
            for filename in filenames:
                if filename != target_name:
                    irrelevant_file_path = os.sep.join([dirpath, filename])
                    os.remove(irrelevant_file_path)
                    log('removing irrelevant file: {}'.format(
                        irrelevant_file_path))

    def _r_unpack(self, file_path, count):
        """
        _r_unpack: recursively unpack file_path

        file_path: file or directory to process
        count: recursion depth; at depth 0 every other file next to file_path
               is removed first

        Files named '*.DS_Store' and folders whose name starts with '_' are
        deleted. Archives are unpacked in place, the newly appeared entries
        are processed recursively and the archive itself is removed.

        return: file_path for a regular, non archive file; None otherwise
        """
        if count == 0:
            self._remove_irrelevant_files(file_path)

        depth = count + 1
        marker = '-' * depth  # visualises the nesting level in the log
        base_name = os.path.basename(file_path)

        if not os.path.isfile(file_path):
            if base_name.startswith('_'):
                shutil.rmtree(file_path, ignore_errors=True)
                log('removing folder: {}{}'.format(marker, file_path))
                return None

            for dirpath, _dirnames, filenames in os.walk(file_path):
                for filename in filenames:
                    self._r_unpack(os.sep.join([dirpath, filename]), depth)
            return None

        log('processing:      {}{}'.format(marker, file_path))
        mime_type = magic.from_file(file_path, mime=True)

        if base_name.endswith('.DS_Store'):
            os.remove(file_path)
            log('removing file:   {}{}'.format(marker, file_path))
            return None

        archive_mime_types = tuple(
            'application/' + subtype
            for subtype in ('x-gzip', 'gzip', 'x-bzip', 'x-bzip2', 'bzip',
                            'bzip2', 'x-tar', 'tar', 'x-gtar', 'zip',
                            'x-zip-compressed'))
        if mime_type not in archive_mime_types:
            return file_path

        # unpack next to the archive and recurse into whatever showed up
        file_dir = os.path.dirname(file_path)
        entries_before = os.listdir(file_dir)
        self.dfu.unpack_file({'file_path': file_path}).get('file_path')
        entries_after = os.listdir(file_dir)

        for entry in entries_after:
            if entry in entries_before:
                continue
            self._r_unpack(os.sep.join([file_dir, entry]), depth)

        os.remove(file_path)
        log('removing file:   {}{}'.format(marker, file_path))
        return None

    def __init__(self, config):
        """
        Store the settings taken from config and create the DataFileUtil client.

        config: mapping providing 'SDK_CALLBACK_URL', 'KB_AUTH_TOKEN',
                'USER_ID' and 'scratch'
        """
        callback_url = config['SDK_CALLBACK_URL']
        self.callback_url = callback_url
        self.token = config['KB_AUTH_TOKEN']
        self.user_id = config['USER_ID']
        self.scratch = config['scratch']
        self.dfu = DataFileUtil(callback_url)

    def unpack_staging_file(self, params):
        """
        Unpack a staging area file

        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
                for file: /data/bulk/user_name/file_name
                staging_file_subdir_path is file_name
                for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
                staging_file_subdir_path is subdir_1/subdir_2/file_name

        result:
        unpacked_file_path: comma separated scratch path(s) of the unpacked
                            file(s) that were uploaded to the staging area

        """

        log('--->\nrunning UnpackFileUtil.unpack_staging_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        download_result = self.dfu.download_staging_file(params)
        scratch_file_path = download_result.get('copy_file_path')

        self._r_unpack(scratch_file_path, 0)

        # everything left next to the downloaded file is a result
        scratch_dir = os.path.dirname(scratch_file_path)
        unpacked_file_path_list = [
            os.sep.join([dirpath, filename])
            for dirpath, _dirnames, filenames in os.walk(scratch_dir)
            for filename in filenames
        ]

        log("Unpacked files:\n  {}".format(
            '\n  '.join(unpacked_file_path_list)))

        # upload next to the original file in the staging area
        staging_subdir = os.path.dirname(params.get('staging_file_subdir_path'))
        self._file_to_staging(unpacked_file_path_list, staging_subdir)

        return {'unpacked_file_path': ','.join(unpacked_file_path_list)}

    def unpack_web_file(self, params):
        """
        Download and unpack a web file to staging area

        params:
        file_url: file URL
        download_type: one of ['Direct Download', 'FTP',
                    'DropBox', 'Google Drive']

        result:
        unpacked_file_path: comma separated scratch path(s) of the unpacked
                            file(s) that were uploaded to the staging area

        """
        log('--->\nrunning UnpackFileUtil.unpack_web_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        download_result = self.dfu.download_web_file(params)
        scratch_file_path = download_result.get('copy_file_path')

        self._r_unpack(scratch_file_path, 0)

        # everything left next to the downloaded file is a result
        scratch_dir = os.path.dirname(scratch_file_path)
        unpacked_file_path_list = [
            os.sep.join([dirpath, filename])
            for dirpath, _dirnames, filenames in os.walk(scratch_dir)
            for filename in filenames
        ]

        log("Unpacked files:\n  {}".format(
            '\n  '.join(unpacked_file_path_list)))

        # no sub folder given: files go to the staging area root
        self._file_to_staging(unpacked_file_path_list)

        return {'unpacked_file_path': ','.join(unpacked_file_path_list)}

    def generate_report(self, unpacked_file_path, params):
        """
        generate_report: generate summary report

        unpacked_file_path: comma separated unpacked file path(s).
                  (return of unpack_staging_file or unpack_web_file)
        params: read for 'staging_file_subdir_path' (optional) and
                'workspace_name'

        return: dict with 'report_name' and 'report_ref' of the created report
        """

        log("generating report")
        uuid_string = str(uuid.uuid4())
        unpacked_file_path_list = unpacked_file_path.split(',')

        # folder shown in front of every file name in the message
        staging_file_subdir_path = params.get('staging_file_subdir_path')
        if staging_file_subdir_path:
            subdir = os.path.dirname(staging_file_subdir_path) + '/'
        else:
            subdir = '/'

        message_lines = ['Uploaded Files: {}\n'.format(
            len(unpacked_file_path_list))]
        message_lines.extend(subdir + os.path.basename(file_path) + '\n'
                             for file_path in unpacked_file_path_list)
        upload_message = ''.join(message_lines)

        report_params = dict()
        report_params['message'] = upload_message
        report_params['workspace_name'] = params.get('workspace_name')
        report_params['report_object_name'] = 'kb_upload_mothods_report_' + uuid_string

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        return {
            'report_name': output['name'],
            'report_ref': output['ref']
        }
Ejemplo n.º 3
0
class GenbankToGenome:
    def __init__(self, config):
        """Create the service clients, parsing state and default parameters.

        Args:
            config: configuration object. This constructor reads its
                callbackURL, workspaceURL, re_api_url and raw attributes and
                also passes the object itself to GenomeInterface.

        Raises:
            OSError: if /kb/module/kbase.yml cannot be opened.
            AttributeError: if no module version line is found in that file
                (the regex search returns None).
        """
        self.cfg = config
        self.gi = GenomeInterface(config)
        self.dfu = DataFileUtil(config.callbackURL)
        self.aUtil = AssemblyUtil(config.callbackURL)
        self.ws = Workspace(config.workspaceURL)
        self._messages = []
        self.time_string = datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y_%m_%d_%H_%M_%S')

        # Read the module version; the context manager closes the file
        # instead of leaving the handle to the garbage collector.
        with open('/kb/module/kbase.yml') as yml_file:
            yml_text = yml_file.read()
        # Raw string: "\W" inside a normal literal is an invalid escape.
        self.version = re.search(r"module-version:\n\W+(.+)\n",
                                 yml_text).group(1)

        # Parsing options, overwritten per import in refactored_import.
        self.generate_parents = False
        self.generate_ids = False

        # Features collected while parsing, keyed by feature id.
        self.genes = OrderedDict()
        self.mrnas = OrderedDict()
        self.cdss = OrderedDict()
        self.noncoding = []

        # Ontology bookkeeping.
        self.ontologies_present = defaultdict(dict)
        self.ontology_events = list()

        # Counters and per-genome diagnostics.
        self.skiped_features = Counter()
        self.feature_counts = Counter()
        self.orphan_types = Counter()
        self.contig_seq = {}
        self.circ_contigs = set()
        self.features_spaning_zero = set()
        self.genome_warnings = []
        self.genome_suspect = False
        self.defects = Counter()
        self.spoofed_genes = 0

        self.excluded_features = ('source', 'exon', 'fasta_record')
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.re_api_url = config.re_api_url
        # dict with feature 'id's that have been used more than once.
        self.used_twice_identifiers = {}
        self.default_params = {
            'source': 'Genbank',
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'],
            'ontology_wsname': self.cfg.raw['ontology-workspace-name'],
            'ontology_GO_obj_name':
            self.cfg.raw['ontology-gene-ontology-obj-name'],
            'ontology_PO_obj_name':
            self.cfg.raw['ontology-plant-ontology-obj-name'],
            'release': None,
            'genetic_code': 11,
            'generate_ids_if_needed': 0,
            'metadata': {}
        }

    @property
    def messages(self):
        return "\n".join(self._messages)

    def refactored_import(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) construct the input directory staging area
        input_directory = self.stage_input(params)

        # 3) update default params
        self.default_params.update(params)
        params = self.default_params
        self.generate_parents = params.get('generate_missing_genes')
        self.generate_ids = params.get('generate_ids_if_needed')
        if params.get('genetic_code'):
            self.code_table = params['genetic_code']

        # 4) Do the upload
        files = self._find_input_files(input_directory)
        consolidated_file = self._join_files_skip_empty_lines(files)
        genome = self.parse_genbank(consolidated_file, params)
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']

        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params['metadata'],
        })
        ref = f"{result['info'][6]}/{result['info'][0]}/{result['info'][4]}"
        logging.info(f"Genome saved to {ref}")

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {'genome_ref': ref, 'genome_info': info}

        return details

    @staticmethod
    def validate_params(params):
        if 'workspace_name' not in params:
            raise ValueError('required "workspace_name" field was not defined')
        if 'genome_name' not in params:
            raise ValueError('required "genome_name" field was not defined')
        if 'file' not in params:
            raise ValueError('required "file" field was not defined')

        # one and only one of 'path', 'shock_id', or 'ftp_url' is required
        file = params['file']
        if not isinstance(file, dict):
            raise ValueError('required "file" field must be a map/dict')
        sources = ('path', 'shock_id', 'ftp_url')
        n_valid_fields = sum(1 for f in sources if file.get(f))
        if n_valid_fields < 1:
            raise ValueError(f'required "file" field must include one source: '
                             f'{", ".join(sources)}')
        if n_valid_fields > 1:
            raise ValueError(
                f'required "file" field has too many sources specified: '
                f'{", ".join(file.keys())}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(f"Invalid genetic code specified: {params}")

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and uncompressing if needed. """

        # construct the input directory where we stage files
        input_directory = os.path.join(
            self.cfg.sharedFolder, f'genome-upload-staging-{uuid.uuid4()}')
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if file.get('path') is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            logging.info(
                f'Downloading file from SHOCK node: {self.cfg.shockURL} - {file["shock_id"]}'
            )
            sys.stdout.flush()
            file_name = self.dfu.shock_to_file({
                'file_path': input_directory,
                'shock_id': file['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            logging.info('Downloading file from: ' + str(file['ftp_url']))
            local_file_path = self.dfu.download_web_file({
                'file_url':
                file['ftp_url'],
                'download_type':
                'FTP'
            })['copy_file_path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            logging.info("staged input file =" + genbank_file_path)
            self.dfu.unpack_file({'file_path': genbank_file_path})

        else:
            raise ValueError(
                'No valid files could be extracted based on the input')

        return input_directory

    def parse_genbank(self, file_path, params):
        """Build the genome dict for a consolidated Genbank file.

        The original file is uploaded to shock, the assembly is saved or
        validated through _save_assembly, and every record of the file is
        parsed for contig ids/lengths, publications, dates and features.

        Args:
            file_path: path of the Genbank file to parse.
            params: import parameters; this method reads 'genome_name',
                'source' and the optional 'genome_type', 'taxon_id' and
                'scientific_name' (more are read by _save_assembly).

        Returns:
            The genome dict, including the feature lists from
            get_feature_lists, feature counts, and any warnings and
            'suspect' flag accumulated while parsing.
        """
        logging.info("Saving original file to shock")
        shock_res = self.dfu.file_to_shock({
            'file_path': file_path,
            'make_handle': 1,
            'pack': 'gzip',
        })
        # Write and save assembly file
        assembly_ref = self._save_assembly(file_path, params)
        # Fetch the assembly object to copy its gc_content, dna_size and md5
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']
        genome = {
            "id": params['genome_name'],
            "original_source_file_name": os.path.basename(file_path),
            "assembly_ref": assembly_ref,
            "gc_content": assembly_data['gc_content'],
            "dna_size": assembly_data['dna_size'],
            "md5": assembly_data['md5'],
            "genbank_handle_ref": shock_res['handle']['hid'],
            # kept as a set while parsing; converted to a list further down
            "publications": set(),
            "contig_ids": [],
            "contig_lengths": [],
        }
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            params['source'])

        if params.get('genome_type'):
            genome['genome_type'] = params['genome_type']

        # Set taxonomy-related fields in the genome
        # Also validates the given taxon ID
        if params.get('taxon_id'):
            set_taxon_data(int(params['taxon_id']), self.re_api_url, genome)
        else:
            set_default_taxon_data(genome)

        dates = []
        # Parse data from genbank file
        contigs = Bio.SeqIO.parse(file_path, "genbank")
        for record in contigs:
            r_annot = record.annotations
            logging.info("parsing contig: " + record.id)
            # Records with a missing or unparseable date are simply not
            # considered for the origination date.
            try:
                dates.append(time.strptime(r_annot.get('date'), "%d-%b-%Y"))
            except (TypeError, ValueError):
                pass
            genome['contig_ids'].append(record.id)
            genome['contig_lengths'].append(len(record))
            genome["publications"] |= self._get_pubs(r_annot)

            # only do the following once(on the first contig)
            if "source_id" not in genome:
                genome["source_id"] = record.id.split('.')[0]
                organism = r_annot.get('organism', 'Unknown Organism')
                if params.get('scientific_name'):
                    genome['scientific_name'] = params['scientific_name']
                else:
                    genome['scientific_name'] = organism
                # NOTE(review): 'genetic_code' is not assigned to genome in
                # this method; presumably set_taxon_data /
                # set_default_taxon_data populate it - confirm.
                self.code_table = genome['genetic_code']
                genome["molecule_type"] = r_annot.get('molecule_type', 'DNA')
                # turn literal backslash-n sequences into real newlines
                genome['notes'] = r_annot.get('comment',
                                              "").replace('\\n', '\n')

            self._parse_features(record, genome['source'])

        genome.update(self.get_feature_lists())

        genome['num_contigs'] = len(genome['contig_ids'])
        # add dates
        dates.sort()
        if dates:
            # earliest date, plus the latest one when they differ
            genome['external_source_origination_date'] = time.strftime(
                "%d-%b-%Y", dates[0])
            if dates[0] != dates[-1]:
                genome['external_source_origination_date'] += " _ " + \
                    time.strftime("%d-%b-%Y", dates[-1])

        if self.ontologies_present:
            genome['ontologies_present'] = dict(self.ontologies_present)
            genome["ontology_events"] = self.ontology_events
        genome['feature_counts'] = dict(self.feature_counts)
        # can't serialize a set
        genome['publications'] = list(genome['publications'])

        # Flag the genome when more than 2% of the CDSs were counted under
        # the 'cds_seq_not_matching' defect.
        if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] /
                                    float(len(genome['cdss'])) > 0.02):
            self.genome_warnings.append(
                warnings["genome_inc_translation"].format(
                    self.defects['cds_seq_not_matching'], len(genome['cdss'])))
            self.genome_suspect = 1

        if self.defects['bad_parent_loc']:
            self.genome_warnings.append(
                f"There were {self.defects['bad_parent_loc']} parent/child "
                "relationships that were not able to be determined. Some of "
                "these may have splice variants that may be valid relationships."
            )

        if self.defects['spoofed_genes']:
            self.genome_warnings.append(warnings['spoofed_genome'].format(
                self.defects['spoofed_genes']))
            genome['suspect'] = 1

        if self.defects['not_trans_spliced']:
            self.genome_warnings.append(
                warnings['genome_not_trans_spliced'].format(
                    self.defects['not_trans_spliced']))
            genome['suspect'] = 1

        if self.genome_warnings:
            genome['warnings'] = self.genome_warnings
        if self.genome_suspect:
            genome['suspect'] = 1
        logging.info(f"Feature Counts: {genome['feature_counts']}")
        return genome

    def _save_assembly(self, genbank_file, params):
        """Convert the Genbank file to FASTA and save it as an assembly.

        While reading the contigs this also fills self.contig_seq with the
        upper-cased sequence of every contig and adds the ids of circular
        contigs to self.circ_contigs.

        When params['use_existing_assembly'] is set, nothing is saved: the
        referenced object is checked instead (object type, contig ids and
        per-contig md5) and the given reference is returned.

        Args:
            genbank_file: path of the Genbank file.
            params: import parameters; reads 'genome_name', 'workspace_name'
                and the optional 'use_existing_assembly' and 'genome_type'.

        Returns:
            The assembly reference (the supplied one, or the value returned
            by save_assembly_from_fasta).

        Raises:
            ValueError: if the supplied reference is malformed, is not an
                assembly, or does not match the contigs of the file.
        """
        contigs = Bio.SeqIO.parse(genbank_file, "genbank")
        assembly_id = f"{params['genome_name']}_assembly"
        fasta_file = f"{self.cfg.sharedFolder}/{params['genome_name']}_assembly.fasta"

        out_contigs = []
        extra_info = defaultdict(dict)
        for in_contig in contigs:
            topology = in_contig.annotations.get('topology', "")
            if topology == 'circular':
                extra_info[in_contig.id]['is_circ'] = 1
                self.circ_contigs.add(in_contig.id)
            elif topology == 'linear':
                extra_info[in_contig.id]['is_circ'] = 0
            out_contigs.append(in_contig)
            self.contig_seq[in_contig.id] = in_contig.seq.upper()

        assembly_ref = params.get("use_existing_assembly")
        if assembly_ref:
            # Raw string avoids the invalid "\d" / "\/" escapes of a normal
            # literal. re.match only anchors the start of the reference, so
            # text following the version number is still accepted.
            if not re.match(r"\d+/\d+/\d+", assembly_ref):
                raise ValueError(
                    f"Assembly ref: {assembly_ref} is not a valid format. Must"
                    f" be in numerical <ws>/<object>/<version> format.")
            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]
            if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")

            # Every parsed contig must exist in the assembly with the same
            # sequence md5.
            known_contigs = ret['data']['contigs']
            unmatched_ids = []
            unmatched_ids_md5s = []
            for contig_id, sequence in self.contig_seq.items():
                contig_md5 = hashlib.md5(
                    str(sequence).encode('utf8')).hexdigest()
                if contig_id not in known_contigs:
                    unmatched_ids.append(contig_id)
                elif contig_md5 != known_contigs[contig_id]['md5']:
                    unmatched_ids_md5s.append(contig_id)
            if unmatched_ids:
                raise ValueError(warnings['assembly_ref_extra_contigs'].format(
                    ", ".join(unmatched_ids)))
            if unmatched_ids_md5s:
                raise ValueError(warnings["assembly_ref_diff_seq"].format(
                    ", ".join(unmatched_ids_md5s)))
            logging.info(f"Using supplied assembly: {assembly_ref}")
            return assembly_ref

        logging.info("Saving sequence as Assembly object")
        Bio.SeqIO.write(out_contigs, fasta_file, "fasta")
        assembly_ref = self.aUtil.save_assembly_from_fasta({
            'file': {
                'path': fasta_file
            },
            'workspace_name': params['workspace_name'],
            'assembly_name': assembly_id,
            'type': params.get('genome_type', 'isolate'),
            'contig_info': extra_info
        })
        logging.info(f"Assembly saved to {assembly_ref}")
        return assembly_ref

    def _find_input_files(self, input_directory):
        logging.info("Scanning for Genbank Format files.")
        valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"]

        files = os.listdir(os.path.abspath(input_directory))
        logging.info("Genbank Files : " + ", ".join(files))
        genbank_files = [
            x for x in files
            if os.path.splitext(x)[-1].lower() in valid_extensions
        ]

        if len(genbank_files) == 0:
            raise Exception(
                f"The input directory does not have any files with one of the "
                f"following extensions {','.join(valid_extensions)}.")

        logging.info(f"Found {len(genbank_files)} genbank files")

        input_files = []
        for genbank_file in genbank_files:
            input_files.append(os.path.join(input_directory, genbank_file))

        return input_files

    def _join_files_skip_empty_lines(self, input_files):
        """ Applies strip to each line of each input file.
            Args:
                input_files: Paths to input files in Genbank format.
            Returns:
                Path to resulting file (currenly it's the same file as input).
            """
        if len(input_files) == 0:
            raise ValueError("NO GENBANK FILE")
        temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        ret_file = os.path.join(temp_dir, os.path.basename(input_files[0]))

        # take in Genbank file and remove all empty lines from it.
        with open(ret_file, 'w', buffering=2**20) as f_out:
            for input_file in input_files:
                with open(input_file, 'r') as f_in:
                    for line in f_in:
                        line = line.rstrip('\r\n')
                        if line.strip():
                            f_out.write(line + '\n')
        return ret_file

    def _get_pubs(self, r_annotations):
        """Get a contig's publications"""
        pub_list = []
        for in_pub in r_annotations.get('references', []):
            # don't add blank pubs
            if not in_pub.authors:
                continue
            out_pub = [
                0,  # pmid
                "",  # source
                in_pub.title,
                "",  # web address
                "",  # date
                in_pub.authors,
                in_pub.journal,
            ]
            date_match = re.match("\((\d{4})\)", in_pub.journal)
            if date_match:
                out_pub[4] = date_match.group(1)
            if in_pub.pubmed_id:
                out_pub[0:4] = [
                    int(in_pub.pubmed_id), "PubMed", in_pub.title,
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{in_pub.pubmed_id}"
                ]
            pub_list.append(tuple(out_pub))
        logging.info(f"Parsed {len(pub_list)} publication records")
        return set(pub_list)

    def _get_id(self, feat, tags=None):
        """Assign a id to a feature based on the first tag that exists"""
        _id = ""
        if not tags:
            tags = ['locus_tag', 'kbase_id']
        for t in tags:
            _id = feat.qualifiers.get(t, [""])[0]
            if _id:
                break

        if not _id:
            if feat.type == 'gene':
                if not self.generate_ids:
                    raise ValueError(
                        f"Unable to find a valid id for gene "
                        f"among these tags: {', '.join(tags)}. Correct the "
                        f"file or rerun with generate_ids\n {feat}")
                self.orphan_types['gene'] += 1
                _id = f"gene_{self.orphan_types['gene']}"
            if 'rna' in feat.type.lower() or feat.type in {
                    'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR'
            }:
                _id = f"gene_{self.orphan_types['gene']}"

        return _id

    def _parse_features(self, record, source):
        """Convert every feature of a record and file it by feature type.

        Features whose type is in self.excluded_features are only counted as
        skipped. Each remaining feature gets an id, location, DNA sequence
        and md5, plus optional note, aliases/flags/functions, ontology terms,
        db_xrefs and inference data, and is then handed to process_cds,
        process_gene, process_mrna or process_noncoding.

        Args:
            record: the parsed Genbank record; its id, features and length
                are used.
            source: genome source name; "Ensembl" changes which qualifier
                tags are tried for the feature id.
        """
        def _location(feat):
            """Convert to KBase style location objects"""
            # index 0 -> "", 1 -> "+", -1 -> "-" (negative index hits the end)
            strand_trans = ("", "+", "-")
            loc = []
            for part in feat.location.parts:
                contig_id = part.ref if part.ref else record.id
                # NOTE(review): `part.strand >= 0` raises TypeError if a part
                # can have strand None - confirm that cannot happen here.
                if part.strand >= 0:
                    # forward strand: 1-based start
                    begin = int(part.start) + 1
                else:
                    # reverse strand: the feature begins at its end position
                    begin = int(part.end)
                loc.append(
                    (contig_id, begin, strand_trans[part.strand], len(part)))
            return loc

        def _warn(message):
            """Add message to the current feature's warnings, once."""
            if message not in out_feat.get('warnings', []):
                out_feat['warnings'] = out_feat.get('warnings', []) + [message]

        def _check_suspect_location(parent=None):
            """Warn when the current feature's location parts are out of
            order and none of the accepted explanations applies."""
            if 'trans_splicing' in out_feat.get('flags', []):
                return

            # parts already in strand order (descending on the minus strand)
            if out_feat['location'] == sorted(
                    out_feat['location'],
                    reverse=(in_feature.location.strand == -1)):
                return

            # feature covering a whole circular contig: remember it
            if record.id in self.circ_contigs and \
                    in_feature.location.start == 0 \
                    and in_feature.location.end == len(record):
                self.features_spaning_zero.add(out_feat['id'])
                return

            if parent and parent['id'] in self.features_spaning_zero:
                return

            _warn(warnings['not_trans_spliced'])
            self.defects['not_trans_spliced'] += 1

        for in_feature in record.features:
            if in_feature.type in self.excluded_features:
                self.skiped_features[in_feature.type] += 1
                continue
            feat_seq = self._get_seq(in_feature, record.id)
            if source == "Ensembl":
                _id = self._get_id(in_feature, ['gene', 'locus_tag'])
            else:
                _id = self._get_id(in_feature)

            # The following is common to all the feature types
            out_feat = {
                "id": "_".join([_id, in_feature.type]),
                "location": _location(in_feature),
                "dna_sequence": str(feat_seq),
                "dna_sequence_length": len(feat_seq),
                "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            }
            # without an id the feature is named after its type only
            if not _id:
                out_feat['id'] = in_feature.type

            # validate input feature
            # note that end is the larger number regardless of strand
            if int(in_feature.location.end) > len(record):
                self.genome_warnings.append(
                    warnings["coordinates_off_end"].format(out_feat['id']))
                self.genome_suspect = 1
                continue

            for piece in in_feature.location.parts:
                if not isinstance(piece.start, ExactPosition) \
                        or not isinstance(piece.end, ExactPosition):
                    _warn(warnings["non_exact_coordinates"])

            self.feature_counts[in_feature.type] += 1

            # add optional fields
            if 'note' in in_feature.qualifiers:
                out_feat['note'] = in_feature.qualifiers["note"][0]

            out_feat.update(self._get_aliases_flags_functions(in_feature))

            ont, db_xrefs = self._get_ontology_db_xrefs(in_feature)
            if ont:
                out_feat['ontology_terms'] = ont
            if db_xrefs:
                out_feat['db_xrefs'] = db_xrefs

            if 'inference' in in_feature.qualifiers:
                out_feat['inference_data'] = parse_inferences(
                    in_feature.qualifiers['inference'])

            # the gene stored under the same id, if any, acts as the parent
            _check_suspect_location(self.genes.get(_id))

            # add type specific features
            if in_feature.type == 'CDS':
                self.process_cds(_id, feat_seq, in_feature, out_feat)

            elif in_feature.type == 'gene':
                self.process_gene(_id, out_feat)

            elif in_feature.type == 'mRNA':
                self.process_mrna(_id, out_feat)

            else:
                self.noncoding.append(
                    self.process_noncoding(_id, in_feature.type, out_feat))

    def get_feature_lists(self):
        """sort genes into their final arrays"""
        coding = []
        for g in self.genes.values():
            if len(g['cdss']):
                if g['mrnas'] and len(g['mrnas']) != len(g['cdss']):
                    msg = "The length of the mrna and cdss arrays are not equal"
                    g['warnings'] = g.get('warnings', []) + [msg]

                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in g:
                        g[key] = list(set(g[key]))
                if not g['mrnas']:
                    del g['mrnas']
                del g['type']
                coding.append(g)
                self.feature_counts["protein_encoding_gene"] += 1
            else:
                del g['mrnas'], g['cdss']
                self.noncoding.append(g)
                self.feature_counts["non_coding_genes"] += 1

        self.feature_counts["non_coding_features"] = len(self.noncoding)
        return {
            'features': coding,
            'non_coding_features': self.noncoding,
            'cdss': list(self.cdss.values()),
            'mrnas': list(self.mrnas.values())
        }

    def _get_seq(self, feat, contig):
        """Extract the DNA sequence for a feature"""
        seq = []
        for part in feat.location.parts:
            strand = part.strand
            # handle trans-splicing across contigs
            if part.ref:
                part_contig = part.ref
            else:
                part_contig = contig

            if strand >= 0:
                seq.append(
                    str(self.contig_seq[part_contig][part.start:part.end]))
            else:
                seq.append(
                    str(self.contig_seq[part_contig]
                        [part.start:part.end].reverse_complement()))
        return "".join(seq)

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event back."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(f"{ontology_type} is not a supported ontology")

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = defaultdict(dict)
        db_xrefs = []
        for key in ("GO_process", "GO_function", "GO_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.qualifiers.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        for ref in feature.qualifiers.get('db_xref', []):
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))

        return dict(ontology), sorted(db_xrefs)

    @staticmethod
    def _get_aliases_flags_functions(feat):
        """Get the values for aliases flags and features from qualifiers"""
        alias_keys = {
            'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id',
            'gene', 'EC_number', 'gene_synonym'
        }
        result = defaultdict(list)
        for key, val_list in feat.qualifiers.items():
            if key in alias_keys:
                result['aliases'].extend([(key, val) for val in val_list])
            # flags have no other information associated with them
            if val_list == ['']:
                result['flags'].append(key)
            if key == 'function':
                result['functional_descriptions'].extend(
                    val_list[0].split('; '))
            if key == 'product':
                result['functions'] = val_list

        return result

    def _find_parent_gene(self, potential_id, feature):
        """Find the id of the gene that is the parent of feature.

        Genbank files don't have a parent ID and the features can be out of
        order at times. To account for this, the gene stored under
        potential_id is tried first; if it is not accepted by is_parent, the
        search works backwards from the end of the list of gene ids and stops
        when it finds a parent or hits MAX_PARENT_LOOKUPS attempts.

        Args:
            potential_id: id expected to name the parent gene.
            feature: the feature passed to is_parent for each candidate.

        Returns:
            The id of the accepted gene, or None. When potential_id is a
            known gene but no parent is found, the 'bad_parent_loc' defect
            counter is incremented. None is returned without counting a
            defect when potential_id is not a known gene at all.
        """
        if potential_id not in self.genes:
            return None

        # Built lazily: the first candidate usually matches, and rebuilding
        # the key list on every retry is wasted work.
        gene_ids = None
        lookup_attempts = 0
        while lookup_attempts < MAX_PARENT_LOOKUPS:
            if is_parent(self.genes[potential_id], feature):
                return potential_id

            lookup_attempts += 1
            if gene_ids is None:
                gene_ids = list(self.genes)
            try:
                potential_id = gene_ids[-(lookup_attempts + 1)]
            except IndexError:
                break  # no more genes that could match exist

        self.defects['bad_parent_loc'] += 1
        return None

    def assign_new_id(self, _id):
        """given a feature id that has already been used, add a unique modifier to it"""
        _id_modifier = self.used_twice_identifiers.get(_id, 1)
        self.used_twice_identifiers[_id] = _id_modifier + 1
        return _id + "." + str(_id_modifier)

    def process_gene(self, _id, out_feat):
        out_feat.update({
            "id": _id,
            "type": 'gene',
            "mrnas": [],
            'cdss': [],
        })
        if _id in self.genes:
            _id = self.assign_new_id(_id)
            out_feat.update({"id": _id})
            # raise ValueError(f"Duplicate gene ID: {_id}")
        self.genes[_id] = out_feat

    def process_noncoding(self, gene_id, feat_type, out_feat):
        """Finish a non-coding feature and link it to its parent gene.

        The feature id gets a numeric suffix: its position among the parent
        gene's children when a parent is found, otherwise a running count of
        orphans of this feature type.

        Returns:
            out_feat, modified in place.
        """
        out_feat["type"] = feat_type

        # this prevents big misc_features from blowing up the genome size
        if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
            del out_feat['dna_sequence']

        parent_id = self._find_parent_gene(gene_id, out_feat)
        if not parent_id:
            self.orphan_types[feat_type] += 1
            out_feat['id'] += "_" + str(self.orphan_types[feat_type])
            return out_feat

        siblings = self.genes[parent_id].setdefault('children', [])
        out_feat['id'] += "_" + str(len(siblings) + 1)
        siblings.append(out_feat['id'])
        out_feat['parent_gene'] = parent_id
        return out_feat

    def process_mrna(self, gene_id, out_feat):
        """Link an mRNA feature to its parent gene and store it in ``self.mrnas``.

        With ``self.generate_parents`` set, an unknown ``gene_id`` is first
        registered as a gene built from a copy of the mRNA. An mRNA with a parent
        is named ``<gene>_mRNA_<n>``; one without is named ``mRNA_<n>`` from the
        orphan counter and gets a warning.
        """
        if gene_id not in self.genes and self.generate_parents:
            self.process_gene(gene_id, copy.copy(out_feat))

        parent_id = self._find_parent_gene(gene_id, out_feat)
        if parent_id:
            sibling_ids = self.genes[parent_id]['mrnas']
            out_feat['id'] = "_".join(
                [parent_id, "mRNA", str(len(sibling_ids) + 1)])
            sibling_ids.append(out_feat['id'])
            out_feat['parent_gene'] = parent_id
        else:
            self.orphan_types['mrna'] += 1
            orphan_id = f"mRNA_{self.orphan_types['mrna']}"
            out_feat['id'] = orphan_id
            missing_parent = 'Unable to find parent gene for ' + str(orphan_id)
            # Build a new list rather than appending to the existing one.
            out_feat['warnings'] = out_feat.get('warnings', []) + [missing_parent]

        self.mrnas[out_feat['id']] = out_feat

    def process_cds(self, gene_id, feat_seq, in_feature, out_feat):
        """Link a CDS to its parent gene/mRNA, check its translation and store it.

        The CDS is attached to the gene returned by ``_find_parent_gene``. When
        no gene matches, ``self.generate_parents`` is set and ``gene_id`` is not
        yet known, a placeholder ("spoofed") gene is created from a copy of the
        CDS; otherwise the CDS is recorded as an orphan. The protein sequence is
        taken from the ``translation`` qualifier or, when that is empty, from
        translating ``feat_seq`` with ``self.code_table``. Problems found along
        the way are collected in ``out_feat['warnings']``.

        Args:
            gene_id: ID of the gene this CDS is expected to belong to.
            feat_seq: nucleotide sequence of the CDS, handed to ``Seq.translate``.
            in_feature: parsed input feature; only ``qualifiers`` is read here.
            out_feat: output feature dict; updated in place and stored in
                ``self.cdss`` under its final ID.
        """
        # NOTE(review): `warnings` is indexed like a mapping of message templates
        # defined elsewhere in this file, not the stdlib module - confirm.

        # Associate CDS with parents
        cds_warnings = out_feat.get('warnings', [])
        validated_gene_id = self._find_parent_gene(gene_id, out_feat)
        if validated_gene_id:
            out_feat['id'] = "_".join(
                (validated_gene_id, "CDS",
                 str(len(self.genes[validated_gene_id]['cdss']) + 1)))
            self.genes[validated_gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = validated_gene_id
        elif self.generate_parents and gene_id not in self.genes:
            # No gene exists for this CDS: spoof one from a copy of the CDS itself.
            new_feat = copy.copy(out_feat)
            new_feat['id'] = gene_id
            new_feat['warnings'] = [warnings['spoofed_gene']]
            self.orphan_types['gene'] += 1
            self.defects['spoofed_genes'] += 1
            self.process_gene(new_feat['id'], new_feat)

            out_feat['id'] = "_".join(
                (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1)))
            self.genes[gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['cds'] += 1
            out_feat['id'] = f"CDS_{self.orphan_types['cds']}"
            cds_warnings.append(
                f"Unable to find parent gene for {out_feat['id']}")

        # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1.
        # Only the final "CDS" token is swapped: a blanket str.replace would also
        # rewrite a gene ID that happens to contain "CDS", and the result would
        # never match the ID that process_mrna generated.
        mrna_id = "mRNA".join(out_feat["id"].rsplit("CDS", 1))
        if mrna_id in self.mrnas:
            if not is_parent(self.mrnas[mrna_id], out_feat):
                cds_warnings.append(warnings['cds_mrna_cds'].format(mrna_id))
                self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get(
                    'warnings', []) + [warnings['cds_mrna_mrna']]
                self.defects['bad_parent_loc'] += 1
            else:
                out_feat['parent_mrna'] = mrna_id
                self.mrnas[mrna_id]['cds'] = out_feat['id']

        # process protein
        prot_seq = in_feature.qualifiers.get("translation", [""])[0]

        # allow a little slack to account for frameshift and stop codon
        if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4:
            cds_warnings.append(warnings["inconsistent_CDS_length"].format(
                len(feat_seq), len(prot_seq)))
            self.genome_warnings.append(
                warnings['genome_inc_CDS_length'].format(
                    out_feat['id'], len(feat_seq), len(prot_seq)))
            self.genome_suspect = 1

        # A supplied translation is compared against our own translation of the
        # nucleotide sequence; a failure to translate is only a warning.
        try:
            if prot_seq and prot_seq != Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*"):
                cds_warnings.append(warnings["inconsistent_translation"])
                self.defects['cds_seq_not_matching'] += 1

        except TranslationError as e:
            cds_warnings.append("Unable to verify protein sequence:" + str(e))

        # Without a supplied translation, derive one; if that fails the protein
        # stays empty and the warning carries the translation error.
        if not prot_seq:
            try:
                prot_seq = Seq.translate(feat_seq, self.code_table,
                                         cds=True).strip("*")
                cds_warnings.append(warnings["no_translation_supplied"])

            except TranslationError as e:
                cds_warnings.append(warnings["no_translation_supplied"] +
                                    str(e))

        out_feat.update({
            "protein_translation":
            prot_seq,
            "protein_md5":
            hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
            "protein_translation_length":
            len(prot_seq),
        })

        if out_feat.get('parent_gene'):
            propagate_cds_props_to_gene(out_feat,
                                        self.genes[out_feat['parent_gene']])

        if cds_warnings:
            out_feat['warnings'] = cds_warnings

        self.cdss[out_feat['id']] = out_feat
Ejemplo n.º 4
0
class FastaToAssembly:

    def __init__(self, callback_url, scratch, ws_url):
        """Store the scratch directory, the service clients and the allowed alphabets.

        Args:
            callback_url: URL handed to the ``DataFileUtil`` client.
            scratch: working directory used for staged and temporary files.
            ws_url: URL handed to the ``Workspace`` client.
        """
        # Characters accepted in a nucleotide sequence.
        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        # Characters that point to protein rather than nucleotide input.
        self.amino_acid_specific_characters = "PLIFQE"

        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)
        self.ws = Workspace(ws_url)

    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print(f'filtering FASTA file by contig length (min len={min_contig_length} bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print(f'parsing FASTA file: {fasta_file_path}')
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(f' - parsed {assembly_data["num_contigs"]} contigs,{assembly_data["dna_size"]} bp')
        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)
        json.dump(assembly_object_to_save, open(self.scratch+"/example.json", 'w'))

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info

    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        """Construct the WS object data to save from the parsed info and params.

        ``assembly_data`` is updated in place before being passed through
        ``sort_dict``.

        Args:
            assembly_data: statistics dict produced by ``parse_fasta``.
            fasta_file_handle_info: upload result containing ``handle`` with
                an ``hid`` entry.
            params: import parameters. ``assembly_name`` is required; ``type``,
                ``taxon_ref`` and the ``external_source*`` fields are optional.

        Returns:
            The result of ``sort_dict(assembly_data)``.
        """
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = params.get('type', 'Unknown')

        if 'taxon_ref' in params:
            # Rebuild the taxon reference from the object info of the given ref.
            # NOTE(review): fields 6/0/4 are presumably workspace id / object id /
            # version of the Workspace object_info tuple - confirm.
            info = self.ws.get_object_info3(
                {'objects': [{'ref': params['taxon_ref']}]})['infos'][0]
            assembly_data['taxon_ref'] = f'{info[6]}/{info[0]}/{info[4]}'

        # Optional provenance fields are copied through unchanged when supplied.
        for optional_key in ('external_source',
                             'external_source_id',
                             'external_source_origination_date'):
            if optional_key in params:
                assembly_data[optional_key] = params[optional_key]

        return sort_dict(assembly_data)

    def parse_fasta(self, fasta_file_path, params):
        """Inspect every contig of a nucleotide FASTA file and collect statistics.

        Args:
            fasta_file_path: path of the FASTA file to read.
            params: import parameters; an optional ``contig_info`` mapping of
                contig ID to ``is_circ`` / ``description`` overrides is honored.

        Returns:
            A dict with the aggregate ``md5``, ``base_counts``, ``dna_size``,
            ``gc_content`` (None when the file holds no sequence), the
            per-contig ``contigs`` mapping and ``num_contigs``.

        Raises:
            ValueError: if a record has no sequence, contains characters
                outside ``self.valid_chars``, or repeats an earlier contig ID.
        """
        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = params.get('contig_info') or {}

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # Statistics are computed on the upper-cased sequence.
            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                # the description line starts with the id; keep only the rest
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # An empty record would otherwise surface as a bare
            # ZeroDivisionError in the GC computation below.
            if contig_info['length'] == 0:
                raise ValueError(f'The FASTA record {record.id} contains no sequence data')

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = Counter(sequence)
            for character, count in sequence_count_table.items():
                base_counts[character] = base_counts.get(character, 0) + count
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This FASTA file may have amino acids in it instead '
                                         'of the required nucleotides.')
                    raise ValueError(f"This FASTA file has non nucleic acid characters: "
                                     f"{character}")

            # 2) record number of 'N' characters (only set if there are some)
            if 'N' in sequence_count_table:
                contig_info['Ncount'] = sequence_count_table['N']

            # 2b) record if the contig is circular, and any description override
            if record.id in extra_contig_info:
                overrides = extra_contig_info[record.id]
                if 'is_circ' in overrides:
                    contig_info['is_circ'] = int(overrides['is_circ'])
                if 'description' in overrides:
                    contig_info['description'] = str(overrides['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode()).hexdigest()
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC content, rounded to 5 decimal places
            gc_count = sequence_count_table['G'] + sequence_count_table['C']
            contig_info['gc_content'] = round(float(gc_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The FASTA header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')

            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            # the assembly md5 is independent of contig order
            'md5': md5(",".join(sorted(md5_list)).encode()).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data

    @staticmethod
    def fasta_filter_contigs_generator(fasta_record_iter, min_contig_length):
        """ generates SeqRecords iterator for writing from a legacy contigset object """
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(f' - filtered out {rows - rows_added} of {rows} contigs that were shorter '
              f'than {(min_contig_length)} bp.')

    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        """Write a copy of the FASTA file without contigs shorter than the minimum.

        Returns the path of the filtered file, which is the input path with
        ``.filtered.fa`` appended.
        """
        output_path = fasta_file_path + '.filtered.fa'
        kept_records = self.fasta_filter_contigs_generator(
            SeqIO.parse(fasta_file_path, 'fasta'), min_contig_length)
        SeqIO.write(kept_records, output_path, 'fasta')
        return output_path

    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        if len(obj_data["contigs"]) == 0:
            raise ValueError('There are no contigs to save, thus there is no valid assembly.')
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info

    def save_fasta_file_to_shock(self, fasta_file_path):
        """ Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        """
        print(f'Uploading FASTA file ({fasta_file_path}) to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and returning the path to the file"""
        file_path = None
        if 'file' in params:
            if not os.path.isfile(params['file']['path']):
                raise ValueError('KBase Assembly Utils tried to save an assembly, but the calling application specified a file ('+params['file']['path']+') that is missing. Please check the application logs for details.')
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print(f'Downloading file from SHOCK node: {params["shock_id"]}')
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print(f'Downloading file from: {params["ftp_url"]}')
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid FASTA could be extracted based on the input parameters')


    @staticmethod
    def validate_params(params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a FASTA file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required FASTA file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one FASTA file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')