Example #1
    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info
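
A minimal usage sketch for the helper above, assuming it is called from within the same client class (the object ref and field values are hypothetical; the index comments follow the KBase object_info tuple convention and should be treated as an assumption):

    # info is an object_info tuple:
    # [objid, name, type, save_date, version, saved_by, wsid, workspace, chksum, size, meta]
    info = self._get_ws_info('12345/6/7')
    obj_name = info[1]   # object name, e.g. 'my_genome'
    obj_type = info[2]   # typed object name, e.g. 'KBaseGenomes.Genome-17.0'
    ws_name = info[7]    # workspace name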
Example #2
    def export_genome_features_protein_to_fasta(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_features_protein_to_fasta
        print('export_genome_features_protein_to_fasta -- parameters = ' + str(params))

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot run export_genome_features_protein_to_fasta - no "input_ref" field defined.'
            )

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({
            'objects': [{
                'ref': params['input_ref']
            }],
            'includeMetadata': 0,
            'ignoreErrors': 0
        })[0]

        genome_to_protein_fasta_params = {'genome_ref': params['input_ref']}

        # export to file (building from KBase Genome Object)
        output = self.genome_to_genbank(
            ctx, genome_to_protein_fasta_params)[0]['genbank_file']

        #END export_genome_features_protein_to_fasta

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError(
                'Method export_genome_features_protein_to_fasta return value '
                + 'output is not type dict as required.')
        # return the results
        return [output]
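
The docstring above promises an ExportOutput containing a 'shock_id', while the body as shown only retrieves a local file. A hedged sketch of how the result is typically packaged before the END marker (the self.dfu DataFileUtil client, its package_for_download call, and the 'file_path' key are assumptions, not part of the original snippet):

    # Assumptions: self.dfu is a DataFileUtil client, package_for_download zips and
    # uploads a local file to Shock, and the genome_to_genbank result exposes the
    # local path under 'file_path'.
    export_package = self.dfu.package_for_download({
        'file_path': output['file_path'],
        'ws_refs': [params['input_ref']]
    })
    output = {'shock_id': export_package['shock_id']}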
Example #3
class SPAdesUtils:
    """
    Define the SPAdesUtils functions
    """
    SPADES_VERSION = '3.13.0'
    SPADES_BIN = '/opt/SPAdes-' + SPADES_VERSION + '-Linux/bin'

    DISABLE_SPADES_OUTPUT = False  # should be False in production

    # Basic options
    PARAM_IN_SINGLE_CELL = 'single_cell'  # --sc
    PARAM_IN_METAGENOME = 'metagenomic'  # --meta
    PARAM_IN_PLASMID = 'plasmid'  # --plasmid
    PARAM_IN_RNA = 'rna'  # --rna
    PARAM_IN_IONTORRENT = 'iontorrent'  # --iontorrent

    # Pipeline options
    PARAM_IN_ONLY_ERROR_CORR = 'only-error-correction'  # --only-error-correction
    PARAM_IN_ONLY_ASSEMBLER = 'only-assembler'  # --only-assembler
    PARAM_IN_CAREFUL = 'careful'  # --careful
    PARAM_IN_CONTINUE = 'continue'  # --continue
    PARAM_IN_DISABLE_GZIP = 'disable-gzip-output'  # --disable-gzip-output

    # Input parameters
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_CS_NAME = 'output_contigset_name'
    PARAM_IN_READS = 'reads_libraries'
    PARAM_IN_LONG_READS = 'long_reads_libraries'
    PARAM_IN_KMER_SIZES = 'kmer_sizes'
    PARAM_IN_SKIP_ERR_CORRECT = 'skip_error_correction'
    PARAM_IN_MIN_CONTIG_LENGTH = 'min_contig_length'
    PARAM_IN_DNA_SOURCE = 'dna_source'
    PARAM_IN_PIPELINE_OPTION = 'pipeline_options'
    ASSEMBLE_RESULTS_DIR = 'assemble_results'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    THREADS_PER_CORE = 3
    MAX_THREADS = 64  # per email thread with Anton Korobeynikov
    MAX_THREADS_META = 128  # Increase threads for metagenomic assemblies
    MEMORY_OFFSET_GB = 1  # 1GB
    MIN_MEMORY_GB = 5
    MAX_MEMORY_GB_SPADES = 500
    MAX_MEMORY_GB_META_SPADES = 1000
    GB = 1000000000

    # private method definition
    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token, service_ver='release')
        self.au = AssemblyUtil(self.callback_url, token=self.token, service_ver='release')
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir

        self.spades_version = 'SPAdes-' + os.environ['SPADES_VERSION']

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding
        reads info (downloaded as fastq files) and returns a list of reads data in the
        following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file,  # only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})

        if not obj_ids:
            return []

        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                        'read_libraries': reads_params,
                        'interleaved': 'false'
                        })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None):
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data
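
    # Illustrative only (paths and values are hypothetical): a reads_data entry for a
    # paired-end library downloaded with interleaved='false' might look like
    #     {'fwd_file': '/kb/module/work/tmp/lib1_fwd.fastq',
    #      'rev_file': '/kb/module/work/tmp/lib1_rev.fastq',
    #      'type': 'paired',
    #      'seq_tech': 'Illumina',
    #      'reads_ref': '12345/6/1',
    #      'reads_name': 'my_workspace/my_reads'}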

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        _mkdir_p(output_directory)
        spades_output = os.path.join(output_directory, 'spades_output.zip')
        self._zip_folder(out_dir, spades_output)

        output_files.append({'path': spades_output,
                             'name': os.path.basename(spades_output),
                             'label': os.path.basename(spades_output),
                             'description': 'Output file(s) generated by {}'.format(
                                 self.spades_version)})

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders could be included in the archive as well
        if the commented portion is used.
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if (current_line[0] == '>'):
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, ValueError, KeyError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        else:
            fasta_dict[contig_id] = sequence_len
        return fasta_dict

    def _parse_single_reads(self, reads_type, reads_list):
        """
        _parse_single_reads: given the reads_type and a list of reads, return an object
        defining the type and a list of fastq files.
        """
        single_reads_fqs = []
        ret_obj = {}
        if reads_list and isinstance(reads_list, list):
            for rds in reads_list:
                single_reads_fqs.append(rds['fwd_file'])
        if single_reads_fqs:
            ret_obj = {
                "type": reads_type,
                "single reads": single_reads_fqs
            }

        return ret_obj

    def _parse_pair_reads(self, reads_type, reads_list):
        """
        _parse_pair_reads: given the reads_type and a list of reads, return an object
        defining the type and a list of fastq files.
        """
        right_reads_fqs = []
        left_reads_fqs = []
        ret_obj = {}
        if reads_list and isinstance(reads_list, list):
            for rds in reads_list:
                right_reads_fqs.append(rds['fwd_file'])
                if rds.get('rev_file', None):
                    left_reads_fqs.append(rds['rev_file'])
            orent = reads_list[0]['orientation']

        if right_reads_fqs:
            ret_obj["right reads"] = right_reads_fqs
            ret_obj["orientation"] = orent
            ret_obj["type"] = reads_type
        if left_reads_fqs:
            ret_obj["left reads"] = left_reads_fqs

        return ret_obj
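
    # Illustrative only (hypothetical paths): for two paired-end reads dicts
    #     [{'fwd_file': '/tmp/a_1.fq', 'rev_file': '/tmp/a_2.fq', 'orientation': 'fr'},
    #      {'fwd_file': '/tmp/b_1.fq', 'rev_file': '/tmp/b_2.fq', 'orientation': 'fr'}]
    # _parse_pair_reads('paired-end', reads_list) returns
    #     {'right reads': ['/tmp/a_1.fq', '/tmp/b_1.fq'],
    #      'left reads': ['/tmp/a_2.fq', '/tmp/b_2.fq'],
    #      'orientation': 'fr',
    #      'type': 'paired-end'}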
    # end of private methods

    # public method definitions

    def check_spades_params(self, params):
        """
        check_spades_params: checks params passed to the run_HybridSPAdes method and sets default values
        """
        # log('Start validating run_HybridSPAdes parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_WS))
        if self.INVALID_WS_NAME_RE.search(params[self.PARAM_IN_WS]):
            raise ValueError('Invalid workspace name: {}.'.format(params[self.PARAM_IN_WS]))

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if params.get(self.PARAM_IN_READS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_READS))
        if not isinstance(params[self.PARAM_IN_READS], list):
            raise ValueError('Input reads {} must be a list.'.format(self.PARAM_IN_READS))
        if len(params[self.PARAM_IN_READS]) == 0:
            raise ValueError('Input parameter {} should have at least one reads library.'.format(
                             self.PARAM_IN_READS))

        if self.PARAM_IN_MIN_CONTIG_LENGTH in params:
            if not isinstance(params[self.PARAM_IN_MIN_CONTIG_LENGTH], int):
                raise ValueError('{} must be of type int.'.format(self.PARAM_IN_MIN_CONTIG_LENGTH))

        if not params.get(self.PARAM_IN_KMER_SIZES, None):
            params[self.PARAM_IN_KMER_SIZES] = [21, 33, 55]
        kmer_sstr = ",".join(str(num) for num in params[self.PARAM_IN_KMER_SIZES])
        params[self.PARAM_IN_KMER_SIZES] = kmer_sstr
        print("KMER_SIZES: " + kmer_sstr)

        if params.get(self.PARAM_IN_SKIP_ERR_CORRECT, None):
            print("SKIP ERR CORRECTION: " + str(params[self.PARAM_IN_SKIP_ERR_CORRECT]))

        # check for basic option parameters
        if params.get(self.PARAM_IN_DNA_SOURCE, None):
            dna_src = params[self.PARAM_IN_DNA_SOURCE]
            if dna_src not in [self.PARAM_IN_SINGLE_CELL,
                               self.PARAM_IN_METAGENOME,
                               self.PARAM_IN_PLASMID,
                               self.PARAM_IN_RNA,
                               self.PARAM_IN_IONTORRENT]:
                params[self.PARAM_IN_DNA_SOURCE] = None
        else:
            params[self.PARAM_IN_DNA_SOURCE] = None

        # a list of basic options
        params['basic_options'] = ['-o', self.ASSEMBLE_RESULTS_DIR]
        dna_src = params.get(self.PARAM_IN_DNA_SOURCE)
        if dna_src == self.PARAM_IN_SINGLE_CELL:
            params['basic_options'].append('--sc')
        elif dna_src == self.PARAM_IN_METAGENOME:
            params['basic_options'].append('--meta')
        elif dna_src == self.PARAM_IN_PLASMID:
            params['basic_options'].append('--plasmid')
        elif dna_src == self.PARAM_IN_RNA:
            params['basic_options'].append('--rna')
        elif dna_src == self.PARAM_IN_IONTORRENT:
            params['basic_options'].append('--iontorrent')

        # processing pipeline option parameters
        if params.get(self.PARAM_IN_PIPELINE_OPTION, None):
            pipe_opts = params[self.PARAM_IN_PIPELINE_OPTION]
            opts = [self.PARAM_IN_ONLY_ERROR_CORR,
                    self.PARAM_IN_ONLY_ASSEMBLER,
                    self.PARAM_IN_CONTINUE,
                    self.PARAM_IN_DISABLE_GZIP,
                    self.PARAM_IN_CAREFUL]
            if any(elem in opts for elem in pipe_opts):
                pass
            else:
                params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL]
        else:
            params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL]

        if '--meta' in params['basic_options']:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode!
            for opt in (self.PARAM_IN_CAREFUL, 'mismatch-correction', 'cov-cutoff'):
                try:
                    params[self.PARAM_IN_PIPELINE_OPTION].remove(opt)
                except ValueError:
                    pass

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params
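
    # Illustrative only (workspace/object names are hypothetical): for a minimal input
    #     {'workspace_name': 'my_ws',
    #      'output_contigset_name': 'my_contigs',
    #      'reads_libraries': [{'lib_ref': '123/4/5', 'lib_type': 'paired-end'}]}
    # check_spades_params fills in the defaults shown above, e.g.
    #     kmer_sizes       -> '21,33,55'
    #     dna_source       -> None
    #     basic_options    -> ['-o', 'assemble_results']
    #     pipeline_options -> ['careful']
    #     create_report    -> 0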

    def generate_report(self, fa_file_name, params, out_dir, wsname):
        """
        Generating and saving report
        """
        log('Generating and saving report')

        fa_file_with_path = os.path.join(out_dir, fa_file_name)
        fasta_stats = self._load_stats(fa_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = wsname + '/' + params[self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'SPAdes results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report_text += ('   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' +
                            str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST(
            {'files': [{'path': fa_file_with_path, 'label': params[self.PARAM_IN_CS_NAME]}]})

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report(
            {'message': report_text,
             'objects_created': [{'ref': assembly_ref, 'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'file_links': output_files,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}
                            ],
             'report_object_name': 'kb_spades_report_' + str(uuid.uuid4()),
             'workspace_name': params[self.PARAM_IN_WS]})

        return report_output['name'], report_output['ref']

    def get_hybrid_reads_info(self, input_params):
        """
        get_hybrid_reads_info--from a list of ReadsParams structures fetches the corresponding
        reads info with the ReadsParams[lib_ref]
        returns an empty tuple or a tuple of nine reads-data lists, each element having
        the following structure:
        {
                'fwd_file': path_to_fastq_file,
                'orientation': (default value is "fr" (forward-reverse) for paired-end libraries
                                "rf" (reverse-forward) for mate-pair libraries), None for others
                'lib_type': ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio",
                              "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"),
                'type': reads_type, # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file  # only if paired end
        }
        OR:
        {
                'fwd_file': path_to_fastq_file,
                'long_reads_type': ("pacbio-ccs", "pacbio-clr", "nanopore", "sanger",
                                    "trusted-contigs", "untrusted-contigs"),
                'type': reads_type, # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience
        }
        """
        rds_params = copy.deepcopy(input_params)
        if rds_params.get(self.PARAM_IN_READS, None) is None:
            return ()  # an empty tuple

        wsname = rds_params[self.PARAM_IN_WS]

        sgl_rds_data = []  # single
        pe_rds_data = []   # paired-end
        mp_rds_data = []   # mate-pairs
        pb_ccs_data = []   # pacbio-ccs
        pb_clr_data = []   # pacbio-clr
        np_rds_data = []   # nanopore
        sgr_rds_data = []  # sanger
        tr_ctg_data = []   # trusted-contigs
        ut_ctg_data = []   # untrusted-contigs

        # a list of Illumina or IonTorrent paired-end/high-quality mate-pairs/unpaired reads
        rds_refs = []

        rds_libs = rds_params[self.PARAM_IN_READS]
        for rds_lib in rds_libs:
            if rds_lib.get('lib_ref', None):
                rds_refs.append(rds_lib['lib_ref'])
        kb_rds_data = self._get_kbreads_info(wsname, rds_refs)

        for rds_lib in rds_libs:
            for kb_d in kb_rds_data:
                if 'lib_ref' in rds_lib and rds_lib['lib_ref'] == kb_d['reads_ref']:
                    if rds_lib['lib_type'] == 'single':  # single end reads grouped params
                        kb_d['orientation'] = None
                        kb_d['lib_type'] = 'single'
                        sgl_rds_data.append(kb_d)
                    elif rds_lib['lib_type'] == 'paired-end':  # pairedEnd reads grouped params
                        kb_d['orientation'] = ('fr' if rds_lib.get('orientation', None) is None
                                               else rds_lib['orientation'])
                        kb_d['lib_type'] = 'paired-end'
                        pe_rds_data.append(kb_d)
                    elif rds_lib['lib_type'] == 'mate-pairs':
                        # mate-pairs reads grouped params
                        kb_d['orientation'] = ('rf' if rds_lib.get('orientation', None) is None
                                               else rds_lib['orientation'])
                        kb_d['lib_type'] = 'mate-pairs'
                        mp_rds_data.append(kb_d)

        # a list of PacBio (CCS or CLR), Oxford Nanopore, Sanger reads
        # and/or additional contigs
        long_rds_refs = []
        if rds_params.get(self.PARAM_IN_LONG_READS, None):
            long_rds_libs = rds_params[self.PARAM_IN_LONG_READS]
            for lrds_lib in long_rds_libs:
                if lrds_lib.get('long_reads_ref', None):
                    long_rds_refs.append(lrds_lib['long_reads_ref'])
            kb_lrds_data = self._get_kbreads_info(wsname, long_rds_refs)

            for lrds_lib in long_rds_libs:
                for kb_ld in kb_lrds_data:
                    if ('long_reads_ref' in lrds_lib and
                            lrds_lib['long_reads_ref'] == kb_ld['reads_ref']):
                        if lrds_lib['long_reads_type'] == 'pacbio-ccs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            pb_ccs_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'pacbio-clr':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            pb_clr_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'nanopore':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            np_rds_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'sanger':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            sgr_rds_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'trusted-contigs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            tr_ctg_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'untrusted-contigs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            ut_ctg_data.append(kb_ld)

        return (sgl_rds_data, pe_rds_data, mp_rds_data, pb_ccs_data, pb_clr_data, np_rds_data,
                sgr_rds_data, tr_ctg_data, ut_ctg_data)

    def construct_yaml_dataset_file(self, sgl_libs=None, pe_libs=None, mp_libs=None,
                                    pb_ccs=None, pb_clr=None, np_libs=None,
                                    sgr_libs=None, tr_ctgs=None, ut_ctgs=None):
        """
        construct_yaml_dataset_file: Specifying input data with YAML data set file (advanced)
        An alternative way to specify an input data set for SPAdes is to create a YAML
        data set file.
        By using a YAML file you can provide an unlimited number of paired-end, mate-pair
        and unpaired libraries. Basically, YAML data set file is a text file, in which input
        libraries are provided as a comma-separated list in square brackets. Each library is
        provided in braces as a comma-separated list of attributes.

        The following attributes are available:

            - orientation ("fr", "rf", "ff")
            - type ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio", "nanopore",
                "sanger", "trusted-contigs", "untrusted-contigs")
            - interlaced reads (comma-separated list of files with interlaced reads)
            - left reads (comma-separated list of files with left reads)
            - right reads (comma-separated list of files with right reads)
            - single reads (comma-separated list of files with single reads or unpaired reads from
                paired library)
            - merged reads (comma-separated list of files with merged reads)

        To properly specify a library you should provide its type and at least one file with reads.
        For ONT, PacBio, Sanger and contig libraries you can provide only single reads. Orientation
        is an optional attribute. Its default value is "fr" (forward-reverse) for paired-end
        libraries and "rf" (reverse-forward) for mate-pair libraries.

        The value for each attribute is given after a colon. Comma-separated lists of files should
        be given in square brackets.
        For each file you should provide its full path in double quotes. Make sure that files with
        right reads are given in the same order as corresponding files with left reads.

        For example, if you have one paired-end library split into two pairs of files:
            lib_pe1_left_1.fastq
            lib_pe1_right_1.fastq
            lib_pe1_left_2.fastq
            lib_pe1_right_2.fastq

        one mate-pair library:
            lib_mp1_left.fastq
            lib_mp1_right.fastq

        and PacBio CCS and CLR reads:
            pacbio_ccs.fastq
            pacbio_clr.fastq

        YAML file should look like this:
        ------------------------------------------------
        [
            {
                orientation: "fr",
                type: "paired-end",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_right_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_right_2.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_left_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_left_2.fastq"
                ]
            },
            {
                orientation: "rf",
                type: "mate-pairs",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_right.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_left.fastq"
                ]
            },
            {
                type: "single",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_ccs.fastq"
                ]
            },
            {
                type: "pacbio",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_clr.fastq"
                ]
            }
        ]
        ------------------------------------------------

        Once you have created a YAML file save it with .yaml extension (e.g. as my_data_set.yaml)
        and run SPAdes using the --dataset option:
        e.g., <SPAdes_bin_dir>/spades.py --dataset <your YAML file> -o spades_output

        """
        # STEP 1: get the working folder housing the .yaml file and the SPAdes results
        if not os.path.exists(self.proj_dir):
            os.makedirs(self.proj_dir)
        yaml_file_path = os.path.join(self.proj_dir, 'input_data_set.yaml')

        # STEP 2: construct and save the 'input_data_set.yaml' file
        # generate the object array
        input_data_set = []

        if pe_libs:
            pair_libs = self._parse_pair_reads('paired-end', pe_libs)
            if pair_libs:
                input_data_set.append(pair_libs)

        if mp_libs:
            pair_libs = self._parse_pair_reads('mate-pairs', mp_libs)
            if pair_libs:
                input_data_set.append(pair_libs)

        # for reads_type = 'single'
        if sgl_libs:
            single_libs = self._parse_single_reads("single", sgl_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'pacbio-ccs', treated as type of 'single'
        if pb_ccs:
            single_libs = self._parse_single_reads("single", pb_ccs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'pacbio-clr'
        if pb_clr:
            single_libs = self._parse_single_reads("pacbio", pb_clr)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'nanopore'
        if np_libs:
            single_libs = self._parse_single_reads("nanopore", np_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'sanger'
        if sgr_libs:
            single_libs = self._parse_single_reads("sanger", sgr_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'trusted-contigs'
        if tr_ctgs:
            single_libs = self._parse_single_reads("trusted-contigs", tr_ctgs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'untrusted-contigs'
        if ut_ctgs:
            single_libs = self._parse_single_reads("untrusted-contigs", ut_ctgs)
            if single_libs:
                input_data_set.append(single_libs)

        if input_data_set == []:
            print('Empty input data set!!')
            return ''

        pprint(input_data_set)
        try:
            with open(yaml_file_path, 'w') as yaml_file:
                json.dump(input_data_set, yaml_file)
        except IOError as ioerr:
            log('Creation of the {} file raised error:\n'.format(yaml_file_path))
            pprint(ioerr)
            return ''
        else:
            return yaml_file_path
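
    # Illustrative only: because JSON is a subset of YAML, the json.dump call above
    # yields a file SPAdes can read via --dataset. For a single paired-end library
    # (hypothetical paths) input_data_set.yaml would contain something like
    #     [{"right reads": ["/tmp/a_1.fq"], "orientation": "fr",
    #       "type": "paired-end", "left reads": ["/tmp/a_2.fq"]}]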

    def run_assemble(self, yaml_file, kmer_sizes, dna_source=None,
                     basic_opts=None, pipeline_opts=['careful']):
        """
        run_assemble: run the SPAdes assemble with given input parameters/options
        """
        exit_code = 1
        if not os.path.isfile(yaml_file):
            log("The input data set yaml file DOES NOT exist at {}\n".format(yaml_file))
            return exit_code

        log("The input data set yaml file exists at {}\n".format(yaml_file))
        yf_dir, yf_nm = os.path.split(yaml_file)

        mem = (psutil.virtual_memory().available // self.GB - self.MEMORY_OFFSET_GB)
        if mem < self.MIN_MEMORY_GB:
            raise ValueError(
                'Only ' + str(psutil.virtual_memory().available) +
                ' bytes of memory are available. The SPAdes wrapper will' +
                ' not run without at least ' +
                str(self.MIN_MEMORY_GB + self.MEMORY_OFFSET_GB) +
                ' gigabytes available')

        if dna_source and dna_source == self.PARAM_IN_METAGENOME:
            max_mem = self.MAX_MEMORY_GB_META_SPADES
            max_threads = self.MAX_THREADS_META
        else:
            max_mem = self.MAX_MEMORY_GB_SPADES
            max_threads = self.MAX_THREADS

        threads = min(max_threads, psutil.cpu_count() * self.THREADS_PER_CORE)

        if mem > max_mem:
            mem = max_mem

        tmpdir = os.path.join(self.proj_dir, 'spades_tmp_dir')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        a_cmd = [os.path.join(self.SPADES_BIN, 'spades.py')]
        a_cmd += ['--threads', str(threads), '--memory', str(mem)]
        a_cmd += ['--tmp-dir', tmpdir]
        a_cmd += ['--dataset', yaml_file]

        if kmer_sizes is not None:
            a_cmd += ['-k', kmer_sizes]

        if basic_opts is None:
            basic_opts = ['-o', self.ASSEMBLE_RESULTS_DIR]
        if isinstance(basic_opts, list):
            a_cmd += basic_opts

        if pipeline_opts and isinstance(pipeline_opts, list):
            for p_opt in pipeline_opts:
                if p_opt == self.PARAM_IN_CAREFUL:
                    a_cmd += ['--careful']
                if p_opt == self.PARAM_IN_ONLY_ERROR_CORR:
                    a_cmd += ['--only-error-correction']
                if p_opt == self.PARAM_IN_ONLY_ASSEMBLER:
                    a_cmd += ['--only-assembler']
                if p_opt == self.PARAM_IN_CONTINUE:
                    a_cmd += ['--continue']
                if p_opt == self.PARAM_IN_DISABLE_GZIP:
                    a_cmd += ['--disable-gzip-output']

        # Last check of command options before the call
        if '--meta' in a_cmd:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode!
            for opt in ('--careful', '--mismatch-correction', '--cov-cutoff'):
                try:
                    a_cmd.remove(opt)
                except ValueError:
                    pass

        log("**************The HybridSPAdes assembling command is:\n{}".format(' '.join(a_cmd)))
        assemble_out_dir = os.path.join(self.proj_dir, self.ASSEMBLE_RESULTS_DIR)
        if not os.path.exists(assemble_out_dir):
            os.makedirs(assemble_out_dir)

        p = subprocess.Popen(a_cmd, cwd=yf_dir, shell=False)
        exit_code = p.wait()
        log('Return code: ' + str(exit_code))

        if p.returncode != 0:
            raise ValueError('Error running spades.py, return code: ' + str(p.returncode) + '\n')
        else:
            exit_code = p.returncode
        return exit_code
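
    # Illustrative only (thread and memory figures are hypothetical): a non-metagenomic
    # run with the default options builds a command along the lines of
    #     /opt/SPAdes-3.13.0-Linux/bin/spades.py --threads 32 --memory 250
    #         --tmp-dir <proj_dir>/spades_tmp_dir --dataset <yaml_file>
    #         -k 21,33,55 -o assemble_results --careful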

    def save_assembly(self, fa_file_path, wsname, a_name, min_ctg_length=0):
        """
        save_assembly: save the assembly to KBase workspace
        """
        if os.path.isfile(fa_file_path):
            log('Uploading FASTA file to Assembly...')
            if min_ctg_length > 0:
                self.au.save_assembly_from_fasta(
                            {'file': {'path': fa_file_path},
                             'workspace_name': wsname,
                             'assembly_name': a_name,
                             'min_contig_length': min_ctg_length})
            else:
                self.au.save_assembly_from_fasta(
                            {'file': {'path': fa_file_path},
                             'workspace_name': wsname,
                             'assembly_name': a_name})
        else:
            log("The resulting sequence file {} is not found.".format(fa_file_path))
class NarrativeManager:

    KB_CELL = 'kb-cell'
    KB_TYPE = 'type'
    KB_APP_CELL = 'kb_app'
    KB_FUNCTION_CELL = 'function_input'
    KB_OUTPUT_CELL = 'function_output'
    KB_ERROR_CELL = 'kb_error'
    KB_CODE_CELL = 'kb_code'
    KB_STATE = 'widget_state'

    DEBUG = False

    DATA_PALETTES_TYPES = DataPaletteTypes(False)

    def __init__(self, config, ctx, set_api_client, data_palette_client):
        self.narrativeMethodStoreURL = config['narrative-method-store']
        self.set_api_cache = set_api_client  # DynamicServiceCache type
        self.data_palette_client = data_palette_client  # DynamicServiceCache type
        self.token = ctx["token"]
        self.user_id = ctx["user_id"]
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.intro_md_file = config['intro-markdown-file']
        # We switch DPs on only for internal Continuous Integration environment for now:
        if config['kbase-endpoint'].startswith("https://ci.kbase.us/") or \
           'USE_DP' in os.environ:
            self.DATA_PALETTES_TYPES = DataPaletteTypes(True)

    def list_objects_with_sets(self,
                               ws_id=None,
                               ws_name=None,
                               workspaces=None,
                               types=None,
                               include_metadata=0,
                               include_data_palettes=0):
        if not workspaces:
            if not ws_id and not ws_name:
                raise ValueError(
                    "One and only one of 'ws_id', 'ws_name', 'workspaces' " +
                    "parameters should be set")
            workspaces = [self._get_workspace_name_or_id(ws_id, ws_name)]
        return self._list_objects_with_sets(workspaces, types,
                                            include_metadata,
                                            include_data_palettes)

    def _list_objects_with_sets(self, workspaces, types, include_metadata,
                                include_data_palettes):
        type_map = None
        if types is not None:
            type_map = {key: True for key in types}

        processed_refs = {}
        data = []
        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: processing sets")
        t1 = time.time()
        set_ret = self.set_api_cache.call_method(
            "list_sets", [{
                'workspaces': workspaces,
                'include_set_item_info': 1,
                'include_metadata': include_metadata
            }], self.token)
        sets = set_ret['sets']
        for set_info in sets:
            # Process
            target_set_items = []
            for set_item in set_info['items']:
                target_set_items.append(set_item['info'])
            if self._check_info_type(set_info['info'], type_map):
                data_item = {
                    'object_info': set_info['info'],
                    'set_items': {
                        'set_items_info': target_set_items
                    }
                }
                data.append(data_item)
                processed_refs[set_info['ref']] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t1) + ")")

        if self.DEBUG:
            print("NarrativeManager._list_objects_with_sets: loading ws_info")
        t2 = time.time()
        ws_info_list = []
        # for ws in workspaces:
        if len(workspaces) == 1:
            ws = workspaces[0]
            ws_id = None
            ws_name = None
            if str(ws).isdigit():
                ws_id = int(ws)
            else:
                ws_name = str(ws)
            ws_info_list.append(
                self.ws.get_workspace_info({
                    "id": ws_id,
                    "workspace": ws_name
                }))
        else:
            ws_map = {key: True for key in workspaces}
            for ws_info in self.ws.list_workspace_info({'perm': 'r'}):
                if ws_info[1] in ws_map or str(ws_info[0]) in ws_map:
                    ws_info_list.append(ws_info)
        if self.DEBUG:
            print("    (time=" + str(time.time() - t2) + ")")

        if self.DEBUG:
            print(
                "NarrativeManager._list_objects_with_sets: loading workspace objects"
            )
        t3 = time.time()
        for info in WorkspaceListObjectsIterator(
                self.ws,
                ws_info_list=ws_info_list,
                list_objects_params={'includeMetadata': include_metadata}):
            item_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
            if item_ref not in processed_refs and self._check_info_type(
                    info, type_map):
                data_item = {'object_info': info}
                data.append(data_item)
                processed_refs[item_ref] = data_item
        if self.DEBUG:
            print("    (time=" + str(time.time() - t3) + ")")

        return_data = {"data": data}

        if include_data_palettes == 1:
            if self.DEBUG:
                print(
                    "NarrativeManager._list_objects_with_sets: processing DataPalettes"
                )
            t5 = time.time()
            dp_ret = self.data_palette_client.call_method(
                "list_data", [{
                    'workspaces': workspaces,
                    'include_metadata': include_metadata
                }], self.token)
            for item in dp_ret['data']:
                ref = item['ref']
                if self._check_info_type(item['info'], type_map):
                    data_item = None
                    if ref in processed_refs:
                        data_item = processed_refs[ref]
                    else:
                        data_item = {'object_info': item['info']}
                        processed_refs[ref] = data_item
                        data.append(data_item)
                    dp_info = {}
                    if 'dp_ref' in item:
                        dp_info['ref'] = item['dp_ref']
                    if 'dp_refs' in item:
                        dp_info['refs'] = item['dp_refs']
                    data_item['dp_info'] = dp_info
            return_data["data_palette_refs"] = dp_ret['data_palette_refs']
            if self.DEBUG:
                print("    (time=" + str(time.time() - t5) + ")")

        return return_data

    def _check_info_type(self, info, type_map):
        if type_map is None:
            return True
        obj_type = info[2].split('-')[0]
        return type_map.get(obj_type, False)
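
    # Illustrative only: info[2] holds a versioned type string such as
    # 'KBaseGenomes.Genome-17.0' (hypothetical), so the comparison above uses the
    # unversioned part 'KBaseGenomes.Genome' against the requested type_map.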

    def copy_narrative(self, newName, workspaceRef, workspaceId):
        time_ms = int(round(time.time() * 1000))
        newWsName = self.user_id + ':narrative_' + str(time_ms)
        # add the 'narrative' field to newWsMeta later.
        newWsMeta = {"narrative_nice_name": newName, "searchtags": "narrative"}

        # start with getting the existing narrative object.
        currentNarrative = self.ws.get_objects([{'ref': workspaceRef}])[0]
        if not workspaceId:
            workspaceId = currentNarrative['info'][6]
        # Let's prepare exceptions for clone the workspace.
        # 1) currentNarrative object:
        excluded_list = [{'objid': currentNarrative['info'][0]}]
        # 2) let's exclude objects of types under DataPalette handling:

        ## DP CODE
        # data_palette_type = "DataPalette.DataPalette"
        # excluded_types = [data_palette_type]
        # excluded_types.extend(self.DATA_PALETTES_TYPES.keys())
        # add_to_palette_list = []
        # dp_detected = False
        ## END DP CODE
        # for obj_type in excluded_types:
        #     list_objects_params = {'type': obj_type}
        ## DP CODE
        # if obj_type == data_palette_type:
        #     list_objects_params['showHidden'] = 1
        ## END DP CODE
        # for info in WorkspaceListObjectsIterator(self.ws,
        #                                          ws_id=workspaceId,
        #                                          list_objects_params=list_objects_params):
        ## DP CODE
        # if obj_type == data_palette_type:
        # dp_detected = True
        # else:
        #     add_to_palette_list.append({
        #         'ref': str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        #     })
        ## END DP CODE
        # excluded_list.append({'objid': info[0]})
        # clone the workspace EXCEPT for currentNarrative object
        newWsId = self.ws.clone_workspace({
            'wsi': {
                'id': workspaceId
            },
            'workspace': newWsName,
            'meta': newWsMeta,
            'exclude': excluded_list
        })[0]
        try:
            ## DP CODE
            # if dp_detected:
            #     self.data_palette_client.call_method(
            #         "copy_palette",
            #         [{'from_workspace': str(workspaceId), 'to_workspace': str(newWsId)}],
            #         self.token
            #     )
            # if len(add_to_palette_list) > 0:
            #     # There are objects in source workspace that have type under DataPalette handling
            #     # but these objects are physically stored in source workspace rather that saved
            #     # in DataPalette object. So they weren't copied by "dps.copy_palette".
            #     self.data_palette_client.call_method(
            #         "add_to_palette",
            #         [{'workspace': str(newWsId), 'new_refs': add_to_palette_list}],
            #         self.token
            #     )
            ## END DP CODE

            # update the ref inside the narrative object and the new workspace metadata.
            newNarMetadata = currentNarrative['info'][10]
            newNarMetadata['name'] = newName
            newNarMetadata['ws_name'] = newWsName
            newNarMetadata['job_info'] = json.dumps({
                'queue_time': 0,
                'running': 0,
                'completed': 0,
                'run_time': 0,
                'error': 0
            })

            is_temporary = newNarMetadata.get('is_temporary', 'false')
            if 'is_temporary' not in newNarMetadata:
                if newNarMetadata['name'] == 'Untitled' or newNarMetadata['name'] is None:
                    is_temporary = 'true'
                newNarMetadata['is_temporary'] = is_temporary

            currentNarrative['data']['metadata']['name'] = newName
            currentNarrative['data']['metadata']['ws_name'] = newWsName
            currentNarrative['data']['metadata']['job_ids'] = {
                'apps': [],
                'methods': [],
                'job_usage': {
                    'queue_time': 0,
                    'run_time': 0
                }
            }
            # save the shiny new Narrative so it's at version 1
            newNarInfo = self.ws.save_objects({
                'id': newWsId,
                'objects': [{
                    'type': currentNarrative['info'][2],
                    'data': currentNarrative['data'],
                    'provenance': currentNarrative['provenance'],
                    'name': currentNarrative['info'][1],
                    'meta': newNarMetadata
                }]
            })
            # now, just update the workspace metadata to point
            # to the new narrative object

            if 'worksheets' in currentNarrative['data']:  # handle legacy.
                num_cells = len(
                    currentNarrative['data']['worksheets'][0]['cells'])
            else:
                num_cells = len(currentNarrative['data']['cells'])
            newNarId = newNarInfo[0][0]
            self.ws.alter_workspace_metadata({
                'wsi': {
                    'id': newWsId
                },
                'new': {
                    'narrative': str(newNarId),
                    'is_temporary': is_temporary,
                    'cell_count': str(num_cells)
                }
            })
            return {'newWsId': newWsId, 'newNarId': newNarId}
        except Exception:
            # let's delete copy of workspace so it's out of the way - it's broken
            self.ws.delete_workspace({'id': newWsId})
            raise

    def create_new_narrative(self, app, method, appparam, appData, markdown,
                             copydata, importData, includeIntroCell, title):
        if app and method:
            raise ValueError(
                "Must provide no more than one of the app or method params")

        if not importData and copydata:
            importData = copydata.split(';')

        if not appData and appparam:
            appData = []
            for tmp_item in appparam.split(';'):
                tmp_tuple = tmp_item.split(',')
                step_pos = None
                if tmp_tuple[0]:
                    try:
                        step_pos = int(tmp_tuple[0])
                    except ValueError:
                        pass
                appData.append([step_pos, tmp_tuple[1], tmp_tuple[2]])
        cells = None
        if app:
            cells = [{"app": app}]
        elif method:
            cells = [{"method": method}]
        elif markdown:
            cells = [{"markdown": markdown}]
        narr_info = self._create_temp_narrative(cells, appData, importData,
                                                includeIntroCell, title)
        if title is not None:
            # update workspace info so it's not temporary
            pass
        return narr_info

    def _get_intro_markdown(self):
        """
        Creates and returns a cell with the introductory text included.
        """
        # Load introductory markdown text
        with open(self.intro_md_file) as intro_file:
            intro_md = intro_file.read()
        return intro_md

    def _create_temp_narrative(self, cells, parameters, importData,
                               includeIntroCell, title):
        # Migration to python of JavaScript class from https://github.com/kbase/kbase-ui/blob/4d31151d13de0278765a69b2b09f3bcf0e832409/src/client/modules/plugins/narrativemanager/modules/narrativeManager.js#L414
        narr_id = int(round(time.time() * 1000))
        workspaceName = self.user_id + ':narrative_' + str(narr_id)
        narrativeName = "Narrative." + str(narr_id)

        ws = self.ws
        ws_info = ws.create_workspace({
            'workspace': workspaceName,
            'description': ''
        })
        narrativeObject, metadataExternal = self._fetchNarrativeObjects(
            workspaceName, cells, parameters, includeIntroCell, title)
        is_temporary = 'true'
        if title is not None and title != 'Untitled':
            is_temporary = 'false'

        metadataExternal['is_temporary'] = is_temporary
        objectInfo = ws.save_objects({
            'workspace': workspaceName,
            'objects': [{
                'type': 'KBaseNarrative.Narrative',
                'data': narrativeObject,
                'name': narrativeName,
                'meta': metadataExternal,
                'provenance': [{
                    'script': 'NarrativeManager.py',
                    'description': 'Created new Workspace/Narrative bundle.'
                }],
                'hidden': 0
            }]
        })[0]
        objectInfo = ServiceUtils.object_info_to_object(objectInfo)
        ws_info = self._completeNewNarrative(ws_info[0], objectInfo['id'],
                                             importData, is_temporary, title,
                                             len(narrativeObject['cells']))
        return {
            'workspaceInfo': ServiceUtils.workspace_info_to_object(ws_info),
            'narrativeInfo': objectInfo
        }

    def _fetchNarrativeObjects(self, workspaceName, cells, parameters,
                               includeIntroCell, title):
        if not cells:
            cells = []
        if not title:
            title = 'Untitled'

        # fetchSpecs
        appSpecIds = []
        methodSpecIds = []
        specMapping = {'apps': {}, 'methods': {}}
        for cell in cells:
            if 'app' in cell:
                appSpecIds.append(cell['app'])
            elif 'method' in cell:
                methodSpecIds.append(cell['method'])
        nms = NarrativeMethodStore(self.narrativeMethodStoreURL,
                                   token=self.token)
        if len(appSpecIds) > 0:
            appSpecs = nms.get_app_spec({'ids': appSpecIds})
            for spec in appSpecs:
                spec_id = spec['info']['id']
                specMapping['apps'][spec_id] = spec
        if len(methodSpecIds) > 0:
            methodSpecs = nms.get_method_spec({'ids': methodSpecIds})
            for spec in methodSpecs:
                spec_id = spec['info']['id']
                specMapping['methods'][spec_id] = spec
        # end of fetchSpecs

        metadata = {
            'job_ids': {
                'methods': [],
                'apps': [],
                'job_usage': {
                    'queue_time': 0,
                    'run_time': 0
                }
            },
            'format': 'ipynb',
            'creator': self.user_id,
            'ws_name': workspaceName,
            'name': title,
            'type': 'KBaseNarrative.Narrative',
            'description': '',
            'data_dependencies': []
        }
        cellData = self._gatherCellData(cells, specMapping, parameters,
                                        includeIntroCell)
        narrativeObject = {
            'nbformat_minor': 0,
            'cells': cellData,
            'metadata': metadata,
            'nbformat': 4
        }
        metadataExternal = {}
        for key in metadata:
            value = metadata[key]
            if isinstance(value, str):
                metadataExternal[key] = value
            else:
                metadataExternal[key] = json.dumps(value)
        return [narrativeObject, metadataExternal]

    def _gatherCellData(self, cells, specMapping, parameters,
                        includeIntroCell):
        cell_data = []
        if includeIntroCell == 1:
            cell_data.append({
                'cell_type': 'markdown',
                'source': self._get_intro_markdown(),
                'metadata': {}
            })
        for cell_pos, cell in enumerate(cells):
            if 'app' in cell:
                cell_data.append(
                    self._buildAppCell(len(cell_data),
                                       specMapping['apps'][cell['app']],
                                       parameters))
            elif 'method' in cell:
                cell_data.append(
                    self._buildMethodCell(
                        len(cell_data), specMapping['methods'][cell['method']],
                        parameters))
            elif 'markdown' in cell:
                cell_data.append({
                    'cell_type': 'markdown',
                    'source': cell['markdown'],
                    'metadata': {}
                })
            else:
                raise ValueError("cannot add cell #" + str(cell_pos) +
                                 ", unrecognized cell content")
        return cell_data

    def _buildAppCell(self, pos, spec, params):
        cellId = 'kb-cell-' + str(pos) + '-' + str(uuid.uuid4())
        cell = {
            "cell_type": "markdown",
            "source": "<div id='" + cellId + "'></div>" +
                      "\n<script>" +
                      "$('#" + cellId + "').kbaseNarrativeAppCell({'appSpec' : '" +
                      self._safeJSONStringify(spec) + "', 'cellId' : '" + cellId + "'});" +
                      "</script>",
            "metadata": {}
        }
        cellInfo = {}
        widgetState = []
        cellInfo[self.KB_TYPE] = self.KB_APP_CELL
        cellInfo['app'] = spec
        if params:
            steps = {}
            for param in params:
                stepid = 'step_' + str(param[0])
                if stepid not in steps:
                    steps[stepid] = {}
                    steps[stepid]['inputState'] = {}
                steps[stepid]['inputState'][param[1]] = param[2]
            state = {'state': {'step': steps}}
            widgetState.append(state)
        cellInfo[self.KB_STATE] = widgetState
        cell['metadata'][self.KB_CELL] = cellInfo
        return cell

    def _buildMethodCell(self, pos, spec, params):
        cellId = "kb-cell-" + str(pos) + "-" + str(uuid.uuid4())
        cell = {
            "cell_type":
            "markdown",
            "source":
            "<div id='" + cellId + "'></div>" + "\n<script>" + "$('#" +
            cellId + "').kbaseNarrativeMethodCell({'method' : '" +
            self._safeJSONStringify(spec) + "'});" + "</script>",
            "metadata": {}
        }
        cellInfo = {"method": spec, "widget": spec["widgets"]["input"]}
        cellInfo[self.KB_TYPE] = self.KB_FUNCTION_CELL
        widgetState = []
        if params:
            wparams = {}
            for param in params:
                wparams[param[1]] = param[2]
            widgetState.append({"state": wparams})
        cellInfo[self.KB_STATE] = widgetState
        cell["metadata"][self.KB_CELL] = cellInfo
        return cell

    def _completeNewNarrative(self, workspaceId, objectId, importData,
                              is_temporary, title, num_cells):
        """
        'Completes' the new narrative by updating workspace metadata with the required fields and
        copying in data from the importData list of references.
        """
        new_meta = {
            'narrative': str(objectId),
            'is_temporary': is_temporary,
            'searchtags': 'narrative',
            'cell_count': str(num_cells)
        }
        if is_temporary == 'false' and title is not None:
            new_meta['narrative_nice_name'] = title

        self.ws.alter_workspace_metadata({
            'wsi': {
                'id': workspaceId
            },
            'new': new_meta
        })
        # copy_to_narrative:
        if importData:
            objectsToCopy = [{'ref': x} for x in importData]
            infoList = self.ws.get_object_info_new({
                'objects': objectsToCopy,
                'includeMetadata': 0
            })
            for item in infoList:
                objectInfo = ServiceUtils.object_info_to_object(item)
                self.copy_object(objectInfo['ref'], workspaceId, None, None,
                                 objectInfo)

        return self.ws.get_workspace_info({'id': workspaceId})

    def _safeJSONStringify(self, obj):
        return json.dumps(self._safeJSONStringifyPrepare(obj))

    def _safeJSONStringifyPrepare(self, obj):
        if isinstance(obj, str):
            return obj.replace("'", "&apos;").replace('"', "&quot;")
        elif isinstance(obj, list):
            for pos in range(len(obj)):
                obj[pos] = self._safeJSONStringifyPrepare(obj[pos])
        elif isinstance(obj, dict):
            obj_keys = list(obj.keys())
            for key in obj_keys:
                obj[key] = self._safeJSONStringifyPrepare(obj[key])
        else:
            pass  # it's boolean/int/float/None
        return obj
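
Because the serialized spec is embedded inside a single-quoted JavaScript call in the generated markdown source, quote characters are swapped for HTML entities first. A self-contained sketch of the same recursive escaping (the function name and sample spec are illustrative only):

import json

def safe_json_stringify(obj):
    # recursively escape quotes in strings; other scalars pass through unchanged
    if isinstance(obj, str):
        return obj.replace("'", "&apos;").replace('"', "&quot;")
    if isinstance(obj, list):
        return [safe_json_stringify(item) for item in obj]
    if isinstance(obj, dict):
        return {key: safe_json_stringify(value) for key, value in obj.items()}
    return obj

spec = {'name': 'John\'s "special" app', 'steps': [1, 2]}
print(json.dumps(safe_json_stringify(spec)))
# {"name": "John&apos;s &quot;special&quot; app", "steps": [1, 2]}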

    def _get_workspace_name_or_id(self, ws_id, ws_name):
        ret = ws_name
        if not ret:
            ret = str(ws_id)
        return ret

    def copy_object(self, ref, target_ws_id, target_ws_name, target_name,
                    src_info):
        """
        Copies an object from one workspace to another.
        """
        if not target_ws_id and not target_ws_name:
            raise ValueError("Neither target workspace id nor name is defined")
        if not src_info:
            src_info_tuple = self.ws.get_object_info_new({
                'objects': [{
                    'ref': ref
                }],
                'includeMetadata':
                0
            })[0]
            src_info = ServiceUtils.object_info_to_object(src_info_tuple)
        if not target_name:
            target_name = src_info['name']
        obj_info_tuple = self.ws.copy_object({
            'from': {
                'ref': ref
            },
            'to': {
                'wsid': target_ws_id,
                'workspace': target_ws_name,
                'name': target_name
            }
        })
        obj_info = ServiceUtils.object_info_to_object(obj_info_tuple)
        return {'info': obj_info}

    def list_available_types(self, workspaces):
        data = self.list_objects_with_sets(workspaces=workspaces)['data']
        type_stat = {}
        for item in data:
            info = item['object_info']
            obj_type = info[2].split('-')[0]
            if obj_type in type_stat:
                type_stat[obj_type] += 1
            else:
                type_stat[obj_type] = 1
        return {'type_stat': type_stat}
Ejemplo n.º 5
0
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, AssemblySet, BinnedContigs, Genome, or GenomeSet

        This method creates a directory in the scratch area with the set of Fasta files; the file
        names will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
        ws = Workspace(self.ws_url)

        # 1) generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)


        # 2) based on type, download the files
        obj_name = self.get_data_obj_name (input_ref)
        type_name = self.get_data_obj_type (input_ref)

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate auClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI (url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: '+ self.serviceWizardURL +' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate mguClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))


        # Standard Single Assembly
        #
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            # create file data
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1 ({'ref':input_ref, 'include_item_info':1})
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + input_ref +')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new ({'objects':[{'ref':this_assembly_ref}]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' + this_assembly_ref +'): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            # normalize fasta file extensions, then make sure each bin's fasta file isn't empty
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join (input_dir,fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                        raise ValueError('Binned Assembly is empty for fasta_path: '+str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    if 'ref' not in genomeSet_object['elements'][genome_id] or \
                       genomeSet_object['elements'][genome_id]['ref'] == None or \
                       genomeSet_object['elements'][genome_id]['ref'] == '':
                        raise ValueError('genome_ref not found for genome_id: '+str(genome_id)+' in genomeSet: '+str(input_ref))
                    else:
                        genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])

            # genome obj data
            for i,this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2({'objects':[{'ref':this_input_ref}]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except:
                    raise ValueError ("unable to fetch genome: "+this_input_ref)

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError (msg)
                elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] != None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING assembly_ref: "+str(genome_obj['assembly_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] != None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING contigset_ref: "+str(genome_obj['contigset_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
Ejemplo n.º 6
0
class masurca_utils:
    """
    masurca_utils: defining a system of utils for running masurca
    """
    MaSuRCA_VERSION = 'MaSuRCA-3.2.9'
    MaSuRCA_BIN = '/kb/module/' + MaSuRCA_VERSION + '/bin/masurca'
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_THREADN = 'num_threads'
    PARAM_IN_READS_LIBS = 'reads_libraries'
    PARAM_IN_JUMP_LIBS = 'jump_libraries'
    PARAM_IN_JF_SIZE = 'jf_size'
    PARAM_IN_CS_NAME = 'output_contigset_name'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')

    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token)
        self.au = AssemblyUtil(self.callback_url, token=self.token)
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir
        self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir)

    def _has_long_reads(self, params):
        """
        _has_long_reads: check if a long reads input exists in the parameters
        """
        return (params.get('pacbio_reads', None)
                or params.get('nanopore_reads', None)
                or params.get('other_frg_file', None))

    def _get_data_portion(self,
                          pe_reads_data,
                          jp_reads_data=None,
                          pacbio_reads_file='',
                          nanopore_reads_file='',
                          other_frg_file=''):
        """
        _get_data_portion: build the 'DATA...END' portion for the config.txt file
        """
        data_str = ''
        if pe_reads_data:
            # log('PE reads data details:\n{}'.format(json.dumps(pe_reads_data, indent=1)))
            for pe in pe_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \
                            str(pe['pe_stdev']) + ' ' + pe['fwd_file']
                if pe.get('rev_file', None):
                    data_str += ' ' + pe['rev_file']

        if jp_reads_data:
            # log('JUMP reads data details:\n{}'.format(json.dumps(jp_reads_data, indent=1)))
            for jp in jp_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'JUMP= ' + jp['jp_prefix'] + ' ' + str(jp['jp_mean']) + ' ' + \
                            str(jp['jp_stdev']) + ' ' + jp['fwd_file']
                if jp.get('rev_file', None):
                    data_str += ' ' + jp['rev_file']

        # Adding the pacbio_reads
        # Note that pacbio reads must be in a single fasta file!
        # For example:
        # data_str +='\nPACBIO= /pool/genomics/frandsenp/masurca/PacBio/pacbio_reads.fasta'
        # ***if you have both types of reads supply them both as NANOPORE type***
        if pacbio_reads_file != '':
            if data_str != '':
                data_str += '\n'
            if nanopore_reads_file != '':
                data_str += 'NANOPORE=' + pacbio_reads_file
            else:
                data_str += 'PACBIO=' + pacbio_reads_file

        # Adding the nanopore_reads and note that nanopore reads must be in a single fasta file!
        # For example:
        # data_str +='\nNANOPORE= /pool/genomics/frandsenp/masurca/NanoPore/nanopore_reads.fasta'
        if nanopore_reads_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'NANOPORE= ' + nanopore_reads_file

        # Adding the other_frg_file inputs if any
        # any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first converted into
        # Celera Assembler compatible .frg file
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        if other_frg_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'OTHER=' + other_frg_file

        return data_str
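
For reference, a standalone sketch of the PE line format emitted by the loop above for one paired-end library (the prefix, insert size, and file paths are made up):

pe = {'pe_prefix': 'p1', 'pe_mean': 500, 'pe_stdev': 50,
      'fwd_file': '/kb/module/work/small.forward.fq',
      'rev_file': '/kb/module/work/small.reverse.fq'}
data_str = 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \
           str(pe['pe_stdev']) + ' ' + pe['fwd_file']
if pe.get('rev_file'):
    data_str += ' ' + pe['rev_file']
print(data_str)
# PE= p1 500 50 /kb/module/work/small.forward.fq /kb/module/work/small.reverse.fq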

    def _get_parameters_portion(self, params):
        """
        build the 'PARAMETERS...END' portion for the config.txt file
        """
        # set the default parameters as suggested in the example configuration file
        param_str = (
            "EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE"
            + "=300000000\nLHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0")
        if (params.get('graph_kmer_size', None)
                and type(params['graph_kmer_size']) == int):
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=' + str(params['graph_kmer_size'])
        else:
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=auto'
        if params.get('use_linking_mates', None):
            if param_str != '':
                param_str += '\n'
            if params['use_linking_mates'] == 1 and not self._has_long_reads(
                    params):
                param_str += 'USE_LINKING_MATES=1'
            else:
                param_str += 'USE_LINKING_MATES=0'
        if params.get('limit_jump_coverage', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'LIMIT_JUMP_COVERAGE = ' + str(
                params['limit_jump_coverage'])
        if params.get('cgwErrorRate', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'CA_PARAMETERS = cgwErrorRate=' + str(
                params['cgwErrorRate'])
        if params.get(self.PARAM_IN_THREADN, None):
            if param_str != '':
                param_str += '\n'
            param_str += 'NUM_THREADS = ' + str(params[self.PARAM_IN_THREADN])
        if params.get('jf_size', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'JF_SIZE=' + str(params['jf_size'])
        if params.get('kmer_count_threshold', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'KMER_COUNT_THRESHOLD=' + str(
                params['kmer_count_threshold'])
        if params.get('do_homopolymer_trim', None):
            if param_str != '':
                param_str += '\n'
            if params['do_homopolymer_trim'] == 1:
                param_str += 'DO_HOMOPOLYMER_TRIM=1'
            else:
                param_str += 'DO_HOMOPOLYMER_TRIM=0'
        if params.get('close_gaps', None):
            if param_str != '':
                param_str += '\n'
            if params['close_gaps'] == 1:
                param_str += 'CLOSE_GAPS=1'
            else:
                param_str += 'CLOSE_GAPS=0'
        if params.get('soap_assembly', None):
            if param_str != '':
                param_str += '\n'
            if params['soap_assembly'] == 1:
                param_str += 'SOAP_ASSEMBLY=1'
            else:
                param_str += 'SOAP_ASSEMBLY=0'
        return param_str
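
As a rough illustration, the PARAMETERS text one would expect from the method above for a small, made-up params dict (the first six settings are the hard-coded defaults; values are hypothetical):

params = {'graph_kmer_size': 31, 'use_linking_mates': 1,
          'num_threads': 8, 'jf_size': 100000000}
expected_param_str = (
    'EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE=300000000\n'
    'LHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0\n'
    'GRAPH_KMER_SIZE=31\nUSE_LINKING_MATES=1\nNUM_THREADS = 8\nJF_SIZE=100000000'
)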

    def _replaceSectionText(self, orig_txt, begin_patn, end_patn, repl_txt):
        """
        replace the section of orig_txt between the lines matching begin_patn and end_patn with repl_txt
        examples of parameters:
            begin_patn1 = "DATA\n"
            begin_patn2 = "PARAMETERS\n"
            end_patn1 = "END\nPARAMETERS\n"
            end_patn2 = "END\n"
            repl_txt1 = ('PE= pe 500 50 /kb/module/work/testReads/small.forward.fq' +
                          ' /kb/module/work/testReads/small.reverse.fq\n')
            repl_txt2 = ('GRAPH_KMER_SIZE=auto\nUSE_LINKING_MATES=1\nLIMIT_JUMP_COVERAGE = 60\n' +
                          'CA_PARAMETERS = cgwErrorRate=0.15\nNUM_THREADS= 64\nJF_SIZE=100000000\n' +
                          'DO_HOMOPOLYMER_TRIM=0\n')
        """
        if repl_txt != '':
            # create regular expression pattern
            repl = re.compile(begin_patn + '.*?' + end_patn, re.DOTALL)
            repl_txt = begin_patn + repl_txt + '\n' + end_patn
            # replace the text between begin_patn and end_patn with repl_txt
            txt_replaced = repl.sub(repl_txt, orig_txt)
            # pprint(txt_replaced)
            return txt_replaced
        else:
            return orig_txt
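
A self-contained sketch of the same regex-based section swap on a toy config template (the template text and replacement line below are made up):

import re

orig_txt = 'DATA\nPE= old 500 50 old.fq\nEND\nPARAMETERS\nNUM_THREADS = 4\nEND\n'
begin_patn = 'DATA\n'
end_patn = 'END\nPARAMETERS\n'
repl_txt = 'PE= p1 500 50 /kb/module/work/small.forward.fq'

# non-greedy match of everything between the begin and end markers
pattern = re.compile(begin_patn + '.*?' + end_patn, re.DOTALL)
print(pattern.sub(begin_patn + repl_txt + '\n' + end_patn, orig_txt))
# DATA
# PE= p1 500 50 /kb/module/work/small.forward.fq
# END
# PARAMETERS
# NUM_THREADS = 4
# END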

    def _unique_prefix_check(self, pfix, refs):
        prefix_lookup = {}
        for ref in refs:
            pre = ref[pfix][0:2]
            if pre not in prefix_lookup:
                prefix_lookup[pre] = 1
            else:
                raise ValueError('The first two characters in \'' + ref[pfix] +
                                 '\' have already been used by another library prefix.')

    def _get_pereads_info(self, input_params):
        """
        _get_pereads_info--from a list of paired_readsParams structures, fetches the
        corresponding reads info using each paired_readsParams['pe_id'] and
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'pe_prefix': the two-letter prefix for the reads library,
                'pe_mean': the average reads length for the reads library,
                'pe_stdev': the standard deviation for the reads library,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # reads_libraries grouped params
        if rds_params.get(self.PARAM_IN_READS_LIBS, None):
            pe_reads_libs = rds_params[self.PARAM_IN_READS_LIBS]

            for pe_lib in pe_reads_libs:
                if pe_lib.get('pe_id', None):
                    rds_refs.append(pe_lib['pe_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for pe_lib in pe_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'pe_id' in pe_lib and pe_lib['pe_id'] == rds[
                            'reads_ref']:
                        if pe_lib.get('pe_prefix', None):
                            rds['pe_prefix'] = pe_lib['pe_prefix'][0]
                        else:
                            rds['pe_prefix'] = 'p'
                        rds['pe_prefix'] += str(i)
                        pe_lib['pe_prefix'] = rds['pe_prefix']

                        if pe_lib.get('pe_mean', None) is None:
                            pe_lib['pe_mean'] = 500
                        rds['pe_mean'] = pe_lib['pe_mean']

                        if pe_lib.get('pe_stdev', None) is None:
                            pe_lib['pe_stdev'] = 50
                        rds['pe_stdev'] = pe_lib['pe_stdev']

            self._unique_prefix_check('pe_prefix', pe_reads_libs)
        else:
            raise ValueError("Parameter {} is required.".format(
                self.PARAM_IN_READS_LIBS))
        return rds_data

    def _get_jpreads_info(self, input_params):
        """
        _get_jpreads_info--from a list of jump_readsParams structures, fetches the corresponding
        reads info using each jump_readsParams['jp_id'] and
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'jp_prefix': the two-letter prefix for the reads library,
                'jp_mean': the average reads length for the reads library,
                'jp_stdev': the standard deviation for the reads library,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # jump_libraries grouped params
        if rds_params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_libs = rds_params[self.PARAM_IN_JUMP_LIBS]
            for jp_lib in jp_reads_libs:
                if jp_lib.get('jp_id', None):
                    rds_refs.append(jp_lib['jp_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for jp_lib in jp_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'jp_id' in jp_lib and jp_lib['jp_id'] == rds[
                            'reads_ref']:
                        if jp_lib.get('jp_prefix', None):
                            rds['jp_prefix'] = jp_lib['jp_prefix'][0]
                        else:
                            rds['jp_prefix'] = 's'
                        rds['jp_prefix'] += str(i)
                        jp_lib['jp_prefix'] = rds['jp_prefix']

                        if jp_lib.get('jp_mean', None) is None:
                            jp_lib['jp_mean'] = 3600
                        rds['jp_mean'] = jp_lib['jp_mean']

                        if jp_lib.get('jp_stdev', None) is None:
                            jp_lib['jp_stdev'] = 200
                        rds['jp_stdev'] = jp_lib['jp_stdev']

            self._unique_prefix_check('jp_prefix', jp_reads_libs)
        return rds_data

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding
        reads info as deinterleaved fastq files and returns a list of reads data in
        the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file, #only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})

        if not obj_ids:
            return []

        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false'
            })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None) is not None:
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        mkdir_p(output_directory)
        masurca_output = os.path.join(output_directory, 'masurca_output.zip')
        self._zip_folder(out_dir, masurca_output)

        output_files.append({
            'path':
            masurca_output,
            'name':
            os.path.basename(masurca_output),
            'label':
            os.path.basename(masurca_output),
            'description':
            'Output file(s) generated by MaSuRCA'
        })

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders could be included in the archive as well
        if the commented portion is used.
        """
        with zipfile.ZipFile(output_path,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(
                input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if current_line[0] == '>':
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, KeyError, ValueError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        else:
            fasta_dict[contig_id] = sequence_len
        return fasta_dict
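
A standalone sketch of the same contig-length accounting on an in-memory FASTA string (the sequences are made up; the real method reads from a file path):

import re

fasta_text = '>contig_1 sample\nACGTACGT\nACGT\n>contig_2\nGGGCCC\n'
lengths = {}
contig_id = None
seq_len = 0
ws_pattern = re.compile(r'\s+')
for line in fasta_text.splitlines(keepends=True):
    if line.startswith('>'):
        # wrap up the previous record before starting a new one
        if contig_id is not None:
            lengths[contig_id] = seq_len
        contig_id = line[1:].strip().split(' ', 1)[0]
        seq_len = 0
    else:
        seq_len += len(ws_pattern.sub('', line))
if contig_id is not None:
    lengths[contig_id] = seq_len
print(lengths)  # {'contig_1': 12, 'contig_2': 6}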

    def _check_reference(self, ref):
        """
        Tests the given ref string to make sure it conforms to the expected
        object reference format. Returns True if it passes, False otherwise.
        """
        obj_ref_regex = re.compile(
            r"^(?P<wsid>\d+)/(?P<objid>\d+)(/(?P<ver>\d+))?$")
        ref_path = ref.strip().split(";")
        for step in ref_path:
            if not obj_ref_regex.match(step):
                return False
        return True
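
A quick standalone check of the reference-format validation above (the refs are made-up examples):

import re

obj_ref_regex = re.compile(r"^(?P<wsid>\d+)/(?P<objid>\d+)(/(?P<ver>\d+))?$")

def check_reference(ref):
    # every step of a ';'-separated reference path must be numeric ids
    return all(obj_ref_regex.match(step) for step in ref.strip().split(";"))

print(check_reference('123/4/5'))        # True
print(check_reference('123/4;678/9/2'))  # True  (a reference path)
print(check_reference('my_ws/my_obj'))   # False (names, not numeric ids)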

    def _check_ref_type(self, ref, allowed_types):
        """
        Validates the object type of ref against the list of allowed types. If it passes, this
        returns True, otherwise False.
        Really, all this does is verify that at least one of the strings in allowed_types is
        a substring of the ref object type name.
        Ex1:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "KBaseFile.Assembly"]
        returns False
        Ex2:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "genome"]
        returns True
        """
        obj_type = self._get_object_type(ref).lower()
        for t in allowed_types:
            if t.lower() in obj_type:
                return True
        return False
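
The docstring's Ex2 reduces to a simple substring test, as this short sketch shows:

obj_type = 'KBaseGenomes.Genome-4.0'.lower()
allowed_types = ['assembly', 'genome']
print(any(t.lower() in obj_type for t in allowed_types))  # True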

    def _get_object_type(self, ref):
        """
        Fetches and returns the typed object name of ref from the given workspace url.
        If that object doesn't exist, or there's another Workspace error, this raises a
        RuntimeError exception.
        """
        info = self.ws_client.get_object_info3({'objects': [{'ref': ref}]})
        obj_info = info.get('infos', [[]])[0]
        if len(obj_info) == 0:
            raise RuntimeError(
                "An error occurred while fetching type info from the Workspace. "
                "No information returned for reference {}".format(ref))
        return obj_info[2]

    def _get_fasta_from_assembly(self, assembly_ref):
        """
        From an assembly or contigset reference, this uses AssemblyUtil to build a FASTA file
        and returns the AssemblyUtil result, which includes the path to it.
        """
        allowed_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        if not self._check_ref_type(assembly_ref, allowed_types):
            raise ValueError(
                "The reference {} cannot be used to fetch a FASTA file".format(
                    assembly_ref))
        au = AssemblyUtil(self.callback_url)
        return au.get_assembly_as_fasta({'ref': assembly_ref})

    def generate_report(self, contig_file_name, params, out_dir, wsname):
        """
        generate_report: reporting results
        """
        log('Generating and saving report')

        contig_file_with_path = os.path.join(out_dir, contig_file_name)
        fasta_stats = self._load_stats(contig_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = params[self.PARAM_IN_WS] + '/' + params[
            self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'MaSuRCA results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n'
        for c in range(bins):
            report_text += ('   ' + str(counts[c]) + '\t--\t' + str(edges[c]) +
                            ' to ' + str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST({
            'files': [{
                'path': contig_file_with_path,
                'label': params[self.PARAM_IN_CS_NAME]
            }]
        })

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report({
            'message':
            report_text,
            'objects_created': [{
                'ref': assembly_ref,
                'description': 'Assembled contigs'
            }],
            'direct_html_link_index':
            0,
            'file_links':
            output_files,
            'html_links': [{
                'shock_id': quastret['shock_id'],
                'name': 'report.html',
                'label': 'QUAST report'
            }],
            'report_object_name':
            'kb_masurca_report_' + str(uuid.uuid4()),
            'workspace_name':
            params[self.PARAM_IN_WS]
        })
        report_name = report_output['name']
        report_ref = report_output['ref']
        return report_name, report_ref

    def validate_params(self, params):
        """
        validate_params: checks params passed to the run_masurca_app method and sets default values
        """
        # log('Start validating run_masurca_app parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is mandatory')
        if self.PARAM_IN_THREADN not in params:
            raise ValueError(self.PARAM_IN_THREADN + ' parameter is mandatory')

        if params.get(self.PARAM_IN_JF_SIZE, None) is None:
            raise ValueError(self.PARAM_IN_JF_SIZE + ' parameter is mandatory')
        if params.get(self.PARAM_IN_READS_LIBS, None) is None:
            raise ValueError(self.PARAM_IN_READS_LIBS +
                             ' parameter is mandatory')
        if type(params[self.PARAM_IN_READS_LIBS]) != list:
            raise ValueError(self.PARAM_IN_READS_LIBS + ' must be a list')

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(
                self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if 'dna_source' in params:
            dna_src = params.get('dna_source')
            if dna_src == 'bacteria':
                params['limit_jump_coverage'] = 60
                params['cgwErrorRate'] = 0.25
            else:
                params['limit_jump_coverage'] = 300
                params['cgwErrorRate'] = 0.15

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params

    def construct_masurca_assembler_cfg(self, params):
        # STEP 1: get the working folder housing the config.txt file and the masurca results
        wsname = params[self.PARAM_IN_WS]
        config_file_path = os.path.join(self.proj_dir, 'config.txt')

        # STEP 2.1: retrieve the reads data from input parameter
        pe_reads_data = self._get_pereads_info(params)
        jp_reads_data = []
        if params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_data = self._get_jpreads_info(params)
            if 'jp_mean' not in params or type(params['jp_mean']) != int:
                params['jp_mean'] = 3600
            if 'jp_stdev' not in params or type(params['jp_stdev']) != int:
                params['jp_stdev'] = 200

        # STEP 2.2: PACBIO reads must be in a single FASTA file and supplied as PACBIO=reads.fa;
        assbl_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        reads_types = [
            'KBaseAssembly.SingleEndLibrary', 'KBaseFile.SingleEndLibrary',
            'KBaseAssembly.PairedEndLibrary', 'KBaseFile.PairedEndLibrary'
        ]
        pb_reads_file = ''
        if params.get('pacbio_reads', None):
            pb_ref = params['pacbio_reads']
            if self._check_ref_type(pb_ref, assbl_types):
                pb_reads_file = (self._get_fasta_from_assembly(pb_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(pb_ref, reads_types):
                    pb_rd = self._get_kbreads_info(wsname, [pb_ref])
                    pb_reads_file = pb_rd[0]['fwd_file']
                    if pb_rd[0].get('rev_file', None):
                        pb_reads_file += ' ' + pb_rd[0]['rev_file']

        # STEP 2.3: NANOPORE reads must be in a single FASTA/FASTQ file and supplied
        # as NANOPORE=reads.fa
        np_reads_file = ''
        if params.get('nanopore_reads', None):
            np_ref = params['nanopore_reads']
            if self._check_ref_type(np_ref, assbl_types):
                np_reads_file = (self._get_fasta_from_assembly(np_ref)).get(
                    'path', '')
            else:
                if self._check_ref_type(np_ref, reads_types):
                    np_rd = self._get_kbreads_info(wsname, [np_ref])
                    np_reads_file = np_rd[0]['fwd_file']
                    if np_rd[0].get('rev_file', None):
                        np_reads_file += ' ' + np_rd[0]['rev_file']

        # STEP 2.4: any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
        # converted into Celera Assembler compatible .frg files
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        other_frg = ''
        if params.get('other_frg_file', None):
            other_frg = params['other_frg_file']

        # STEP 3: construct and save the config.txt file for running masurca
        try:
            # STEP 3.1: replace the 'DATA...END' portion of the config_template.txt file
            data_str = self._get_data_portion(pe_reads_data, jp_reads_data,
                                              pb_reads_file, np_reads_file,
                                              other_frg)
            if data_str == '':  # no reads libraries are specified, no further actions
                return ''

            config_template = ''
            with codecs.open(os.path.join(os.path.dirname(__file__),
                                          'config_template.txt'),
                             mode='r',
                             encoding='utf-8') as config_template_file:
                config_template = config_template_file.read()

            begin_patn1 = "DATA\n"
            end_patn1 = "END\nPARAMETERS\n"
            config_with_data = self._replaceSectionText(
                config_template, begin_patn1, end_patn1, data_str)
            # log("\n***After DATA section replacement:\n{}\nSaved at {}".format(
            #             config_with_data.encode('utf-8').decode('utf-8'), config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(config_with_data)

            # STEP 3.2: replace the 'PARAMETERS...END' portion of the config_file file saved above
            param_str = self._get_parameters_portion(params)
            if param_str == '':  # no parameters are specified, no further actions
                return ''

            previous_config = ''
            with codecs.open(config_file_path, mode='r',
                             encoding='utf-8') as previous_config_file:
                previous_config = previous_config_file.read()

            begin_patn2 = "PARAMETERS\n"
            end_patn2 = "END\n"
            final_config = self._replaceSectionText(previous_config,
                                                    begin_patn2, end_patn2,
                                                    param_str)
            log("\n***Configuration file content:\n{}\nSaved at {}".format(
                final_config.encode('utf-8').decode('utf-8'),
                config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(final_config)
        except IOError as ioerr:
            log('Creation of the config.txt file raised error:\n')
            pprint(ioerr)
            return ''
        else:
            return config_file_path

    def generate_assemble_script(self, config_file):
        if os.path.isfile(config_file):
            f_dir, f_nm = os.path.split(config_file)
            m_cmd = [self.MaSuRCA_BIN]
            m_cmd.append(config_file)
            try:
                self.prog_runner.run(m_cmd, f_dir)
                assemble_file = os.path.join(f_dir, 'assemble.sh')
                log('Created the assemble.sh file at {}.\n'.format(
                    assemble_file))
                return assemble_file
            except ValueError as ve:
                log('Error generating assemble.sh file: \n{}'.format(ve))
                raise ValueError('Failed to generate assemble.sh file!')
        else:
            log("The config file {} is not found.\n".format(config_file))
            log('NO assemble.sh file created.\n')
        return ''

    def run_assemble(self, asmbl_file):
        exit_code = 1
        if os.path.isfile(asmbl_file):
            log("The assemble.sh file exists at {}\n".format(asmbl_file))
            f_dir, f_nm = os.path.split(asmbl_file)
            a_cmd = ['/bin/bash']
            a_cmd.append(asmbl_file)
            log("The working directory is {}\n".format(f_dir))
            log("The assembling command is {}\n".format(' '.join(a_cmd)))
            try:
                exit_code = self.prog_runner.run(a_cmd, f_dir)
            except ValueError as ve:
                log('Error running assemble: \n{}'.format(ve))
        else:
            log("The assemble.sh file {} is not found.".format(asmbl_file))
        return exit_code

    def save_assembly(self, contig_fa, wsname, a_name):
        if os.path.isfile(contig_fa):
            log('Uploading FASTA file to Assembly...')
            self.au.save_assembly_from_fasta({
                'file': {
                    'path': contig_fa
                },
                'workspace_name': wsname,
                'assembly_name': a_name
            })
        else:
            log("The contig file {} is not found.".format(contig_fa))
Ejemplo n.º 7
0
    def download_long(self, console, warnings, token, wsname, lib,
                      min_long_read_length):
        try:
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
            lib_obj_info = wsClient.get_object_info_new({'objects':
                                                         [obj_id]})[0]
            lib_obj_type = lib_obj_info[TYPE_I]
            lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                  lib_obj_type)  # remove trailing version
            lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
            if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
                # download using assembly util / data file util
                self.log(console,
                         "Getting long reads (from contigs object).\n")
                auClient = AssemblyUtil(url=self.callbackURL, token=token)
                dfuClient = DataFileUtil(url=self.callbackURL, token=token)
                contigFile = auClient.get_assembly_as_fasta({
                    'ref': lib_ref
                }).get('path')
                long_reads_path = dfuClient.unpack_file(
                    {'file_path': contigFile})['file_path']
                self.log(
                    warnings,
                    "Warning:  Long reads are in FASTA format, so short read check was not performed."
                )

            else:
                ruClient = ReadsUtils(url=self.callbackURL, token=token)
                self.log(console,
                         "Getting long reads (from reads library object).\n")
                result = ruClient.download_reads({
                    'read_libraries': [lib_ref],
                    'interleaved': 'false'
                })
                long_reads_path = result['files'][lib_ref]['files']['fwd']
                [n_reads, n_reads_short
                 ] = self.filter_short_fastq(console, long_reads_path,
                                             min_long_read_length)
                if (n_reads_short > 0):
                    self.log(
                        warnings, "Warning:  Of " + str(n_reads) +
                        " long reads, " + str(n_reads_short) +
                        " are shorter than " + str(min_long_read_length) +
                        "; consider using the filtlong app to filter out shorter reads."
                    )

        except Exception as e:
            raise ValueError('Unable to download long reads\n' + str(e))
        return long_reads_path
Ejemplo n.º 8
0
    def download_short_unpaired(self, console, token, wsname,
                                short_unpaired_libraries):
        try:
            self.log(console, "Getting short unpaired reads.\n")
            ruClient = ReadsUtils(url=self.callbackURL, token=token)

            # first, unpack any ReadsSets into the actual SingleEndLibrary references
            reads_refs = []
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple
            for lib in short_unpaired_libraries:
                try:
                    obj_id = {
                        'ref': lib if '/' in lib else (wsname + '/' + lib)
                    }
                    lib_obj_info = wsClient.get_object_info_new(
                        {'objects': [obj_id]})[0]
                    lib_obj_type = lib_obj_info[TYPE_I]
                    # remove trailing version
                    lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)
                    lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                        str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
                    if lib_obj_type == 'KBaseSets.ReadsSet':
                        # unpack it
                        try:
                            setAPIClient = SetAPI(url=self.serviceWizardURL,
                                                  token=token)
                            self.log(console, 'getting reads set ' + lib_ref)
                            readsSet = setAPIClient.get_reads_set_v1({
                                'ref':
                                lib_ref,
                                'include_item_info':
                                1
                            })
                        except Exception as e:
                            raise ValueError(
                                'SetAPI FAILURE: Unable to get read library set object: ('
                                + lib_ref + ')\n' + str(e))
                        for readsLibrary in readsSet['data']['items']:
                            reads_refs.append(readsLibrary['ref'])
                    else:
                        # use other reads objects "as is"
                        reads_refs.append(lib_ref)
                except Exception as e:
                    raise ValueError('Unable to get read library object: (' +
                                     str(lib) + ')' + str(e))

            result = ruClient.download_reads({
                'read_libraries': reads_refs,
                'interleaved': 'false'
            })
            # combine outputs
            short_unpaired_path = os.path.join(
                self.scratch, "short_unpaired_" + str(uuid.uuid4()) + ".fastq")

            self.log(console, "Combining short unpaired reads.\n")

            for reads_ref in reads_refs:
                files = result['files'][reads_ref]['files']

                if 'fwd' in files:
                    path = files['fwd']
                    if path.endswith('.gz'):
                        cmd = 'gzip -dc ' + path + ' >> ' + short_unpaired_path
                    else:
                        cmd = 'cat ' + path + ' >> ' + short_unpaired_path
                    self.log(console, "command: " + cmd)
                    cmdProcess = subprocess.Popen(cmd,
                                                  stdout=subprocess.PIPE,
                                                  stderr=subprocess.STDOUT,
                                                  shell=True)
                    cmdProcess.wait()
                    if cmdProcess.returncode != 0:
                        raise ValueError('Error running ' + cmd)
                    os.remove(path)
                else:
                    raise ValueError('Reads library ' + reads_ref +
                                     ' is missing a forward reads file')

        except Exception as e:
            raise ValueError('Unable to download short unpaired reads\n' +
                             str(e))
        return short_unpaired_path
Ejemplo n.º 9
0
    def run_SPAdes(self, ctx, params):
        """
        Run SPAdes on paired end libraries
        :param params: instance of type "SPAdesParams" (Input parameters for
           running SPAdes. workspace_name - the name of the workspace from
           which to take input and store output. output_contigset_name - the
           name of the output contigset read_libraries - a list of Illumina
           PairedEndLibrary files in FASTQ or BAM format. dna_source -
           (optional) the source of the DNA used for sequencing
           'single_cell': DNA amplified from a single cell via MDA anything
           else: Standard DNA sample from multiple cells. Default value is
           None. min_contig_length - (optional) integer to filter out contigs
           with length < min_contig_length from the SPAdes output. Default
           value is 0 implying no filter. kmer_sizes - (optional) K-mer
           sizes, Default values: 33, 55, 77, 99, 127 (all values must be
           odd, less than 128 and listed in ascending order) In the absence
           of these values, K values are automatically selected.
           skip_error_correction - (optional) Assembly only (No error
           correction). By default this is disabled.) -> structure: parameter
           "workspace_name" of String, parameter "output_contigset_name" of
           String, parameter "read_libraries" of list of type
           "paired_end_lib" (The workspace object name of a PairedEndLibrary
           file, whether of the KBaseAssembly or KBaseFile type.), parameter
           "dna_source" of String, parameter "min_contig_length" of Long,
           parameter "kmer_sizes" of list of Long, parameter
           "skip_error_correction" of type "bool" (A boolean. 0 = false,
           anything else = true.)
        :returns: instance of type "SPAdesOutput" (Output parameters for
           SPAdes run. report_name - the name of the KBaseReport.Report
           workspace object. report_ref - the workspace reference of the
           report.) -> structure: parameter "report_name" of String,
           parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_SPAdes
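        # A minimal sketch of the params dict this method expects; field names
        # follow the docstring above, and the values are illustrative assumptions:
        #
        #   params = {
        #       'workspace_name': 'my_workspace',         # hypothetical workspace
        #       'output_contigset_name': 'my_assembly',   # hypothetical output name
        #       'read_libraries': ['my_pe_library'],      # PairedEndLibrary name(s) or refs
        #       'dna_source': None,                       # or 'single_cell' for MDA data
        #       'min_contig_length': 0,                   # 0 = no length filter
        #       'kmer_sizes': [33, 55, 77],               # optional; odd values < 128, ascending
        #       'skip_error_correction': 0                # 1 = assembly only
        #   }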

        # A whole lot of this is adapted or outright copied from
        # https://github.com/msneddon/MEGAHIT
        self.log('Running run_SPAdes with params:\n' + pformat(params))

        token = ctx['token']

        # the reads should really be specified as a list of absolute ws refs
        # but the narrative doesn't do that yet
        self.process_params(params)

        # get absolute refs from ws
        wsname = params[self.PARAM_IN_WS]
        obj_ids = []
        for r in params[self.PARAM_IN_LIB]:
            obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})
        ws = Workspace(self.workspaceURL, token=token)
        ws_info = ws.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
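        # map each input ref to a 'workspace/object_name' label; used below when
        # reporting problems with a particular library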
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        readcli = ReadsUtils(self.callbackURL, token=ctx['token'])

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = readcli.download_reads({'read_libraries': reads_params,
                                            'interleaved': 'false',
                                            'gzipped': None
                                            })['files']
        except ServerError as se:
            self.log('logging stacktrace from dynamic client error')
            self.log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        self.log('Got reads data from converter:\n' + pformat(reads))

        phred_type = self.check_reads(params, reads, reftoname)

        reads_data = []
        for ref in reads:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
#            print ("REF:" + str(ref))
#            print ("READS REF:" + str(reads[ref]))
            seq_tech = reads[ref]["sequencing_tech"]
            if f['type'] == 'interleaved':
                reads_data.append({'fwd_file': f['fwd'], 'type': 'paired',
                                   'seq_tech': seq_tech})
            elif f['type'] == 'paired':
                reads_data.append({'fwd_file': f['fwd'], 'rev_file': f['rev'],
                                   'type': 'paired', 'seq_tech': seq_tech})
            elif f['type'] == 'single':
                reads_data.append({'fwd_file': f['fwd'], 'type': 'single',
                                   'seq_tech': seq_tech})
            else:
                raise ValueError('Something is very wrong with read lib ' + reads_name)

        kmer_sizes = None
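        # when provided, the list is joined into a comma-separated string,
        # e.g. [21, 33, 55] -> '21,33,55'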
        if self.PARAM_IN_KMER_SIZES in params and params[self.PARAM_IN_KMER_SIZES] is not None:
            if (len(params[self.PARAM_IN_KMER_SIZES])) > 0:
                kmer_sizes = ",".join(str(num) for num in params[self.PARAM_IN_KMER_SIZES])

        skip_error_correction = 0
        if self.PARAM_IN_SKIP_ERR_CORRECT in params and params[self.PARAM_IN_SKIP_ERR_CORRECT] is not None:
            if params[self.PARAM_IN_SKIP_ERR_CORRECT] == 1:
                skip_error_correction = 1

        spades_out = self.exec_spades(params[self.PARAM_IN_DNA_SOURCE],
                                      reads_data,
                                      phred_type,
                                      kmer_sizes,
                                      skip_error_correction)

        self.log('SPAdes output dir: ' + spades_out)

        # parse the output and save back to KBase
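        # SPAdes writes its final scaffolds to scaffolds.fasta in the output directory;
        # that file (optionally length-filtered) is uploaded as the new Assembly object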
        output_contigs = os.path.join(spades_out, 'scaffolds.fasta')

        self.log('Uploading FASTA file to Assembly')

        assemblyUtil = AssemblyUtil(self.callbackURL, token=ctx['token'], service_ver='release')

        if params.get('min_contig_length', 0) > 0:
            assemblyUtil.save_assembly_from_fasta(
                {'file': {'path': output_contigs},
                 'workspace_name': wsname,
                 'assembly_name': params[self.PARAM_IN_CS_NAME],
                 'min_contig_length': params['min_contig_length']
                 })
            # load report from scaffolds.fasta.filtered.fa
            report_name, report_ref = self.load_report(
                output_contigs + '.filtered.fa', params, wsname)
        else:
            assemblyUtil.save_assembly_from_fasta(
                {'file': {'path': output_contigs},
                 'workspace_name': wsname,
                 'assembly_name': params[self.PARAM_IN_CS_NAME]
                 })
            # load report from scaffolds.fasta
            report_name, report_ref = self.load_report(
                output_contigs, params, wsname)

        output = {'report_name': report_name,
                  'report_ref': report_ref
                  }
        #END run_SPAdes

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_SPAdes return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Ejemplo n.º 10
0
    def export_genome_as_genbank(self, ctx, params):
        """
        :param params: instance of type "ExportParams" (input and output
           structure functions for standard downloaders) -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_as_genbank
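        # Example of the only required field; the ref below is a hypothetical
        # workspace reference of the form ws_id/obj_id/version:
        #
        #   params = {'input_ref': '12345/6/7'}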
        print('export_genome_as_genbank -- parameters = ')
        pprint(params)

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot run export_genome_as_genbank - no "input_ref" field defined.'
            )

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.cfg.workspaceURL)
        info = ws.get_object_info_new({
            'objects': [{'ref': params['input_ref']}],
            'includeMetadata': 0,
            'ignoreErrors': 0
        })[0]

        genome_to_genbank_params = {'genome_ref': params['input_ref']}

        # export to file (building from KBase Genome Object)
        result = self.genome_to_genbank(
            ctx, genome_to_genbank_params)[0]['genbank_file']

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.cfg.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(
            result['file_path'],
            os.path.join(export_package_dir,
                         os.path.basename(result['file_path'])))

        # export original uploaded GenBank file if it existed.
        exporter = GenomeToGenbank(self.cfg)
        original_result_full = exporter.export_original_genbank(
            ctx, genome_to_genbank_params)
        if original_result_full is not None:
            original_result = original_result_full['genbank_file']
            shutil.move(
                original_result['file_path'],
                os.path.join(export_package_dir,
                             os.path.basename(original_result['file_path'])))

        # Write a README describing the contents of the export package.
        warning_filename = "README.txt"
        with open(os.path.join(export_package_dir, warning_filename),
                  'w') as temp_file:
            temp_file.write(
                'This directory includes the KBase-derived GenBank file and also '
                + '(if you originally uploaded the genome from an annotated ' +
                'GenBank file) the original GenBank input.')

        # package it up and be done
        dfUtil = DataFileUtil(self.cfg.callbackURL)
        package_details = dfUtil.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        output = {'shock_id': package_details['shock_id']}

        print('export complete -- result = ')
        pprint(output)
        #END export_genome_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]