Example #1
def load_fastas(config, scratch, upa):
    '''
    Returns a list of (fasta_path, upa) tuples.
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs":[upa]})['data'][0]
    obj_type  = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]

    fasta_paths = []
    for genome_upa in upas:
        if upa != genome_upa:
            genome_upa = upa + ';' + genome_upa
        genome_data = ws.get_objects2({'objects': [{"ref": genome_upa}]})['data'][0]['data']
        target_upa = genome_data.get('contigset_ref') or genome_data.get('assembly_ref')
        assembly_upa = genome_upa + ';' + target_upa
        faf = au.get_assembly_as_fasta({"ref":assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
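
A minimal usage sketch for the function above. The callback URL, workspace URL, scratch directory, and UPA are illustrative placeholders, not values from the original snippet; in a real KBase SDK app they come from the job runtime.

# Hypothetical wiring -- placeholder values only.
config = {
    'callback_url': 'http://localhost:9999',           # placeholder
    'workspace-url': 'https://kbase.us/services/ws',   # placeholder
}
scratch = '/kb/module/work/tmp'
upa = '12345/6/7'  # placeholder workspace reference

for fasta_path, ref in load_fastas(config, scratch, upa):
    print(ref, '->', fasta_path)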
Example #2
def load_fastas(config, scratch: str, upa: str):
    '''
    Returns list of (fasta_path, upa)
    '''
    dfu = DataFileUtil(config['callback_url'])
    au = AssemblyUtil(config['callback_url'])
    mgu = MetagenomeUtils(config['callback_url'])
    ws = Workspace(config['workspace-url'])

    obj_data = dfu.get_objects({"object_refs": [upa]})['data'][0]
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        upas = [gsi['ref'] for gsi in obj_data['data']['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        upas = [gse['ref'] for gse in obj_data['data']['elements'].values()]
    elif "KBaseGenomes.Genome" in obj_type:
        upas = [upa]
    elif "KBaseGenomes.ContigSet" in obj_type or "KBaseGenomeAnnotations.Assembly" in obj_type:
        # in this case we use the assembly file util to get the fasta file
        faf = au.get_assembly_as_fasta({"ref": upa})
        return [(faf['path'], upa)]
    elif "KBaseSets.AssemblySet" in obj_type:
        fasta_paths = []
        for item_upa in obj_data['data']['items']:
            faf = au.get_assembly_as_fasta({"ref": item_upa['ref']})
            fasta_paths.append((faf['path'], item_upa['ref']))
        return fasta_paths
    elif 'KBaseMetagenomes.BinnedContigs' in obj_type:
        fasta_paths = []
        bin_file_dir = mgu.binned_contigs_to_file({
            'input_ref': upa,
            'save_to_shock': 0
        })['bin_file_directory']
        for (dirpath, dirnames, filenames) in os.walk(bin_file_dir):
            for fasta_file in filenames:
                fasta_path = os.path.join(scratch, fasta_file)
                fasta_path = os.path.splitext(fasta_path)[0] + ".fa"
                copyfile(os.path.join(bin_file_dir, fasta_file), fasta_path)
                # Should I verify that the bins have contigs?
                # is it possible to have empty bins?
                fasta_paths.append((fasta_path, upa))
            break
        return fasta_paths
    else:
        raise ValueError('Input genome/metagenome reference has unhandled type: ' + obj_type)

    fasta_paths = []
    for genome_upa in upas:
        genome_data = ws.get_objects2({'objects': [{
            "ref": genome_upa
        }]})['data'][0]['data']
        assembly_upa = genome_upa + ';' + str(
            genome_data.get('contigset_ref')
            or genome_data.get('assembly_ref'))
        faf = au.get_assembly_as_fasta({'ref': assembly_upa})
        fasta_paths.append((faf['path'], assembly_upa))

    return fasta_paths
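
Both examples build ';'-separated reference paths (genome_upa + ';' + assembly_ref) so the Workspace can resolve the assembly through the object that references it, even when the caller only has direct access to the top-level object. A tiny helper sketch of the pattern:

def chain_refs(*upas):
    """Join workspace references into a reference path,
    e.g. chain_refs('1/2/3', '4/5/6') -> '1/2/3;4/5/6'."""
    return ';'.join(upas)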
Example #3
class DownloadUtils:
    def __init__(self, callbackURL):
        self.callbackURL = callbackURL
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)
        self.gfu = GenomeFileUtil(self.callbackURL)

    def download_genome(self, genomeref, output_dir):
        '''
        This function downloads the genome as a FASTA file.
        :param genomeref: workspace reference to the genome/assembly
        :param output_dir: directory to write the file to
        :return: dict with the downloaded file info
        '''

        file = self.au.get_assembly_as_fasta({
            'ref': genomeref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file

    def get_variation(self, variation_ref):
        '''
        This function downloads variations as a VCF file.
        :param variation_ref: workspace reference to the variation object
        :return: path to the VCF file
        '''

        filepath = self.vu.get_variation_as_vcf(
            {'variation_ref': variation_ref})['path']
        return filepath

    def get_gff(self, genome_ref):
        '''
        :param genome_ref:
        :return: gff file path
        '''

        file = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        :param assembly_ref:
        :param output_dir:
        :return: assembly file path
        '''

        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file['path']
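
A hedged usage sketch for the class above; the references and output directory are placeholders, and SDK_CALLBACK_URL is assumed to be set by the KBase SDK runtime.

import os

utils = DownloadUtils(os.environ['SDK_CALLBACK_URL'])
genome_file = utils.download_genome('1/2/3', '/kb/module/work/tmp')  # placeholder ref
vcf_path = utils.get_variation('4/5/6')                              # placeholder ref
gff_path = utils.get_gff('1/2/3')                                    # placeholder ref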
Example #4
    def _build_index(self, assembly_info, validated_params):
        # get the assembly as a fasta file using AssemblyUtil
        au = AssemblyUtil(self.callback_url)
        fasta_info = au.get_assembly_as_fasta({'ref': assembly_info['ref']})

        # make the target destination folder (check again it wasn't created yet)
        if os.path.exists(validated_params['output_dir']):
            raise ValueError('Output directory name specified (' + validated_params['output_dir'] +
                             ') already exists. Will not overwrite, so aborting.')
        os.makedirs(validated_params['output_dir'])

        # configure the command line args and run it
        cli_params = self._build_cli_params(fasta_info['path'], fasta_info['assembly_name'], validated_params)
        self.bwa.run('index', cli_params)
        for file in glob.glob(r'/kb/module/work/tmp/' + fasta_info['assembly_name'] + '.*'):
            print(file)
            shutil.copy(file, validated_params['output_dir'])

        index_info = {'output_dir': validated_params['output_dir'],
                      'index_files_basename': fasta_info['assembly_name']}

        # cache the result, mark if it worked or not
        cache_success = self._put_cached_index(assembly_info,
                                               fasta_info['assembly_name'],
                                               validated_params['output_dir'],
                                               validated_params['ws_for_cache'])
        if cache_success:
            index_info['pushed_to_cache'] = 1
        else:
            index_info['pushed_to_cache'] = 0

        return index_info
    def BuildFastaFromSequenceSet(self, ctx, params):
        """
        :param params: instance of type "BuildSeqIn" -> structure: parameter
           "workspace_name" of String, parameter "SequenceSetRef" of String,
           parameter "fasta_outpath" of String
        :returns: instance of type "BuildSeqOut" -> structure: parameter
           "fasta_outpath" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN BuildFastaFromSequenceSet

        dfu = DataFileUtil(self.callback_url)

        bu = BackgroundUtils()
        TU = TestUtils()
        if params.get('TESTFLAG') and params.get('background'):
            targetpath = '/kb/module/work/tmp/testgenome.fa'
            TU.GetGenome(targetpath)
            bu.BuildBackground(targetpath)
        elif params.get('background'):
            ws = Workspace('https://appdev.kbase.us/services/ws')
            subset = ws.get_object_subset([{
                'included': ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
                'ref': params['genome_ref']
            }])
            aref = subset[0]['data']['assembly_ref']
            assembly_ref = {'ref': aref}
            print('Downloading Assembly data as a Fasta file.')
            assemblyUtil = AssemblyUtil(self.callback_url)
            fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)['path']
            bu.BuildBackground(fasta_file)


        get_objects_params = {'object_refs': [params['SequenceSetRef']]}

        SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']
        with open(params['fasta_outpath'], 'w') as outFile:
            for s in SeqSet['sequences']:
                outFile.write('>' + s['sequence_id'] + '\n')
                outFile.write(s['sequence'] + '\n')

        fu = FastaUtils()
        if params.get('mask_repeats'):
            fu.RemoveRepeats(params['fasta_outpath'], params['fasta_outpath'])

        output = {'fasta_outpath': params['fasta_outpath']}
        #END BuildFastaFromSequenceSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
class DownloadFastqUtils:
    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.ru = ReadsUtils(self.callbackURL)

    def _stage_input_file(self, ref, reads_type):

        if reads_type in ('KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary'):
            input_file_info = self.ru.download_reads({
                'read_libraries': [ref],
                'interleaved': 'true'
            })['files'][ref]
        elif reads_type in ('KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary'):
            input_file_info = self.ru.download_reads({'read_libraries':
                                                      [ref]})['files'][ref]
        else:
            raise ValueError("Can't download_reads() for object type: '" +
                             str(reads_type) + "'")
        input_file_info['input_ref'] = ref
        # input_file_info['files']['fwd'] holds the forward (or interleaved) file path;
        # input_file_info['files']['type'] == 'interleaved' marks interleaved reads

        return input_file_info

    def download_genome(self, genomeref):
        file = self.au.get_assembly_as_fasta({'ref': genomeref})
        return file
Example #7
    def _get_assembly(self, genome):
        if 'assembly_ref' in genome:
            assembly_ref = genome['assembly_ref']
        else:
            assembly_ref = genome['contigset_ref']
        log('Assembly reference = ' + assembly_ref)
        log('Downloading assembly')
        dfu = DataFileUtil(self.cfg.callbackURL)
        log('object_refs:' + self.genome_ref + ";" + assembly_ref)
        assembly_data = dfu.get_objects(
            {'object_refs':
             [self.genome_ref + ";" + assembly_ref]})['data'][0]['data']
        if isinstance(assembly_data['contigs'], dict):  # is an assembly
            circular_contigs = {x['contig_id']
                                for x in assembly_data['contigs'].values()
                                if x.get('is_circ')}
        else:  # is a contig set
            circular_contigs = {x['id']
                                for x in assembly_data['contigs']
                                if x.get('replicon_geometry') == 'circular'}
        au = AssemblyUtil(self.cfg.callbackURL)
        assembly_file_path = au.get_assembly_as_fasta(
            {'ref': self.genome_ref + ";" + assembly_ref})['path']
        return assembly_file_path, circular_contigs
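
The isinstance check above works because the two object types store contigs differently: KBaseGenomeAnnotations.Assembly keeps 'contigs' as a mapping of contig id to contig data, while KBaseGenomes.ContigSet keeps it as a list. A minimal illustration of the two shapes (real objects carry many more fields):

assembly_style = {'contigs': {'c1': {'contig_id': 'c1', 'is_circ': 1}}}
contigset_style = {'contigs': [{'id': 'c1', 'replicon_geometry': 'circular'}]}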
Example #8
class DownloadUtils:
    def __init__(self, callback_url):
        self.callbackURL = callback_url
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)

    def download_genome(self, genomeref, output_dir):
        '''
        This function downloads the genome as a FASTA file.
        :param genomeref: workspace reference to the genome/assembly
        :param output_dir: directory to write the file to
        :return: dict with the downloaded file info
        '''

        file = self.au.get_assembly_as_fasta({
          'ref': genomeref,
          'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file

    def download_variations(self, variation_ref, filename):
        '''
        This function downloads variations.
        :param variation_ref:
        :param filename:
        :return:
        '''

        filepath = self.vu.get_variation_as_vcf({
            'variation_ref': variation_ref,
            'filename': filename
        })['path']
        return filepath
    def jayrbolton_contig_filter(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN jayrbolton_contig_filter
        if not params.get('assembly_input_ref'):
            raise TypeError("`assembly_input_ref` is required")
        if not params.get('min_length') or not isinstance(
                params['min_length'], int):
            raise TypeError("`min_length` is required and needs to be an int")
        min_length = params['min_length']
        # Initialize the assembly util client
        assembly_util = AssemblyUtil(self.callback_url)
        # download the fasta file to local disk
        fasta_file = assembly_util.get_assembly_as_fasta(
            {'ref': params['assembly_input_ref']})
        filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
        report_client = KBaseReport(self.callback_url)
        result = contig_filter(fasta_file['path'], filtered_path, min_length)
        assembly_obj = assembly_util.save_assembly_from_fasta({
            'workspace_name': params['workspace_name'],
            'file': {
                'path': filtered_path,
                'assembly_name': 'filtered_contigs'
            },
            'assembly_name': 'filtered_assembly'
        })
        report = report_client.create_extended_report({
            'workspace_name': params['workspace_name'],
            'objects_created': [{
                'ref': assembly_obj,
                'description': 'filtered_assembly'
            }],
            'message': (f"Filtered out {result['n_total'] - result['n_remaining']} "
                        f"records out of {result['n_total']} records.")
        })
        output = {'report_ref': report['ref'], 'report_name': report['name']}
        #END jayrbolton_contig_filter

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method jayrbolton_contig_filter return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
Example #10
    def run_mash_sketch(self, ctx, params):
        """
        Generate a sketch file from a fasta/fastq file
        :param params: instance of type "MashSketchParams" (* * Pass in **one
           of** input_path, assembly_ref, or reads_ref *   input_path -
           string - local file path to an input fasta/fastq *   assembly_ref
           - string - workspace reference to an Assembly type *   reads_ref -
           string - workspace reference to a Reads type * Optionally, pass in
           a boolean indicating whether you are using paired-end reads. *
           paired_ends - boolean - whether you are passing in paired ends) ->
           structure: parameter "input_path" of String, parameter
           "assembly_ref" of String, parameter "reads_ref" of String,
           parameter "paired_ends" of type "boolean" (params:
           input_upa: workspace reference to an assembly object
           workspace_name: name of current workspace search_db: database to
           search n_max_results: number of results to return, integer between
           1 and 100)
        :returns: instance of type "MashSketchResults" (* * Returns the local
           scratch file path of the generated sketch file. * Will have the
           extension '.msh') -> structure: parameter "sketch_path" of String
        """
        # ctx is the context object
        # return variables are: results
        #BEGIN run_mash_sketch
        if 'reads_ref' in params:
            reads_utils = ReadsUtils(self.callbackURL)
            result = reads_utils.download_reads({
                'read_libraries': [params['reads_ref']],
                'interleaved': 'true'
            })
            input_path = result['files'][params['reads_ref']]['files']['fwd']
        elif 'assembly_ref' in params:
            assembly_util = AssemblyUtil(self.callbackURL)
            result = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
            input_path = result['path']
        elif 'input_path' in params:
            input_path = params['input_path']
        else:
            raise ValueError(
                'Invalid params; must provide one of `reads_ref`, `assembly_ref`, or `input_path`.'
            )
        mash_utils = MashUtils(self.config, self.auth_token)
        output_file_path = mash_utils.mash_sketch(input_path, paired_ends=params.get('paired_ends'))
        results = {'sketch_path': output_file_path}
        #END run_mash_sketch

        # At some point might do deeper type checking...
        if not isinstance(results, dict):
            raise ValueError('Method run_mash_sketch return value ' +
                             'results is not type dict as required.')
        # return the results
        return [results]
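
A hedged sketch of the three mutually exclusive parameter shapes the method accepts; all values are placeholders:

params_from_reads = {'reads_ref': '1/2/3', 'paired_ends': True}
params_from_assembly = {'assembly_ref': '4/5/6'}
params_from_file = {'input_path': '/kb/module/work/tmp/input.fna'}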
Example #11
    def download_assembly(self, token, assembly_ref):
        try:
            auClient = AUClient(self.callback_url, token=token, service_ver=self.SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate auClient with callback_url: '+ self.callback_url +' ERROR: ' + str(e))
        try:
            dfuClient = DFUClient(self.callback_url, token=token, service_ver=self.SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate dfuClient with callback_url: '+ self.callback_url +' ERROR: ' + str(e))

        contig_file = auClient.get_assembly_as_fasta({'ref':assembly_ref}).get('path')
        sys.stdout.flush()   # don't remember why this matters
        contig_file_path = dfuClient.unpack_file({'file_path': contig_file})['file_path']
        return contig_file_path
Example #12
def fetch_fasta_from_assembly(assembly_ref, ws_url, callback_url):
    """
    From an assembly or contigset, this uses a data file util to build a FASTA file and return the
    path to it.
    """
    allowed_types = [
        'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
        'KBaseGenomes.ContigSet'
    ]
    if not check_ref_type(assembly_ref, allowed_types, ws_url):
        raise ValueError(
            "The reference {} cannot be used to fetch a FASTA file".format(
                assembly_ref))
    au = AssemblyUtil(callback_url)
    return au.get_assembly_as_fasta({'ref': assembly_ref})
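
check_ref_type is not shown in this example. A minimal sketch of what it might look like, assuming the standard Workspace client and substring matching on the type string (an assumption, not the original implementation):

def check_ref_type(ref, allowed_types, ws_url):
    """Return True if the object behind `ref` has one of the allowed types."""
    ws = Workspace(ws_url)
    obj_type = ws.get_object_info3({'objects': [{'ref': ref}]})['infos'][0][2]
    return any(t in obj_type for t in allowed_types)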
class downloaddatautils:
    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)

    def download_genome(self, params):
        file = self.au.get_assembly_as_fasta(
            {'ref': params['genome_or_assembly_ref']})
        return file

    def download_vcf(self, params):
        params['input_var_ref'] = params['vcf_ref']
        self.vu.export_variation_as_vcf(params)
    def get_assembly_sequence(self, assembly_input_ref):
        # Download the input data as a Fasta
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})
        cf = CreateFasta(self.config)

        string = ''
        for seq_record in SeqIO.parse(fasta_file['path'], 'fasta'):
            string += ">" + seq_record.id + "\n"
            string += cf.splitSequence(str(seq_record.seq))
            string += "\n"

        return string
Example #15
class DownloadAlignmentUtils:
    def __init__(self, callback_url):
        self.callbackURL = callback_url
        self.au = AssemblyUtil(self.callbackURL)

    def downloadreadalignment(self, source_ref, params, callback_url):
        '''
        downloadreadalignment: download alignment file
        :param source_ref:
        :param params:
        :param callback_url:
        :return:
        '''
        self.callback_url = callback_url
        self.ru = ReadsAlignmentUtils(self.callback_url)
        params['source_ref'] = source_ref
        params['downloadSAM'] = 1

        params['destination_dir'] = '/kb/module/work/tmp'
        params['stats'] = {
            "properly_paired": 1,
            "multiple_alignments": 1,
            "singletons": 1,
            "alignment_rate": 1,
            "unmapped_reads": 1,
            "mapped_reads": 1,
            "total_reads": 1
        }
        return self.ru.download_alignment(params)

    def download_genome(self, genomeref, output_dir):
        '''
        download_genome: download the genome as a FASTA file
        :param genomeref: workspace reference to the genome/assembly
        :param output_dir: directory to write the file to
        :return: dict with the downloaded file info
        '''
        file = self.au.get_assembly_as_fasta({
            'ref': genomeref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file
    def test_import_fasta_as_assembly_from_staging(self, download_staging_file,
                                                   update_staging_service):

        fasta_file = 'small_fasta.fna'
        ws_obj_name = 'MyAssembly'

        params = {
            'staging_file_subdir_path': fasta_file,
            'workspace_name': self.getWsName(),
            'assembly_name': ws_obj_name
        }

        ref = self.getImpl().import_fasta_as_assembly_from_staging(
            self.getContext(), params)
        self.assertTrue('obj_ref' in ref[0])
        self.assertTrue('report_ref' in ref[0])
        self.assertTrue('report_name' in ref[0])

        fasta_file_path = os.path.join('/kb/module/work/tmp', fasta_file)
        assemblyUtil = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        fasta_assembly = assemblyUtil.get_assembly_as_fasta(
            {'ref': self.getWsName() + "/{}".format(ws_obj_name)})

        expected_data = None
        with open(fasta_file_path, 'r') as f:
            expected_data = f.read()
        actual_data = None
        with open(fasta_assembly['path'], 'r') as f:
            actual_data = f.read()
        self.assertEqual(actual_data, expected_data)

        get_objects_params = {
            'object_refs': [ref[0].get('obj_ref')],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)
        base_count = object_data.get('data')[0].get('data').get('base_counts')
        dna_size = object_data.get('data')[0].get('data').get('dna_size')

        self.assertEqual(dna_size, 2520)

        expected_base_count = {'A': 700, 'C': 558, 'T': 671, 'G': 591}
        self.assertEqual(base_count, expected_base_count)
Example #17
    def stage_assembly_files(self, object_list):
        """
        stage_assembly_files: download the fasta files to the scratch area and
        return the list of file names
        """
        log('Processing assembly object list: {}'.format(object_list))

        auc = AssemblyUtil(self.callbackURL)
        staged_file_list = []

        for assembly_upa in object_list:
            try:
                filename = auc.get_assembly_as_fasta(
                    {'ref': assembly_upa})['path']
            except ServerError as assembly_error:
                print(str(assembly_error))
                raise

            staged_file_list.append(filename)

        log('Created file list: {}'.format(staged_file_list))
        return staged_file_list
class DownloadUtils:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)

    def get_gff(self, genome_ref, output_dir):
        '''
        Downloads the GFF file for a genome.
        :param genome_ref: workspace reference to the genome
        :param output_dir: base output directory
        :return: path to the GFF file
        '''

        gff_filename = os.path.join(output_dir, "snp_eff/data/kbase_v1",
                                    "gene.gff")
        file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'filename': gff_filename
        })
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        Downloads the assembly as a FASTA file.
        :param assembly_ref: workspace reference to the assembly
        :param output_dir: base output directory
        :return: path to the FASTA file
        '''
        assembly_filename = os.path.join(output_dir, "snp_eff/data/kbase_v1",
                                         "sequences.fa")
        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': assembly_filename
        })
        return file['path']
class ProkkaUtils:
    def __init__(self, config):
        self.scratch = config["scratch"]
        self.ctx = config['ctx']
        self.callback_url = config["SDK_CALLBACK_URL"]

        self.ws_client = workspaceService(config["workspace-url"])
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)

        self.sso_ref = None
        self.sso_event = None
        self.ec_to_sso = {}
        self.output_workspace = None

    @staticmethod
    def _get_input_value(params, key):
        """Get value of key after checking for its existence

        :param params: Params dictionary haystack
        :param key: Key to search in Params
        :return: Parameter Value
        :raises ValueError: raises an exception if the key doesn't exist
        """
        if key not in params:
            raise ValueError("Parameter " + key +
                             " should be set in input parameters")
        return params[key]

    @staticmethod
    def _get_qualifier_value(qualifier):
        """Get first qualifier from the list of qualifiers

        :param qualifier: list contents of the qualifier from BCBio GFF Tools
        :return: first element in the list
        """
        return qualifier[0] if qualifier else None

    def download_seed_data(self):
        """Download Seed Data Ontology, and set the gene_ontology reference (sso_ref) and
        the create a table from ec numbers to sso (ec_to_sso)

        :return: None
        """
        # Download Seed Reference Data
        sso_ret = self.ws_client.get_objects([{
            "ref":
            "KBaseOntology/seed_subsystem_ontology"
        }])[0]
        sso = sso_ret["data"]
        for sso_id in sso["term_hash"]:
            sso_name = sso["term_hash"][sso_id]["name"]
            if "(EC " in sso_name and sso_name.endswith(")"):
                ec = sso_name[sso_name.index("(EC ") + 4:-1].strip()
                sso_list = self.ec_to_sso.get(ec, None)
                if not sso_list:
                    sso_list = []
                    self.ec_to_sso[ec] = sso_list
                sso_list.append(sso["term_hash"][sso_id])
        print("EC found in SSO: " + str(len(self.ec_to_sso)))
        sso_info = sso_ret["info"]
        sso_ref = str(sso_info[6]) + "/" + str(sso_info[0]) + "/" + str(
            sso_info[4])
        with open("/kb/module/work/seed_so.json", "w") as outfile:
            json.dump(sso, outfile, sort_keys=True, indent=4)
        self.sso_ref = sso_ref

    def inspect_assembly(self, assembly_meta, assembly_ref):
        """Check to see if assembly has too many contigs and might not be a metagenome or
        non prokaryotic dataset

        :param assembly_meta: information about the assembly reference
        :param assembly_ref: the assembly reference number
        :return: a tuple containing gc_content and dna_size
        """
        gc_content = float(assembly_meta.get("GC content"))
        dna_size = int(assembly_meta.get("Size"))
        n_contigs = 0
        if "N Contigs" in assembly_meta:
            n_contigs = int(assembly_meta.get("N Contigs"))
        else:
            contig = self.ws_client.get_objects([{"ref": assembly_ref}])[0]
            n_contigs = len(contig["data"]["contigs"])
        if n_contigs >= 30000:
            message = """
             Hmmm.  There are over 30,000 contigs in this Assembly. 
             It looks like you are trying to run Prokka on a metagenome or non-prokaryotic data set. 
             If this is a metagenome data set we recommend using an App like MaxBin to first bin the contigs into genome-like bins. 
             These bins can then be individually annotated as a single genome using Prokka. 
             If this data comes from a Eukaryotic sample, KBase does not currently have an annotation app designed for Eukaryotes. 
             Alternatively, you can try reducing the number of contigs using a filter app.")
             raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions
             """
            print(message)
            #raise ValueError("Too many contigs for Prokka.  See logs for details and suggestions")

        assembly_info = namedtuple("assembly_info", "gc_content dna_size")
        return assembly_info(gc_content, dna_size)

    @staticmethod
    def create_renamed_assembly(assembly_fasta_filepath):
        """Rename records to be in the format of contig_N and output a new fasta file

        :param assembly_fasta_filepath:
        :return: A tuple with The path to the fasta file with renamed contigs the number of contigs,
        the mapping from old ids to new ids, and the contigs as SeqRecords
        """
        records = []
        new_ids_to_old = {}
        contig_counter = 0
        for record in SeqIO.parse(assembly_fasta_filepath, "fasta"):
            contig_counter += 1
            old_id = record.id
            new_id = "contig_" + str(contig_counter)
            sequence = record.seq  # it has type "Seq"
            record = SeqRecord(sequence,
                               id=new_id,
                               description="(" + old_id + ")")
            records.append(record)
            new_ids_to_old[new_id] = old_id

        renamed_assembly_fasta_filepath = assembly_fasta_filepath + "_renamed.fna"
        SeqIO.write(records, renamed_assembly_fasta_filepath, "fasta")

        renamed_assembly = namedtuple(
            "renamed_assembly",
            "filepath contig_counter new_ids_to_old records")
        return renamed_assembly(renamed_assembly_fasta_filepath,
                                contig_counter, new_ids_to_old, records)

    def run_prokka(self, params, subject_fasta_filepath):
        """Run Prokka

        :param params: Prokka parameters
        :param subject_fasta_filepath: The contigs or genes to run prokka against
        :return: The directory with all of the prokka output files
        """
        output_dir = "/kb/module/work/tmp/temp_" + str(uuid.uuid4())

        # --kingdom [X]  Annotation mode: Archaea|Bacteria|Mitochondria|Viruses (default "Bacteria")
        kingdom = "Bacteria"
        if "kingdom" in params and params["kingdom"]:
            kingdom = params["kingdom"]

        prokka_cmd_list = [
            "perl", "/kb/prokka/bin/prokka", "--metagenome", "--outdir",
            output_dir, "--prefix", "mygenome", "--kingdom", kingdom
        ]

        # --genus [X]       Genus name (triggers to use --usegenus)
        if "genus" in params and params["genus"]:
            prokka_cmd_list.extend(
                ["--genus", str(params["genus"]), "--usegenus"])
        # --gcode [N]       Genetic code / Translation table (set if --kingdom is set) (default "0")
        if "gcode" in params and params["gcode"]:
            prokka_cmd_list.extend(["--gcode", str(params["gcode"])])
        else:
            prokka_cmd_list.extend(["--gcode", "0"])
        # --gram [X]        Gram: -/neg +/pos (default "")
        if "gram" in params and params["gram"]:
            raise ValueError(
                "gram parameter is not supported in current Prokka installation"
            )
        # --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
        if "metagenome" in params and params["metagenome"] == 1:
            prokka_cmd_list.append("--metagenome")
        # --rawproduct      Do not clean up /product annotation (default OFF)
        if "rawproduct" in params and params["rawproduct"] == 1:
            prokka_cmd_list.append("--rawproduct")
        # --fast            Fast mode - skip CDS /product searching (default OFF)
        if "fast" in params and params["fast"] == 1:
            prokka_cmd_list.append("--fast")
        # --mincontiglen [N] Minimum contig size [NCBI needs 200] (default "1")
        if "mincontiglen" in params and params["mincontiglen"]:
            prokka_cmd_list.extend(
                ["--mincontiglen",
                 str(params["mincontiglen"])])
        # --evalue [n.n]    Similarity e-value cut-off (default "1e-06")
        if "evalue" in params and params["evalue"]:
            prokka_cmd_list.extend(["--evalue", str(params["evalue"])])
        # --rfam            Enable searching for ncRNAs with Infernal+Rfam (SLOW!) (default "0")
        if "rfam" in params and params["rfam"] == 1:
            prokka_cmd_list.append("--rfam")
        # --norrna          Don"t run rRNA search (default OFF)
        if "norrna" in params and params["norrna"] == 1:
            prokka_cmd_list.append("--norrna")
        # --notrna          Don"t run tRNA search (default OFF)
        if "notrna" in params and params["notrna"] == 1:
            prokka_cmd_list.append("--notrna")
        prokka_cmd_list.append(subject_fasta_filepath)
        print("Prokka command line: " + str(prokka_cmd_list))

        # tbl2asn or some other non-essential prokka binary will fail, so suppress that
        try:
            check_output(prokka_cmd_list, cwd=self.scratch)
        except CalledProcessError as e:
            pprint(e)
        return output_dir

    @staticmethod
    def retrieve_prokka_results(output_dir):
        """ Gather up the relevant prokka results, load the records from the results files

        :param output_dir:
        :return: A tuple containing Sequences from the .faa .ffn files and the gff_filepath
        """
        faa_file = os.path.join(output_dir, "mygenome.faa")
        cds_to_prot = {}
        for record in SeqIO.parse(faa_file, "fasta"):
            cds_to_prot[record.id] = str(record.seq)
        ffn_file = os.path.join(output_dir, "mygenome.ffn")
        cds_to_dna = {}
        for record in SeqIO.parse(ffn_file, "fasta"):
            cds_to_dna[record.id] = str(record.seq)
        gff_file = os.path.join(output_dir, "mygenome.gff")
        if not os.path.isfile(gff_file):
            raise ValueError("PROKKA output GFF file is not found")

        prokka_results = namedtuple("prokka_results",
                                    "cds_to_prot cds_to_dna gff_filepath")
        return prokka_results(cds_to_prot, cds_to_dna, gff_file)

    def parse_prokka_results(self, **prokka_parse_parameters):
        """ Go through the prokka results from the input contigs and then
        create the features, mrnas and cdss components of the KbaseGenome.Genome object for
        genome annotation only.

        :param prokka_parse_parameters: gff_filepath, mappings
        :return: A tuple with Genome:features Genome:cdss  Genome:mrnas report_message of genes discovered
        """
        gff_filepath = prokka_parse_parameters["gff_filepath"]
        cds_to_dna = prokka_parse_parameters["cds_to_dna"]
        cds_to_prot = prokka_parse_parameters["cds_to_prot"]
        new_ids_to_old = prokka_parse_parameters["new_ids_to_old"]

        evidence = self.make_annotation_evidence()

        cdss = []
        mrnas = []
        features = []
        non_hypothetical = 0
        genes_with_ec = 0
        genes_with_sso = 0
        prot_lengths = []
        with open(gff_filepath, "r") as f1:
            for rec in GFF.parse(f1):
                contig_id = new_ids_to_old[str(rec.id)]
                for ft in rec.features:
                    loc = ft.location
                    min_pos = int(loc.start) + 1
                    max_pos = int(loc.end)
                    strand = "+" if loc.strand == 1 else "-"
                    flen = max_pos - min_pos + 1
                    start = min_pos if strand == "+" else max_pos
                    location = [[contig_id, start, strand, flen]]
                    qualifiers = ft.qualifiers
                    generated_id = self._get_qualifier_value(
                        qualifiers.get("ID"))
                    if not generated_id:
                        # Skipping feature with no ID (mostly repeat regions)
                        continue
                    dna = cds_to_dna.get(generated_id)
                    if not dna:
                        # Skipping feature with no DNA (mostly repeat regions)
                        continue
                    name = self._get_qualifier_value(qualifiers.get("Name"))
                    ec = self._get_qualifier_value(qualifiers.get("eC_number"))
                    gene = self._get_qualifier_value(qualifiers.get("gene"))
                    product = self._get_qualifier_value(
                        qualifiers.get("product"))
                    fid = generated_id
                    aliases = []
                    if name:
                        aliases.append(name)
                    if gene:
                        aliases.append(gene)
                    if ec:
                        aliases.append(ec)
                        genes_with_ec += 1
                    # hashlib.md5 requires bytes in Python 3
                    md5 = hashlib.md5(dna.encode()).hexdigest()
                    feature = {
                        "id": fid,
                        "location": location,
                        "type": "gene",
                        "aliases": aliases,
                        "md5": md5,
                        "dna_sequence": dna,
                        "dna_sequence_length": len(dna),
                    }
                    if product:
                        feature["function"] = product
                        if product != "hypothetical protein":
                            non_hypothetical += 1
                    if ec and ec in self.ec_to_sso:
                        sso_list = self.ec_to_sso[ec]
                        sso_terms = {}
                        for sso_item in sso_list:
                            sso_terms[sso_item["id"]] = {
                                "id": sso_item["id"],
                                "evidence": [evidence],
                                "term_name": sso_item["name"],
                                "ontology_ref": self.sso_ref,
                                "term_lineage": []
                            }
                        feature["ontology_terms"] = {"SSO": sso_terms}
                        genes_with_sso += 1
                    cds = None
                    mrna = None
                    prot = cds_to_prot.get(generated_id)
                    if prot:
                        cds_id = fid + "_CDS"
                        mrna_id = fid + "_mRNA"
                        prot_len = len(prot)
                        prot_lengths.append(prot_len)
                        feature["protein_translation"] = prot
                        feature["protein_translation_length"] = prot_len
                        feature["cdss"] = [cds_id]
                        feature["mrnas"] = [mrna_id]
                        cds = {
                            "id": cds_id,
                            "location": location,
                            "md5": md5,
                            "parent_gene": fid,
                            "parent_mrna": mrna_id,
                            "function": (product if product else ""),
                            "ontology_terms": {},
                            "protein_translation": prot,
                            "protein_translation_length": prot_len,
                            "aliases": aliases
                        }
                        mrna = {
                            "id": mrna_id,
                            "location": location,
                            "md5": md5,
                            "parent_gene": fid,
                            "cds": cds_id
                        }
                    features.append(feature)
                    if cds:
                        cdss.append(cds)
                    if mrna:
                        mrnas.append(mrna)

        # Prepare report
        report = ""
        report += "Number of genes predicted: " + str(len(features)) + "\n"
        report += "Number of protein coding genes: " + str(
            len(prot_lengths)) + "\n"
        report += "Number of genes with non-hypothetical function: " + str(
            non_hypothetical) + "\n"
        report += "Number of genes with EC-number: " + str(
            genes_with_ec) + "\n"
        report += "Number of genes with Seed Subsystem Ontology: " + str(
            genes_with_sso) + "\n"
        report += "Average protein length: " + str(
            int(sum(prot_lengths) / float(len(prot_lengths)))) + " aa.\n"

        annotated_assembly = namedtuple("annotated_assembly",
                                        "features cdss mrnas report_message")
        return annotated_assembly(features, cdss, mrnas, report)

    def get_new_annotations(self, gff_filepath):
        """

        :param gff_filepath: A dictionary of ids with products and ec numbers
        :return:
        """
        evidence = self.make_annotation_evidence()
        genome = {}
        with open(gff_filepath, "r") as f:
            for rec in GFF.parse(f):
                gid = rec.id
                gene_features = {"id": id}

                for feature in rec.features:
                    qualifiers = feature.qualifiers
                    if "product" in qualifiers:
                        gene_features["function"] = " ".join(
                            qualifiers["product"])

                    if "eC_number" in qualifiers:
                        ec_numbers = qualifiers["eC_number"]
                        sso_terms = dict()
                        for ec in ec_numbers:
                            sso_list = self.ec_to_sso.get(ec, [])
                            for sso_item in sso_list:
                                sso_terms[sso_item["id"]] = {
                                    "id": sso_item["id"],
                                    "evidence": [evidence],
                                    "term_name": sso_item["name"],
                                    "ontology_ref": self.sso_ref,
                                    "term_lineage": []
                                }

                        gene_features["ontology_terms"] = sso_terms
                genome[gid] = gene_features

        return genome

    def write_genome_to_fasta(self, genome_data):
        """

        :param genome_data:
        :return:
        """
        fasta_for_prokka_filepath = os.path.join(
            self.scratch, "features_" + str(uuid.uuid4()) + ".fasta")
        count = 0
        with open(fasta_for_prokka_filepath, "w") as f:
            for item in genome_data["data"]["features"]:
                if "id" not in item or "dna_sequence" not in item:
                    print("This feature does not have a valid dna sequence.")
                else:
                    f.write(">" + item["id"] + "\n" + item["dna_sequence"] +
                            "\n")
                    count += 1

        print("Finished printing to" + fasta_for_prokka_filepath)
        if os.stat(fasta_for_prokka_filepath).st_size == 0:
            raise Exception(
                "This genome does not contain features with DNA_SEQUENCES. Fasta file is empty."
            )

        return fasta_for_prokka_filepath

    def make_sso_ontology_event(self):
        """

        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        with open('/kb/module/kbase.yml') as f:
            yml_text = f.read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation",
            "method_version": version,
            "timestamp": time_string,
            "id": "SSO",
            "ontology_ref": self.sso_ref
        }

    def make_annotation_evidence(self):
        """
        Create a dict for the evidence field for the genome
        :param sso_ref: Reference to the annotation library set
        :return: Ontology_event to be appended to the list of genome ontology events
        """
        time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        with open('/kb/module/kbase.yml') as f:
            yml_text = f.read()
        version = re.search("module-version:\n\W+(.+)\n", yml_text).group(1)

        return {
            "method": "Prokka Annotation (Evidence)",
            "method_version": version,
            "timestamp": time_string,
        }

    def create_genome_ontology_fields(self, genome_data):
        """
        Create ontology event fields for a genome object
        :param genome_data: a genome object's data field
        :return: a named tuple containing the modified genome object and a new ontology event index
        """
        # Make sure ontologies_events exist
        sso_event = self.make_sso_ontology_event()
        if 'ontology_events' in genome_data['data']:
            genome_data['data']['ontology_events'].append(sso_event)
            ontology_event_index = len(
                genome_data['data']['ontology_events']) - 1
        else:
            genome_data['data']['ontology_events'] = [sso_event]
            ontology_event_index = 0

        genome_obj_modified = namedtuple('genome_obj_modified',
                                         'genome_data ontology_event_index')
        return genome_obj_modified(genome_data, ontology_event_index)

    @staticmethod
    def old_genome_ontologies(feature, new_ontology):
        """
        Update the feature's ontologies for an old genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :return: The feature with the ontology updated, in the old style
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}
        for key in new_ontology.keys():
            feature["ontology_terms"]["SSO"][key] = new_ontology[key]
        return feature

    @staticmethod
    def new_genome_ontologies(feature, new_ontology, ontology_event_index):
        """
        Update the feature's ontologies for a new genome
        :param feature: Feature to update
        :param new_ontology: New Ontology to update with
        :param ontology_event_index: Ontology index to update the feature with
        :return: the updated feature
        """
        if "ontology_terms" not in feature:
            feature["ontology_terms"] = {"SSO": {}}
        if "SSO" not in feature["ontology_terms"]:
            feature["ontology_terms"]["SSO"] = {}

        for key in new_ontology.keys():
            term_id = new_ontology[key]["id"]
            if term_id in feature["ontology_terms"]["SSO"]:
                feature["ontology_terms"]["SSO"][term_id].append(
                    ontology_event_index)
            else:
                feature["ontology_terms"]["SSO"][term_id] = [ontology_event_index]
        return feature

    def annotate_genome_with_new_annotations(self, **annotation_args):
        """
        Annotate the genome with new annotations for Genome ReAnnotation
        :param annotation_args: genome_data from the genome obj, new_annotations from prokka, and the output_genome_name
        :return: A tuple containing the genome_ref, filepaths for the function and ontology summary, and stats about the annotations
        """
        genome_data = annotation_args["genome_data"]
        new_annotations = annotation_args["new_annotations"]

        new_genome = False
        if 'feature_counts' in genome_data['data']:
            new_genome = True
            genome_obj_modified = self.create_genome_ontology_fields(
                genome_data)
            genome_data = genome_obj_modified.genome_data
            ontology_event_index = genome_obj_modified.ontology_event_index

        stats = {
            "current_functions": len(genome_data["data"]["features"]),
            "new_functions": 0,
            "found_functions": 0,
            "new_ontologies": 0
        }

        function_summary_fp = os.path.join(self.scratch, "function_report")
        ontology_summary_fp = os.path.join(self.scratch, "ontology_report")
        func_r = open(function_summary_fp, "w")
        onto_r = open(ontology_summary_fp, "w")
        func_r.write("function_id current_function new_function\n")
        onto_r.write("function_id current_ontology new_ontology\n")

        ontologies_present = {"SSO": {}}
        for i, feature in enumerate(genome_data["data"]["features"]):
            fid = feature["id"]
            current_function = feature.get("function", "")
            current_functions = feature.get("functions", [])
            current_ontology = feature.get("ontology_terms", None)
            new_function = ""
            new_ontology = dict()

            if fid in new_annotations:
                # Set Function
                new_function = new_annotations[fid].get("function", "")
                if new_function and "hypothetical protein" not in new_function:
                    if (new_function != current_function
                            and new_function not in current_functions):
                        stats['new_functions'] += 1
                    genome_data["data"]["features"][i][
                        "function"] = new_function
                    genome_data["data"]["features"][i]["functions"] = [
                        new_function
                    ]
                    stats['found_functions'] += 1

                # Set Ontologies
                new_ontology = new_annotations[fid].get("ontology_terms", None)
                if new_ontology:
                    stats['new_ontologies'] += 1
                    if new_genome:
                        # New style
                        genome_data["data"]["features"][i] = self. \
                            new_genome_ontologies(feature, new_ontology, ontology_event_index)

                        # Add to ontologies Present
                        for key in new_ontology.keys():
                            oid = new_ontology[key]["id"]
                            name = new_ontology[key].get("name", "Unknown")
                            ontologies_present["SSO"][oid] = name

                    else:
                        genome_data["data"]["features"][i] = self. \
                            old_genome_ontologies(feature, new_ontology)

            if current_function:
                func_r.write(
                    json.dumps([fid, [current_function], [new_function]]) +
                    "\n")
            else:
                func_r.write(
                    json.dumps([fid, current_functions, [new_function]]) +
                    "\n")

            onto_r.write(
                json.dumps([fid, current_ontology, new_ontology]) + "\n")

        func_r.close()
        onto_r.close()

        if ontologies_present["SSO"]:
            if "ontologies_present" in genome_data["data"]:
                if "SSO" in genome_data["data"]["ontologies_present"]:
                    for key, value in ontologies_present["SSO"].items():
                        genome_data["data"]["ontologies_present"]["SSO"][
                            key] = value
                else:
                    genome_data["data"]["ontologies_present"][
                        "SSO"] = ontologies_present["SSO"]
            else:
                genome_data["data"]["ontologies_present"] = ontologies_present

        info = self.gfu.save_one_genome({
            "workspace": self.output_workspace,
            "name": annotation_args["output_genome_name"],
            "data": genome_data["data"],
            "provenance": self.ctx.provenance()
        })["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])
        annotated_genome = namedtuple(
            "annotated_genome",
            "genome_ref function_summary_filepath ontology_summary_filepath stats"
        )

        return annotated_genome(genome_ref, function_summary_fp,
                                ontology_summary_fp, stats)

    def upload_file(self,
                    filepath,
                    message="Annotation report generated by kb_prokka"):
        """
        Upload a file to shock
        :param filepath: File to upload
        :param message: Optional Upload Message
        :return:
        """
        output_file_shock_id = self.dfu.file_to_shock({"file_path":
                                                       filepath})["shock_id"]
        print("Uploaded filepath " + filepath + " to shock and got id " +
              output_file_shock_id)
        return {
            "shock_id": output_file_shock_id,
            "name": os.path.basename(filepath),
            "label": os.path.basename(filepath),
            "description": message
        }

    def report_annotated_genome(self, genome):
        """ Create report output with newly reannotated genome, and some stats

        :param genome: Reannotated Genome Reference, Report Files and Stats
        :return: Reference to Report Object
        """
        genome_ref = genome.genome_ref
        stats = genome.stats

        file_links = [
            self.upload_file(genome.ontology_summary_filepath),
            self.upload_file(genome.function_summary_filepath)
        ]

        report_message = ("Genome Ref:{0}\n"
                          "Number of features sent into prokka:{1}\n"
                          "New functions found:{2}\n"
                          "Ontology terms found:{3}\n").format(
                              genome_ref, stats["current_functions"],
                              stats["new_functions"], stats["new_ontologies"])

        report_info = self.kbr.create_extended_report({
            "message": report_message,
            "objects_created": [{
                "ref": genome_ref,
                "description": "Annotated genome"
            }],
            "file_links": file_links,
            "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name": self.output_workspace
        })

        return {
            "output_genome_ref": genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }

    def annotate_genome(self, params):
        """ User input an existing genome to re-annotate.

        :param params: Reference to the genome, Output File Name, UI Parameters
        :return: Report with Reannotated Genome and Stats about it
        """
        self.download_seed_data()
        self.output_workspace = params["output_workspace"]

        genome_ref = self._get_input_value(params, "object_ref")
        output_name = self._get_input_value(params, "output_genome_name")
        # genome_data = self.dfu.get_objects({"object_refs": [genome_ref]})["data"][0]

        genome_data = \
            self.genome_api.get_genome_v1({"genomes": [{"ref": genome_ref}], 'downgrade': 0})[
                "genomes"][0]

        fasta_for_prokka_filepath = self.write_genome_to_fasta(genome_data)
        output_dir = self.run_prokka(params, fasta_for_prokka_filepath)
        prokka_results = self.retrieve_prokka_results(output_dir)
        new_annotations = self.get_new_annotations(prokka_results.gff_filepath)
        annotated_genome = self.annotate_genome_with_new_annotations(
            genome_data=genome_data,
            new_annotations=new_annotations,
            output_genome_name=output_name)
        return self.report_annotated_genome(annotated_genome)

    def annotate_assembly(self, params, assembly_info):
        """
        Annotate an assembly with Prokka. The steps are: download the assembly as a
        FASTA file, rename the contigs, run Prokka against the contigs, parse the
        results, and finally create and upload a genome object.

        :param params: object reference, output_genome_name and output_workspace
        :param assembly_info: Information used to determine if the assembly is too big
        :return: Report with newly annotated assembly as a genome, and stats about it
        """
        self.download_seed_data()
        output_workspace = params["output_workspace"]

        assembly_ref = self._get_input_value(params, "object_ref")
        output_genome_name = self._get_input_value(params,
                                                   "output_genome_name")
        output_workspace = self._get_input_value(params, "output_workspace")
        assembly_info = self.inspect_assembly(assembly_info[10], assembly_ref)
        orig_fasta_file = self.au.get_assembly_as_fasta({"ref":
                                                         assembly_ref})["path"]

        # Rename Assembly and Keep Track of Old Contigs
        renamed_assembly = self.create_renamed_assembly(orig_fasta_file)
        # Run Prokka with the modified, renamed fasta file
        output_dir = self.run_prokka(params, renamed_assembly.filepath)
        # Prokka_results
        prokka_results = self.retrieve_prokka_results(output_dir)
        # Parse Results
        annotated_assembly = self.parse_prokka_results(
            gff_filepath=prokka_results.gff_filepath,
            cds_to_dna=prokka_results.cds_to_dna,
            cds_to_prot=prokka_results.cds_to_prot,
            new_ids_to_old=renamed_assembly.new_ids_to_old)

        # Force defaults for optional parameters that may be set to None
        scientific_name = params.get('scientific_name') or 'Unknown'
        domain = params.get('kingdom') or 'Bacteria'
        gcode = params.get('gcode') or 0

        genome = {
            "id": "Unknown",
            "features": annotated_assembly.features,
            "scientific_name": scientific_name,
            "domain": domain,
            "genetic_code": gcode,
            "assembly_ref": assembly_ref,
            "cdss": annotated_assembly.cdss,
            "mrnas": annotated_assembly.mrnas,
            "source": "PROKKA annotation pipeline",
            "gc_content": assembly_info.gc_content,
            "dna_size": assembly_info.dna_size,
            "reference_annotation": 0
        }

        info = self.gfu.save_one_genome({
            "workspace": output_workspace,
            "name": output_genome_name,
            "data": genome,
            "provenance": self.ctx.provenance()
        })["info"]

        genome_ref = str(info[6]) + "/" + str(info[0]) + "/" + str(info[4])

        report_message = "Genome saved to: " + output_workspace + "/" + \
                         output_genome_name + "\n" + annotated_assembly.report_message

        report_info = self.kbr.create_extended_report({
            "message": report_message,
            "objects_created": [{
                "ref": genome_ref,
                "description": "Annotated genome"
            }],
            "report_object_name": "kb_prokka_report_" + str(uuid.uuid4()),
            "workspace_name": output_workspace
        })

        return {
            "output_genome_ref": genome_ref,
            "report_name": report_info["name"],
            "report_ref": report_info["ref"]
        }
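
    # Usage sketch (hypothetical refs and names, assuming a configured KBase
    # SDK environment where ``impl`` is an instance of this class):
    #   result = impl.annotate_assembly(
    #       {"object_ref": "123/4/5",
    #        "output_genome_name": "my_genome",
    #        "output_workspace": "my_workspace"},
    #       assembly_info)
    #   print(result["output_genome_ref"], result["report_ref"])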
Example #20
0
class VCFToVariation:
    def __init__(self, config, scratch, callback_url):
        # use the passed-in scratch dir and callback URL consistently,
        # rather than assigning them twice from config/environment
        self.scratch = scratch
        self.ws_url = config['workspace-url']
        self.callback_url = callback_url
        self.dfu = DataFileUtil(self.callback_url)
        self.wsc = Workspace(self.ws_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gapi = GenericsAPI(self.callback_url)


    def _parse_vcf_data(self, params):
        vcf_filepath = self._stage_input(params)

        # file is validated by this point, can assume vcf_filepath is valid
        reader = vcf.Reader(open(vcf_filepath, 'r'))

        # keep the full "major.minor" version (e.g. "VCFv4.2" -> 4.2)
        version = float(reader.metadata['fileformat'][4:])
        genotypes = reader.samples
        chromosomes = []
        contigs = {}
        totalvars = 0

        for record in reader:
            totalvars += 1
            if record.CHROM not in chromosomes:
                chromosomes.append(record.CHROM)

            if record.CHROM not in contigs.keys():
                passvar = 1 if not record.FILTER else 0

                contigs[record.CHROM] = {
                    'contig_id': record.CHROM,
                    'totalvariants': 1,
                    'passvariants': passvar,
                    # placeholder span; the real contig length is filled in
                    # later from the assembly in _construct_contig_info
                    'length': int(record.affected_end - record.affected_start),
                }
            else:
                contigs[record.CHROM]['totalvariants'] += 1
                if not record.FILTER:
                    contigs[record.CHROM]['passvariants'] += 1

        vcf_info = {
            'version': version,
            'contigs': contigs,
            'total_variants': totalvars,
            'genotype_ids': genotypes,
            'chromosome_ids': chromosomes,
            'file_ref': vcf_filepath
        }

        return vcf_info
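
    # Shape of the vcf_info dict built above (values are illustrative):
    #   {'version': 4.2,
    #    'contigs': {'Chr1': {'contig_id': 'Chr1', 'totalvariants': 1000,
    #                         'passvariants': 950, 'length': 30427671}},
    #    'total_variants': 1000,
    #    'genotype_ids': ['sample_1', 'sample_2'],
    #    'chromosome_ids': ['Chr1'],
    #    'file_ref': '/kb/module/work/tmp/variants.vcf'}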


    def _validate_vcf_to_sample(self, vcf_genotypes, sample_ids):
        genos_not_found = []

        vgenotypes = [x.upper().strip() for x in vcf_genotypes]
        sids = [x.upper().strip() for x in sample_ids]

        for geno in vgenotypes:
            if geno not in sids:
                genos_not_found.append(geno)

        if not genos_not_found:
            return True
        else:
            return genos_not_found

    def _chk_if_vcf_ids_in_assembly(self, vcf_chromosomes, assembly_chromosomes):
        chromos_not_in_assembly = []

        pp(assembly_chromosomes)

        for chromo in vcf_chromosomes:
            if chromo not in assembly_chromosomes:
                chromos_not_in_assembly.append(chromo)

        if not chromos_not_in_assembly:
            return True
        else:
            return chromos_not_in_assembly

    def _get_vcf_version(self, vcf_filepath):
        # use a named opener and handle so we don't shadow the imported ``vcf`` module
        opener = gzip.open if is_gz_file(vcf_filepath) else open
        with opener(vcf_filepath, 'rt') as vcf_handle:
            line = vcf_handle.readline()
            tokens = line.split('=')

            if not tokens[0].startswith('##fileformat'):
                log("Invalid VCF. ##fileformat line in meta is improperly formatted.")
                raise ValueError("Invalid VCF. ##fileformat line in meta is improperly formatted. "
                                 "Check the VCF specification: https://samtools.github.io/hts-specs/")

            vcf_version = float(tokens[1][-4:].rstrip())

            return vcf_version
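
    # Worked example of the parse above: for the header line
    #   "##fileformat=VCFv4.2\n"
    # split('=') gives tokens == ['##fileformat', 'VCFv4.2\n'], and
    # tokens[1][-4:].rstrip() == '4.2', so vcf_version == 4.2.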

    def validate_vcf(self, params):
        if 'genome_or_assembly_ref' not in params:
            raise ValueError('Genome or Assembly reference not in input parameters: \n\n' + str(params))
        if 'vcf_staging_file_path' not in params:
            raise ValueError('VCF staging file path not in input parameters: \n\n' + str(params))


        vcf_filepath = self._stage_input(params)

        vcf_version = self._get_vcf_version(vcf_filepath)

        # set up directories for validation output
        validation_output_dir = os.path.join(self.scratch, 'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        # vcftools (vcf-validator) supports VCF v4.0-4.2
        # https://github.com/vcftools/vcftools

        # EBIvariation/vcf-validator (vcf_validator_linux) supports VCF v4.1-4.3
        # https://github.com/EBIvariation/vcf-validator

        # vcftools is only used to validate VCF v4.0

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-l")
            validator_cmd.append('error')
            print("VCF version "+str(vcf_version)+".")
        elif vcf_version >= 4.0:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version 4.0.")
        else:
            raise ValueError('VCF version not found in file, fileformat line malformed, or version < 4.0. '
                             'The fileformat line must be the first line of the VCF file and use correct '
                             'syntax. Check the VCF specification: https://samtools.github.io/hts-specs/')

        print("Validator command: {}".format(validator_cmd))

        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            if line.decode("utf-8").strip().startswith('[info]'):
                validator_output.append(line.decode("utf-8"))

        out, err = p.communicate()

        validation_output_filename = os.path.join(validation_output_dir, 'vcf_validation.txt')
        file_output_chk = []

        try:
            if validator_output[0][:6] == '[info]':
                # validation by vcf_validator_linux
                validation_output_filename = validator_output[1].split(' ')[6].strip('\n')
                vo = validator_output[2].split(' ')
                file_output_chk = ''.join(vo[9:]).strip('\n')

                if not os.path.exists(validation_output_filename):
                    raise ValueError(validation_output_filename+' does not exist!')

                if file_output_chk != 'isvalid':
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))

                #TODO: more detailed validation parsing for vcf_validator_linux
            else:
                if validator_output:
                    with open(validation_output_filename, 'w') as f:
                        for line in validator_output:
                            f.write(str(line))
                    print('\n'.join(validator_output))
                    raise ValueError('\n'.join(validator_output))
                else:
                    with open(validation_output_filename, 'w') as f:
                        f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                                "\nFile is valid per VCF spec v4.0")

                # TODO: more detailed validation parsing for vcftools
        except IndexError:
            # a valid VCF below v4.1 produces no '[info]' lines from vcftools,
            # so indexing validator_output above raises IndexError
            if validator_output:
                with open(validation_output_filename, 'w') as f:
                    for line in validator_output:
                        f.write(str(line))
                print('\n'.join(validator_output))
                raise ValueError('\n'.join(validator_output))
            else:
                with open(validation_output_filename, 'w') as f:
                    f.write("vcftools used to validate vcf file:\n" + vcf_filepath +
                            "\nFile is valid per VCF spec v4.0")

        if not os.path.exists(validation_output_filename):
            print('Validator did not generate log file!')
            raise SystemError("Validator did not generate a log file.")

        log("Validator output filepath: {}".format(validation_output_filename))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filename

    def _stage_input(self, params):
        # extract file location from input ui parameters
        if params['vcf_staging_file_path'].startswith('/kb/module/test/'):
            # variation utils unit test
            vcf_local_file_path = params['vcf_staging_file_path']

            if vcf_local_file_path.endswith('.gz'):
                with gzip.open(vcf_local_file_path, 'rb') as f_in:
                    with open(vcf_local_file_path[:-3], 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)

                vcf_local_file_path = vcf_local_file_path[:-3]
        else:
            staging_dir = '/staging'
            vcf_local_file_path = os.path.join(staging_dir, params['vcf_staging_file_path'])

        if not os.path.exists(vcf_local_file_path):
            raise OSError('VCF input path does not exist, or is not readable')

        orig_file_path = os.path.join(self.scratch, 'original_' + os.path.basename(vcf_local_file_path))
        print(f'VCF: {vcf_local_file_path} Orig: {orig_file_path}')
        self.original_file = shutil.copy(vcf_local_file_path, orig_file_path)

        # TODO: use data file utils here, upload vcf to shock, use dfu.
        if is_gz_file(vcf_local_file_path):
            # /staging is read only, therefore we have to copy before uncompressing
            if not vcf_local_file_path == os.path.join(self.scratch, params['vcf_staging_file_path']):
                copy = shutil.copy(vcf_local_file_path, os.path.join(self.scratch, params['vcf_staging_file_path']))
                unpack = self.dfu.unpack_file({'file_path': copy})
            else:
                unpack = {'file_path': os.path.join(self.scratch, params['vcf_staging_file_path'])}
            params['vcf_local_file_path'] = unpack['file_path']
            return unpack['file_path']
        else:
            params['vcf_local_file_path'] = vcf_local_file_path
            return vcf_local_file_path

    def _create_sample_attribute_file(self, vcf_file, sample_attribute_mapping_file):
        """
        Create a sample attribute mapping file from the VCF #CHROM header line.
        """
        try:
            with open(vcf_file, 'r') as vcf_handle:
                lines = vcf_handle.readlines()

                for line in lines:
                    if line.startswith("#CHROM"):
                        header = line.lstrip().split("\t")

                        try:
                            with open(sample_attribute_mapping_file, 'w') as attribute_mapping_handle:
                                # the last header field keeps its trailing
                                # newline, which terminates each row below
                                attribute_mapping_handle.write("Attribute\tAttribute ontology ID\tUnit\tUnit ontology ID")
                                for i in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[i])

                                attribute_mapping_handle.write("label\t\t\t")
                                for j in range(9, len(header)):
                                    attribute_mapping_handle.write("\t" + header[j])
                        except IOError:
                            print("Could not write to file:", sample_attribute_mapping_file)

        except IOError:
            print("Could not read file:", vcf_file)

    def _validate_assembly_ids(self, params):
        # All chromosome ids from the VCF should be in the assembly,
        # but not all assembly chromosome ids need to be in the VCF

        if 'genome_ref' in params:
            subset = self.wsc.get_object_subset([{
                'included': ['/assembly_ref'],
                'ref': params['genome_or_assembly_ref']
            }])

            self.vcf_info['assembly_ref'] = subset[0]['data']['assembly_ref']

        if 'assembly_ref' in params:
            self.vcf_info['assembly_ref'] = params['assembly_ref']

        assembly_chromosome_ids_call = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])

        assembly_chromosomes = assembly_chromosome_ids_call[0]['data']['contigs'].keys()
        vcf_chromosomes = self.vcf_info['chromosome_ids']

        chk_assembly_ids = self._chk_if_vcf_ids_in_assembly(vcf_chromosomes, assembly_chromosomes)

        if isinstance(chk_assembly_ids, list):
            failed_ids = ' '.join(chk_assembly_ids)
            print(f'VCF contig ids: {failed_ids} are not present in assembly.')
            raise ValueError(f'VCF contig ids: {failed_ids} are not present in assembly.')


        return assembly_chromosomes

    def _validate_sample_ids(self, params):
        # All samples within the VCF file need to be in sample attribute list


        vcf_genotypes = self.vcf_info['genotype_ids']

        sample_ids_subset = self.wsc.get_object_subset([{
            'included': ['/instances'],
            'ref': params['sample_attribute_ref']
        }])

        sample_ids = sample_ids_subset[0]['data']['instances'].keys()

        validate_genotypes = self._validate_vcf_to_sample(vcf_genotypes, sample_ids)

        if isinstance(validate_genotypes, list):
            failed_genos = ' '.join(validate_genotypes)
            print(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')
            raise ValueError(f'VCF genotypes: {failed_genos} are not present in sample attribute mapping.')

        return sample_ids

    def _construct_contig_info(self, params):
        """
            KBaseGwasData.Variations type spec

            /*
               Contig variation data
                 contig_id - contig identifier
                 totalvariants - total number of variants in each contig
                 passvariants - total number of variants that pass quality variation filter in contig
                 length - length of contig from assembly data
             */

             typedef structure {
               string contig_id;
               int totalvariants;
               int passvariants;
               int length; // from assembly
             } contig_info;
        """

        assembly_chromosome_dict = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': self.vcf_info['assembly_ref']
        }])[0]['data']['contigs']


        contigs = []
        contig_infos = self.vcf_info['contigs']

        for contig_id in contig_infos:
            length_contig = assembly_chromosome_dict[contig_id].get("length")
            contig_infos[contig_id]["length"] = length_contig
            contigs.append(contig_infos[contig_id])

        return contigs
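
    # Example contig_info entry appended above, matching the typedef in the
    # docstring (values are illustrative):
    #   {'contig_id': 'Chr1', 'totalvariants': 1000,
    #    'passvariants': 950, 'length': 30427671}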
   

    def _bgzip_vcf(self, vcf_filepath):

        if not os.path.exists(vcf_filepath):
            print(vcf_filepath + " does not exist")

        zip_cmd = ["bgzip", vcf_filepath]

        p = subprocess.Popen(zip_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        bgzip_file_path = vcf_filepath + ".gz"
        print(bgzip_file_path)

        return bgzip_file_path
  
 
    def _index_vcf(self, bgzip_file):

        bgzip_filepath = os.path.join(self.scratch, bgzip_file)
        if not os.path.exists(bgzip_filepath):
            print(bgzip_filepath + " does not exist")

        index_cmd = ["tabix", "-p", "vcf", bgzip_filepath]
        p = subprocess.Popen(index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        index_file_path = bgzip_filepath + ".tbi"

        return index_file_path

    def _index_assembly(self, assembly_file):
        if not os.path.exists(assembly_file):
            print(assembly_file + " does not exist")

        logging.info("indexing assembly file")

        assembly_index_cmd = ["samtools", "faidx", assembly_file]
        print(assembly_index_cmd)
        p = subprocess.Popen(assembly_index_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)

        out, err = p.communicate()

        logging.info("indexing of assembly file done!")

        return assembly_file + ".fai"
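
    # Illustrative refactor (not part of the original class): the
    # Popen/communicate pattern above recurs in _bgzip_vcf, _index_vcf and
    # _index_assembly; a shared helper could look like this, using only the
    # standard library already imported by this module.
    def _run_cmd(self, cmd):
        """Run a command in self.scratch, merging stderr into stdout."""
        p = subprocess.Popen(cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        out, _ = p.communicate()
        return p.returncode, out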

    def _download_assembly(self, assembly_ref):
        file = self.au.get_assembly_as_fasta({
          'ref': assembly_ref
        })
        return file
 
    def _construct_variation(self, params, contigs_info):
        
        """
            KBaseGwasData.Variations type spec
             /*
               Variation object data structure
                 num_genotypes - number of total genotypes within variant file
                 num_variants - number of total variants within variant file
                 contigs - list of contig ids and variant information
                 attribute_ref - KBase reference to attribute mapping workspace object
                 genome_ref - KBase reference to genome workspace object
                 assembly_ref - KBase reference to assembly workspace object
                 vcf_handle_ref - VCF handle reference to VCF file

                 @optional genome_ref
             */
             typedef structure {
               int numgenotypes;
               int numvariants;
               list<contig_info> contigs;
               attribute_ref population; // KBaseExperiments.AttributeMapping
               genome_ref genome_ref; // KBaseGenomes.Genome
               assembly_ref assemby_ref; // KBaseGenomeAnnotations.Assembly
               vcf_handle_ref vcf_handle_ref;
             } Variations;

            :param params: KBase UI input parameters
            :param contigs_info: previously constructed per-contig variant information
            :return: constructed variation object (dictionary)
        """

        if not self.vcf_info['file_ref'].startswith(self.scratch):
            new_vcf_file = os.path.join(self.scratch, os.path.basename(self.vcf_info['file_ref']))
            self.vcf_info['file_ref'] = shutil.copy(self.vcf_info['file_ref'], new_vcf_file)
      

        vcf_staged_file = self.original_file

        bgzip_file_path = self._bgzip_vcf(vcf_staged_file)
        vcf_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': bgzip_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(bgzip_file_path, vcf_shock_file_ref)


        index_file_path = self._index_vcf(bgzip_file_path)
        vcf_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(index_file_path, vcf_index_shock_file_ref)


        assembly_file_path = self._download_assembly(self.vcf_info['assembly_ref'])['path']

        assembly_index_file_path = self._index_assembly(assembly_file_path)
        assembly_index_shock_file_ref = self.dfu.file_to_shock(
            {'file_path': assembly_index_file_path, 'make_handle': 1}
        )
        compare_md5_local_with_shock(assembly_index_file_path, assembly_index_shock_file_ref)
        
        variation_obj = {
            'numgenotypes': int(len(self.vcf_info['genotype_ids'])),
            'numvariants': int(self.vcf_info['total_variants']),
            'contigs': contigs_info,
            'population': params['sample_attribute_ref'],

            # TYPE SPEC CHANGE: need to change type spec to assembly_ref instead of assemby_ref
            'assemby_ref': self.vcf_info['assembly_ref'],
            'vcf_handle_ref': vcf_shock_file_ref['handle']['hid'],
            'vcf_handle' : vcf_shock_file_ref['handle'],
            'vcf_index_handle_ref': vcf_index_shock_file_ref['handle']['hid'],
            'vcf_index_handle': vcf_index_shock_file_ref['handle'],
            'assembly_index_handle_ref': assembly_index_shock_file_ref['handle']['hid'],
            'assembly_index_handle': assembly_index_shock_file_ref['handle']
        }
        if 'genome_ref' in params:
            variation_obj['genome_ref'] = params['genome_ref']

        return variation_obj

    def _save_var_obj(self, params, var):
        """
        :param params:
        :param var:
        :return:
            DataFileUtils object_info:
                objid - the numerical id of the object.
                name - the name of the object.
                type - the type of the object.
                save_date - the save date of the object.
                ver - the version of the object.
                saved_by - the user that saved or copied the object.
                wsid - the id of the workspace containing the object.
                workspace - the name of the workspace containing the object.
                chsum - the md5 checksum of the object.
                size - the size of the object in bytes.
                meta - arbitrary user-supplied metadata about the object.
        """

        print('Saving Variation to workspace...\n')

        if var:
            if 'variation_object_name' not in params:
                var_obj_name = 'variation_'+str(uuid.uuid4())
            else:
                var_obj_name = params['variation_object_name']

            var_obj_info = self.dfu.save_objects({
                'id': self.dfu.ws_name_to_id(params['workspace_name']),
                'objects': [{
                    'type': 'KBaseGwasData.Variations',
                    'data': var,
                    'name': var_obj_name
                }]
            })[0]

            return var_obj_info
        else:
            raise ValueError('Variation object blank, cannot save to workspace!')

    def _validate_sample_attribute_ref(self, params):

        # params["sample_attribute_ref"] = ''  # just for testing
        if not params['sample_attribute_ref']:
            sample_attribute_mapping_file = os.path.join(self.scratch, "sample_attribute.tsv")  # hardcoded for testing
            self._create_sample_attribute_file(params['vcf_local_file_path'], sample_attribute_mapping_file)

            logging.info("Uploading sample attribute file to ref")
            vcf_sample_attribute_shock_file_ref = self.dfu.file_to_shock(
                {'file_path': sample_attribute_mapping_file, 'make_handle': 1}
            )
            shock_id = vcf_sample_attribute_shock_file_ref['shock_id']
            ws_id = self.dfu.ws_name_to_id(params['workspace_name'])
            import_params = {
                'input_shock_id': shock_id,
                'output_ws_id': ws_id,
                'output_obj_name': 'Sample_attribute'}

            ret = self.gapi.file_to_attribute_mapping(import_params)
            params['sample_attribute_ref'] = ret['attribute_mapping_ref']

    def import_vcf(self, params):
        # VCF file validation
        file_valid_result = self.validate_vcf(params)
        self._validate_sample_attribute_ref(params)
        # VCF file parsing
        self.vcf_info = self._parse_vcf_data(params)
        # Validate vcf chromosome ids against assembly chromosome ids
        self._validate_assembly_ids(params)
        # Validate vcf genotypes against sample meta data ids
        self._validate_sample_ids(params)

        # Variation object construction
        # construct contigs_info
        contigs_info = self._construct_contig_info(params)
        # construct variation
        var = self._construct_variation(params, contigs_info)

        # Save variation object to workspace
        var_wksp_obj = self._save_var_obj(params, var)

        return [var_wksp_obj, var]
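
    # Usage sketch (hypothetical refs and paths, assuming a configured KBase
    # SDK environment):
    #   vtv = VCFToVariation(config, scratch, callback_url)
    #   obj_info, variation = vtv.import_vcf({
    #       'workspace_name': 'my_ws',
    #       'genome_or_assembly_ref': '123/4/5',
    #       'assembly_ref': '123/4/5',
    #       'vcf_staging_file_path': 'variants.vcf.gz',
    #       'sample_attribute_ref': '123/7/1',
    #       'variation_object_name': 'my_variation',
    #   })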
Example #21
0
    def run_kraken2(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_kraken2

        # Download input data as FASTA or FASTQ
        logging.info('Calling run_kraken2')
        logging.info(f'params {params}')
        # Check for presence of input file types in params
        input_genomes = 'input_genomes' in params and len(
            params['input_genomes']
        ) > 0 and None not in params['input_genomes']
        input_refs = 'input_refs' in params and len(
            params['input_refs']) > 0 and None not in params['input_refs']
        input_paired_refs = 'input_paired_refs' in params and len(
            params['input_paired_refs']
        ) > 0 and None not in params['input_paired_refs']
        for name in ['workspace_name', 'db_type']:
            if name not in params:
                raise ValueError('Parameter "' + name +
                                 '" is required but missing')
        if not input_genomes and not input_refs and not input_paired_refs:
            raise ValueError(
                'You must enter either an input genome or input reads')

        if input_refs and input_paired_refs:
            raise ValueError(
                'You must enter either single-end or paired-end reads, '
                'but not both')

        if input_genomes and (input_refs or input_paired_refs):
            raise ValueError(
                'You must enter either an input genome or input reads, '
                'but not both')

        if input_genomes and (not isinstance(params['input_genomes'][0], str)):
            raise ValueError('Pass in a valid input genome string')

        if input_refs and (not isinstance(params['input_refs'], list)):
            raise ValueError('Pass in a list of input references')

        if input_paired_refs and (not isinstance(params['input_paired_refs'],
                                                 list)):
            raise ValueError('Pass in a list of input references')

        logging.info(params['db_type'])
        logging.info(
            f'input_genomes {input_genomes} input_refs {input_refs} input_paired_refs {input_paired_refs}'
        )
        input_string = []
        if input_genomes:
            assembly_util = AssemblyUtil(self.callback_url)
            fasta_file_obj = assembly_util.get_assembly_as_fasta(
                {'ref': params['input_genomes'][0]})
            logging.info(fasta_file_obj)
            fasta_file = fasta_file_obj['path']
            input_string.append(fasta_file)

        if input_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"input_refs {params['input_refs']}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_refs']})
            print(
                f"Input parameters {params['input_refs']}, {params['db_type']} "
                f"download_reads_output {download_reads_output}")
            fastq_files = []
            fastq_files_name = []
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            logging.info(f"fastq files {fastq_files}")
            input_string.append(' '.join(fastq_files))

        if input_paired_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"input_refs {params['input_paired_refs']}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_paired_refs']})
            print(
                f"Input parameters {params['input_paired_refs']}, {params['db_type']} "
                f"download_reads_output {download_reads_output}")
            fastq_files = []
            fastq_files_name = []
            # input_string.append('--paired')
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            # if len(fastq_files) % 2 != 0:
            #     raise ValueError('There must be an even number of Paired-end reads files')
            logging.info(f"fastq files {fastq_files}")
            input_string.extend(fastq_files)

        logging.info(f'input_string {input_string}')

        output_dir = os.path.join(self.shared_folder, 'kraken2_output')
        report_file_name = 'report.txt'
        report_file = os.path.join(output_dir, report_file_name)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        outprefix = "kraken2"

        cmd = [
            '/kb/module/lib/kraken2/src/kraken2.sh', '-d',
            '/data/kraken2/' + params['db_type'], '-o', output_dir, '-p',
            outprefix, '-t', '1', '-i'
        ]
        cmd.extend(input_string)

        # cmd = ['kraken2', '--db', '/data/kraken2/' + params['db_type'],
        #        '--output', output_dir, '--report', report_file,
        #        '--threads', '1']
        # cmd.extend(['--confidence', str(params['confidence'])]) if 'confidence' in params else cmd

        logging.info(f'cmd {cmd}')
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')

        summary_file = os.path.join(output_dir, outprefix + '.report.csv')
        report_dir = os.path.join(output_dir, 'html_report')
        if not os.path.exists(report_dir):
            os.makedirs(report_dir)
        summary_file_dt = os.path.join(report_dir, 'kraken2.datatable.html')
        self._generate_DataTable(summary_file, summary_file_dt)
        shutil.copy2('/kb/module/lib/kraken2/src/index.html',
                     os.path.join(report_dir, 'index.html'))
        shutil.copy2(os.path.join(output_dir, outprefix + '.krona.html'),
                     os.path.join(report_dir, 'kraken2.krona.html'))
        shutil.move(os.path.join(output_dir, outprefix + '.tree.svg'),
                    os.path.join(report_dir, 'kraken2.tree.svg'))
        html_zipped = self.package_folder(report_dir, 'index.html',
                                          'index.html')

        # columns = [
        #     'Percentage of fragments covered by the clade rooted at this taxon',
        #     'Number of fragments covered by the clade rooted at this taxon',
        #     'Number of fragments assigned directly to this taxon', 'rank code',
        #     'taxid', 'name']
        # report_df = pd.read_csv(report_file, sep='\t',
        #                         header=None, names=columns)
        # code_dict = {'U': 'Unclassified', 'R': 'Root', 'D': 'Domain',
        #              'K': 'Kingdom', 'P': 'Phylum', 'C': 'Class', 'O': 'Order',
        #              'F': 'Family', 'G': 'Genus', 'S': 'Species'}
        # report_df['rank code'] = report_df['rank code'].apply(
        #     lambda x: code_dict[x[0]] + x[1] if len(x) > 1 else code_dict[x])

        # self._generate_report_table(report_df, report_html_file, output_dir)
        # report_df.to_html(report_html_file, classes='Kraken2_report', index=False)
        # html_zipped = self.package_folder(output_dir, 'report.html',
        #                                   'report')
        # Step 5 - Build a Report and return
        objects_created = []
        output_files = os.listdir(output_dir)
        output_files_list = []
        for output in output_files:
            # join with output_dir so the isdir check inspects the right path
            if not os.path.isdir(os.path.join(output_dir, output)):
                output_files_list.append({
                    'path': os.path.join(output_dir, output),
                    'name': output
                })
        message = f"Kraken2 run finished on {input_string} against {params['db_type']}."
        report_params = {
            'message': message,
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files_list,
            'html_links': [html_zipped],
            'direct_html_link_index': 0,
            'html_window_height': 460
        }

        # STEP 6: construct the output to send back
        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(
            report_params)
        report_output['report_params'] = report_params
        logging.info(report_output)
        # Return references which will allow inline display of
        # the report in the Narrative
        output = {
            'report_name': report_output['name'],
            'report_ref': report_output['ref'],
            'report_params': report_output['report_params']
        }
        #END run_kraken2

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_kraken2 return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
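
    # Usage sketch (hypothetical refs and database name, assuming an SDK test
    # harness that provides ``impl`` and ``ctx``):
    #   output = impl.run_kraken2(ctx, {
    #       'workspace_name': 'my_ws',
    #       'db_type': 'standard',
    #       'input_paired_refs': ['123/4/5'],
    #   })[0]
    #   print(output['report_name'], output['report_ref'])
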
    def run_cnelsonAppDemo(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_cnelsonAppDemo

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_cnelsonAppDemo function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) +
                            ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        logging.info('returning:' + pformat(output))

        #END run_cnelsonAppDemo

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_cnelsonAppDemo return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
    def run_ContigFilter_max(self, ctx, params):
        """
        New app which filters contigs in an assembly using both a minimum and a maximum contig length
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_ContigFilter_max
        # Check that the parameters are valid
        for name in [
                'min_length', 'max_length', 'assembly_ref', 'workspace_name'
        ]:
            if name not in params:
                raise ValueError('Parameter "' + name +
                                 '" is required but missing')
        if not isinstance(params['min_length'],
                          int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['max_length'],
                          int) or (params['max_length'] < 0):
            raise ValueError('Max length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(
                params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')

        print(params['min_length'], params['max_length'],
              params['assembly_ref'])
        output = {}

        assembly_util = AssemblyUtil(self.callback_url)
        fasta_file = assembly_util.get_assembly_as_fasta(
            {'ref': params['assembly_ref']})
        print(fasta_file)

        # Parse the downloaded file in FASTA format
        parsed_assembly = SeqIO.parse(fasta_file['path'], 'fasta')
        min_length = params['min_length']
        max_length = params['max_length']

        # Keep a list of contigs greater than min_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs over the min_length
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length and len(record.seq) <= max_length:
                good_contigs.append(record)
                n_remaining += 1
        # Create a file to hold the filtered data
        workspace_name = params['workspace_name']
        filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {'path': filtered_path},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })
        # Create an output summary message for the report
        text_message = "".join([
            'Filtered assembly to ',
            str(n_remaining), ' contigs out of ',
            str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [{
                'ref': new_ref,
                'description': 'Filtered contigs'
            }],
            'text_message': text_message
        }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': workspace_name
        })
        # Return the report reference and name in our results
        output = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }
        #END run_ContigFilter_max

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_ContigFilter_max return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
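
    # Usage sketch (hypothetical values, assuming an SDK test harness that
    # provides ``impl`` and ``ctx``):
    #   output = impl.run_ContigFilter_max(ctx, {
    #       'workspace_name': 'my_ws',
    #       'assembly_ref': '123/4/5',
    #       'min_length': 100,
    #       'max_length': 1000000,
    #   })[0]
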
    def run_metaphlan2(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_metaphlan2

        # Check parameters
        logging.info(f'params {params}')
        # Check for presence of input file types in params
        input_genomes = 'input_genomes' in params and len(
            params['input_genomes']
        ) > 0 and None not in params['input_genomes']
        input_refs = 'input_ref' in params and len(
            params['input_ref']) > 0 and None not in params['input_ref']

        # for name in ['workspace_name', 'db_type']:
        #     if name not in params:
        #         raise ValueError(
        #             'Parameter "' + name + '" is required but missing')
        if not input_genomes and not input_refs:
            raise ValueError(
                'You must enter either an input genome or input reads')

        if input_refs and input_genomes:
            raise ValueError(
                'You must enter either an input genome or input reads, '
                'but not both')

        if input_genomes and (not isinstance(params['input_genomes'][0], str)):
            raise ValueError('Pass in a valid input genome string')

        if input_refs and (not isinstance(params['input_ref'], list)
                           or not len(params['input_ref'])):
            raise ValueError('Pass in a list of input references')

        # Start with the base cmd and add parameters based on user input
        cmd = [
            'metaphlan2.py', '--bowtie2db', '/data/metaphlan2/mpa_v20_m200',
            '--mpa_pkl', '/data/metaphlan2/mpa_v20_m200.pkl'
        ]

        if input_genomes:
            assembly_util = AssemblyUtil(self.callback_url)
            fasta_file_obj = assembly_util.get_assembly_as_fasta(
                {'ref': params['input_genomes'][0]})
            logging.info(fasta_file_obj)
            fasta_file = fasta_file_obj['path']

            cmd.extend(['--input_type', 'fasta', fasta_file])

        if input_refs:
            logging.info('Downloading Reads data as a Fastq file.')
            logging.info(f"Input parameters {params.items()}")
            readsUtil = ReadsUtils(self.callback_url)
            download_reads_output = readsUtil.download_reads(
                {'read_libraries': params['input_ref']})
            print(
                f"Input refs {params['input_ref']} download_reads_output {download_reads_output}"
            )
            fastq_files = []
            fastq_files_name = []
            for key, val in download_reads_output['files'].items():
                if 'fwd' in val['files'] and val['files']['fwd']:
                    fastq_files.append(val['files']['fwd'])
                    fastq_files_name.append(val['files']['fwd_name'])
                if 'rev' in val['files'] and val['files']['rev']:
                    fastq_files.append(val['files']['rev'])
                    fastq_files_name.append(val['files']['rev_name'])
            logging.info(f"fastq files {fastq_files}")
            fastq_files_string = ' '.join(fastq_files)
            cmd.extend(['--input_type', 'fastq', fastq_files_string])

        output_dir = os.path.join(self.scratch, 'metaphlan2_output')

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # insert into second-to-last position, before the input file(s)
        if params['min_alignment_len'] > 0:
            cmd.insert(-1, '--min_alignment_len')
            cmd.insert(-1, str(params['min_alignment_len']))
        if params['ignore_viruses'] == 1:
            cmd.insert(-1, '--ignore_viruses')
        if params['ignore_bacteria'] == 1:
            cmd.insert(-1, '--ignore_bacteria')
        if params['ignore_eukaryotes'] == 1:
            cmd.insert(-1, '--ignore_eukaryotes')
        if params['ignore_archaea'] == 1:
            cmd.insert(-1, '--ignore_archaea')
        cmd.insert(-1, '--stat_q')
        cmd.insert(-1, str(params['stat_q']))
        cmd.insert(-1, '--min_cu_len')
        cmd.insert(-1, str(params['min_cu_len']))

        # append output file
        cmd.extend(['--bowtie2out', os.path.join(output_dir, 'report.txt')])
        cmd00 = ["ls", '-la', '/data/metaphlan2/']
        logging.info(f'cmd00 {cmd00}')
        pls = subprocess.Popen(cmd00,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)
        logging.info(f'subprocess {pls.communicate()}')

        # run pipeline
        logging.info(f'cmd {" ".join(cmd)}')
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')

        cmd = [
            '/kb/module/lib/metaphlan2/src/accessories.sh',
            os.path.join(output_dir, 'report.txt'), output_dir, 'metaphlan2'
        ]
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        logging.info(f'subprocess {p.communicate()}')

        # get output file and convert to format for report
        # logging.info(f"params['input_ref'] {params['input_ref']}")
        report_df = pd.read_csv(os.path.join(output_dir, 'report.txt'),
                                sep='\t')
        taxa_list = [
            'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
            'species', 'strain', 'unclassified'
        ]
        abbrev_list = ['k', 'p', 'c', 'o', 'f', 'g', 's', 't', 'unclassified']

        for taxa in taxa_list:
            report_df[taxa] = None
        tax_dict = dict(zip(abbrev_list, taxa_list))

        # split dunderscores to get tax level and name
        report_df['taxonomy'] = report_df['#SampleID'].apply(
            lambda x: x.split('|')).apply(lambda x: [y.split('__') for y in x])

        for idx, row in report_df.iterrows():
            for col in row['taxonomy']:
                try:
                    report_df.loc[idx, tax_dict[col[0]]] = col[1]
                except IndexError:
                    report_df.loc[idx, tax_dict[col[0]]] = col[0]

        report_df.drop(['taxonomy', '#SampleID'], axis=1, inplace=True)

        report_html_file = os.path.join(output_dir, 'report.html')
        self._generate_report_table(report_df, report_html_file, output_dir)
        # report_df.to_html(report_html_file, classes='Metaphlan2_report',
        #                   index=False)
        html_zipped = self.package_folder(output_dir, 'report.html', 'report')

        # Step 5 - Build a Report and return
        objects_created = []
        output_files = os.listdir(output_dir)
        output_files_list = []
        for output in output_files:
            # join with output_dir so the isdir check inspects the right path
            if not os.path.isdir(os.path.join(output_dir, output)):
                output_files_list.append({
                    'path': os.path.join(output_dir, output),
                    'name': output
                })
        message = f"MetaPhlAn2 run finished."
        report_params = {
            'message': message,
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'file_links': output_files_list,
            'html_links': [html_zipped],
            'direct_html_link_index': 0,
            'html_window_height': 460
        }
        kbase_report_client = KBaseReport(self.callback_url)
        report_output = kbase_report_client.create_extended_report(
            report_params)
        report_output['report_params'] = report_params
        logging.info(report_output)
        # Return references which will allow inline display of
        # the report in the Narrative
        output = {
            'report_name': report_output['name'],
            'report_ref': report_output['ref'],
            'report_params': report_output['report_params']
        }
        #END run_metaphlan2

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_metaphlan2 return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
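# A minimal standalone sketch of the taxonomy parsing in run_metaphlan2 above:
# MetaPhlAn2 clade names look like 'k__Bacteria|p__Firmicutes', so splitting
# on '|' and then on '__' yields (rank letter, name) pairs. The sample value
# below is an assumed example.
sample_id = 'k__Bacteria|p__Firmicutes|c__Bacilli'
levels = [part.split('__') for part in sample_id.split('|')]
print(levels)  # [['k', 'Bacteria'], ['p', 'Firmicutes'], ['c', 'Bacilli']]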
Example #25
0
class VirSorterUtils:
    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.mgu = MetagenomeUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.ws = Workspace(config['workspace-url'], token=config['token'])

    def VirSorter_help(self):
        command = 'wrapper_phage_contigs_sorter_iPlant.pl --help'
        self._run_command(command)

    def get_fasta(self, ref):
        # check type of object, e.g. KBaseGenomeAnnotations.Assembly-3.0
        obj_type = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0][2]
        if 'assembly' in obj_type.lower():
            genome_ref = ref
        elif 'kbasegenomes' in obj_type.lower():
            data = self.ws.get_objects2({
                'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref'],
                    'strict_maps': 1
                }]
            })['data'][0]['data']
            genome_ref = data['assembly_ref']
        else:
            raise ValueError(
                f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or "
                f"KBaseGenomeAnnotations.Assembly required.")
        return self.au.get_assembly_as_fasta({'ref': genome_ref})['path']

    def run_VirSorter(self, params):

        params['SDK_CALLBACK_URL'] = self.callback_url
        params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']

        # Get contigs from 'assembly'
        genome_fp = self.get_fasta(params['genomes'])

        command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data'

        # Add in first args
        command += f' -f {genome_fp} --db {params["database"]}'

        # Check if additional genomes were submitted
        if params.get('add_genomes'):
            add_genomes_fp = self.get_fasta(params['add_genomes'])
            print(f'Added genomes DETECTED: {add_genomes_fp}')
            command += f' --cp {add_genomes_fp}'

        bool_args = ['virome', 'diamond', 'keep_db', 'no_c']  # keep_db maps to --keep-db

        for bool_arg in bool_args:
            # 1 means the option was selected (the JSON spec wording reads reversed)
            if params[bool_arg] == 1:
                if bool_arg == 'keep_db':
                    bool_arg = 'keep-db'
                command += f' --{bool_arg}'

        self._run_command(command)

        report = self._generate_report(
            params)  # Basically, do everything that's after the tool runs

        return report

    def _run_command(self, command):
        """
        Run a shell command, logging its output.

        :param command: shell command string to execute
        :return: None; raises RuntimeError on a non-zero exit code
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE, shell=True)
        output, err = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format(
                exitCode, output, err)
            raise RuntimeError(error_msg)

    def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id):
        columns = [
            'Contig_id',
            'Nb genes contigs',
            'Fragment',
            'Nb genes',
            'Category',
            'Nb phage hallmark genes',
            'Phage gene enrichment sig',
            'Non-Caudovirales phage gene enrichment sig',
            'Pfam depletion sig',
            'Uncharacterized enrichment sig',
            'Strand switch depletion sig',
            'Short genes enrichment sig',
        ]

        try:
            with open(virsorter_global_fp, 'r') as vir_fh:
                data = {}
                category = ''
                for line in vir_fh:
                    if line.startswith('## Contig_id'):
                        continue
                    elif line.startswith(
                            '## '
                    ):  # If 'header' lines are consumed by 1st if, then remaining should be good
                        category = line.split('## ')[-1].split(' -')[0]
                    else:
                        values = line.strip().split(',')
                        data[values[0]] = dict(zip(columns[1:], values[1:]))
        except FileNotFoundError:
            vir_path = os.path.join(os.getcwd(), 'virsorter-out')
            files = os.listdir(vir_path)
            raise RuntimeError(
                f"{virsorter_global_fp} is not a file. Existing files: {files}."
            )

        df = pd.DataFrame().from_dict(data, orient='index')
        df.index.name = columns[0]
        df.reset_index(inplace=True)

        # the stray quote in classes is a known hack to smuggle an id attribute
        # into the <table> tag that pandas generates
        html = df.to_html(index=False,
                          classes='my_class table-striped" id = "my_id')

        # written to an HTML file further below
        direct_html = html_template.substitute(
            html_table=html, affi_contigs_shock_id=affi_contigs_shock_id)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(
                ' style="text-align: right;"', '').replace(
                    'thead>', 'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = direct_html[:start_pos +
                                     8] + '\n' + new_text + direct_html[
                                         start_pos + 8:]

        return final_html

    def get_assembly_contig_ids(self, assembly_ref):
        """get contig ids from assembly_ref"""
        contigs = self.ws.get_objects2(
            {'objects': [{
                'ref': assembly_ref,
                'included': ['contigs']
            }]})['data'][0]['data']['contigs']
        return contigs.keys()

    def _generate_report(self, params):
        """
        Collect VirSorter outputs, save Assembly/BinnedContigs objects, and
        build the KBase report.

        :param params: app parameters (workspace_name, genomes, binned_contig_name, ...)
        :return: dict with report name/ref, result directory, and BinnedContigs ref
        """

        # Instantiate the DataFileUtil client
        self.dfu = dfu(params['SDK_CALLBACK_URL'])

        # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location
        virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out')

        print(
            f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}'
        )

        # Replacing individual download files with BinnedContigs

        # kb_deseq adds output files, then builds report files and sends all of them to the workspace
        output_files = []  # Appended list of dicts containing attributes

        # Collect all the files needed to report to end-user
        # Get all predicted viral sequences
        pred_fnas = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.fasta'))
        pred_gbs = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.gb'))
        # Summary 'table'
        glob_signal = os.path.join(virsorter_outdir,
                                   'VIRSorter_global-phage-signal.csv')

        print('Identified the following predicted viral sequences:\n{}'.format(
            '\n\t'.join(pred_fnas)))

        if len(pred_fnas) == 0:
            print(
                f"Unable to find predicted viral sequences; here are the directory's contents:\n"
                f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}"
            )

        if os.path.exists(glob_signal):

            print(f'Identified the global phage signal: {glob_signal}')

            lines = -1  # Don't count header
            with open(glob_signal) as fh:
                for ln in fh:
                    lines += 1

            if lines == 0:
                print('But it is EMPTY!')

        else:
            print(
                'Unable to find the global phage signal file. Was there an error during the run?'
            )

        # Append error and out files from VIRSorter
        err_fp = os.path.join(virsorter_outdir, 'logs/err')
        # if os.path.exists(err_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/err'),
        #         'name': 'VIRSorter_err',
        #         'label': 'VIRSorter_err',
        #         'description': 'VIRSorter error log file, generated from the tool itself.'
        #     })
        out_fp = os.path.join(virsorter_outdir, 'logs/out')
        # if os.path.exists(out_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/out'),
        #         'name': 'VIRSorter_out',
        #         'label': 'VIRSorter_out',
        #         'description': 'VIRSorter output log file, generated from the tool itself.'
        #     })

        if not (os.path.exists(err_fp) or os.path.exists(out_fp)):
            print(
                'Unable to find err and/or out files in LOG directory, contents:'
            )
            print(os.listdir(os.path.join(virsorter_outdir, 'logs')))

        # Make output directory
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        # Deal with nucleotide and protein fasta
        pred_fna_tgz_fp = os.path.join(output_dir,
                                       'VIRSorter_predicted_viral_fna.tar.gz')
        with tarfile.open(
                pred_fna_tgz_fp,
                'w:gz') as pred_fna_tgz_fh:  # Compress to minimize disk usage
            for pred_fna in pred_fnas:
                pred_fna_tgz_fh.add(pred_fna,
                                    arcname=os.path.basename(pred_fna))
        output_files.append({
            'path': pred_fna_tgz_fp,
            'name': os.path.basename(pred_fna_tgz_fp),
            'label': os.path.basename(pred_fna_tgz_fp),
            'description': 'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_fna_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in FASTA format: '
                f'{pred_fna_tgz_fp}')

        pred_gb_tgz_fp = os.path.join(output_dir,
                                      'VIRSorter_predicted_viral_gb.tar.gz')
        with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh:
            for pred_gb in pred_gbs:
                pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb))
        output_files.append({
            'path': pred_gb_tgz_fp,
            'name': os.path.basename(pred_gb_tgz_fp),
            'label': os.path.basename(pred_gb_tgz_fp),
            'description': 'Genbank-formatted sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_gb_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in Genbank format: '
                f'{pred_gb_tgz_fp}')

        # To create BinnedContig, need to create another directory with each of the "bins" as separate files?
        binned_contig_output_dir = os.path.join(self.scratch,
                                                str(uuid.uuid4()))
        self._mkdir_p(binned_contig_output_dir)

        # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage
        # of its features, but also to feed more easily into other tools (e.g. vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir,
            'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:

            summary_writer = csv.writer(summary_fh,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir,
                                                dest_fn)

                genome_size = 0
                gc_content = []

                # Need stats for summary file
                # Also need to adjust sequence name so binnedContig object can retrieve sequences
                adjusted_sequences = []
                with open(category_fp, 'r') as category_fh:
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)

                        # This is very dirty, but need to change name to match original contigs
                        record.id = record.id.replace('VIRSorter_',
                                                      '').replace(
                                                          '-circular',
                                                          '').split('-cat_')[0]
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                        record.id = record.id.rsplit('_', 1)[0]

                        # here we make sure that the id's line up with contig ids in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is substring of current contig id,
                                # then check if current contig id is substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                #       fail in some circumstances.
                                #       A more complete check would be to make sure there is a 1:1
                                #       mapping of contig id's in the assembly object as compared to
                                #       the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break

                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # skip empty files

                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                    print('Copying {} to results directory'.format(
                        os.path.basename(category_fp)))
                    # Yes, need both. One is to get file_links in report. Second is for binnedContigs object
                    shutil.copyfile(category_fp, dest_fp)

                    # Write renamed sequences
                    with open(binned_contig_fp, 'w') as binned_contig_fh:
                        SeqIO.write(adjusted_sequences, binned_contig_fh,
                                    'fasta')

                    result = self.au.save_assembly_from_fasta({
                        'file': {'path': dest_fp},
                        'workspace_name': params['workspace_name'],
                        'assembly_name': 'VirSorter-Category-{}'.format(category)
                    })

                    created_objects.append({
                        "ref": result,
                        "description": "KBase Assembly object from VIRSorter"
                    })

        # Create BinnedContigs object, but 1st, a little metadata
        generate_binned_contig_param = {
            'file_directory': binned_contig_output_dir,
            'assembly_ref': params['genomes'],  # params.get('genomes'), self.assembly_ref
            'binned_contig_name': params['binned_contig_name'],
            'workspace_name': params['workspace_name']
        }
        binned_contig_object_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        # Add binned contigs reference here, as it was already created above
        created_objects.append({
            "ref": binned_contig_object_ref,
            "description": "BinnedContigs from VIRSorter"
        })

        # Save VIRSorter_affi-contigs.tab for DRAM-v
        affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files',
                                       'VIRSorter_affi-contigs.tab')
        affi_contigs_shock_id = self.dfu.file_to_shock(
            {'file_path': affi_contigs_fp})['shock_id']

        # Use global signal (i.e. summary) file and create HTML-formatted version
        raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id)

        html_fp = os.path.join(output_dir, 'index.html')

        with open(html_fp, 'w') as html_fh:
            html_fh.write(raw_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(html_fp),
            'label': os.path.basename(html_fp),
            'description': 'HTML summary report for VIRSorter-predicted viral genomes.'
        }]

        report_params = {
            'message': ('Here are the results from your VIRSorter run. Above, you\'ll find a '
                        'report with all the identified (putative) viral genomes, and below, '
                        'links to the report as well as files generated.'),
            'workspace_name': params['workspace_name'],
            'html_links': html_report,
            'direct_html_link_index': 0,
            'report_object_name': 'VIRSorter_report_{}'.format(str(uuid.uuid4())),
            'file_links': output_files,
            'objects_created': created_objects,
        }

        kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'],
                                          token=params['KB_AUTH_TOKEN'])
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref'],
            'result_directory': binned_contig_output_dir,
            'binned_contig_obj_ref': binned_contig_object_ref
        }

        return report_output

    def _mkdir_p(self, path):
        """
        Create a directory if it does not already exist (like mkdir -p).

        :param path: directory path to create
        """

        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
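# A minimal standalone sketch of the pyparsing technique used in
# _parse_summary above: scan for the <thead> block and rewrite it as a
# <tfoot>, so both ends of a long table carry column labels. The tiny HTML
# string is an assumed example.
from pyparsing import Literal, SkipTo

html = '<table><thead><tr><th>Contig_id</th></tr></thead><tbody></tbody></table>'
header = Literal('<thead>') + SkipTo(Literal('</thead>'))
footer = ''
for tokens, start, end in header.scanString(html):
    footer = ''.join(tokens).replace('thead>', 'tfoot>') + '\n</tfoot>'
print(footer)  # <tfoot><tr><th>Contig_id</th></tr> ... </tfoot>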
Example #26
0
class vConTACTUtils:

    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.ws = Workspace(config['workspace-url'], token=self.token)
        self.genome_api = GenomeAnnotationAPI(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def vcontact_help(self):
        command = "vcontact --help"
        self._run_command(command)

    def execute(self, command: list):
        """
        :param command: Command suitable for running in subprocess, must use a ['ls', '-l'] format
        :return: Response from command
        """
        # logger.info('Running command: {}'.format(command))
        print('Running command: {}'.format(' '.join(command)))
        res = subprocess.run(command, shell=False, encoding='utf-8', check=True)

        return res

    def run_vcontact(self, params):

        # Determine KBase "inputs" for vConTACT2
        genome = params['genome']

        obj_type = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][2]

        if 'assembly' in obj_type.lower():  # If KBaseGenomeAnnotations.Assembly

            # Assembly requires annotation
            genome_fp = self.au.get_assembly_as_fasta({'ref': genome})['path']
            proteins_fp = os.path.join(self.scratch, 'proteins.faa')
            proteins_gbk = os.path.join(self.scratch, 'proteins.gbk')
            gene2genome_fp = os.path.join(self.scratch, 'gene2genome.csv')

            prodigal_cmd = ['prodigal', '-a', proteins_fp, '-o', proteins_gbk, '-f', 'gbk',
                            '-i', genome_fp, '-p', 'meta']
            res = self.execute(prodigal_cmd)

            records = {}
            with open(proteins_fp, 'r') as proteins_fh:
                for record in SeqIO.parse(proteins_fh, 'fasta'):

                    records[len(records)] = {
                        'protein_id': record.id,
                        'contig_id': record.id.rsplit('_', 1)[0],
                        'keywords': 'None'
                    }

            g2g_df = pd.DataFrame.from_dict(records, orient='index')
            g2g_df.to_csv(gene2genome_fp, index=False)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = proteins_fp

        elif 'kbasegenomes' in obj_type.lower(): # If KBaseGenomes.Genome
            genome_data = self.genome_api.get_genome_v1({"genomes": [{"ref": genome}]})

            # Convert genome data into "reasonable" parse form and write to scratch filesystem
            gene2genome, sequences = self.genome_to_inputs(genome_data)
            gene2genome_fp, sequences_fp = self.write_inputs(gene2genome, sequences)

            # Pass filepaths to the app and run
            params['gene2genome'] = gene2genome_fp
            params['sequences'] = sequences_fp

        elif 'binnedcontigs' in obj_type.lower():  # If KBaseMetagenomes.BinnedContigs
            raise NotImplementedError(
                "KBaseMetagenomes.BinnedContigs hasn't been enabled. Check back later.")
        else:
            raise ValueError(f'Unable to identify object type: {obj_type}')

        print('Available database files')
        print(os.listdir('/miniconda/lib/python3.7/site-packages/vcontact2/data/'))

        # Just iterate through all parameters
        mappings = {
            'gene2genome': '--proteins-fp',
            'sequences': '--raw-proteins',
            'db': '--db',
            'pcs_mode': '--pcs-mode',
            'vcs_mode': '--vcs-mode',
            'blast_evalue': '--blast-evalue',
            'pc_max_overlap': '--max-overlap',
            'pc_penalty': '--penalty',
            'pc_haircut': '--haircut',
            'pc_inflation': '--pc-inflation',
            'vc_inflation': '--vc-inflation',
            'vc_density': '--min-density',
            'vc_min_size': '--min-size',
            'vc_max_overlap': '--vc-overlap',
            'vc_penalty': '--vc-penalty',
            'vc_haircut': '--vc-haircut',
            'merge_method': '--merge-method',
            'similarity': '--similarity',
            'seed_method': '--seed-method',
            'min_significance': '--sig',
            'max_significance': '--max-sig',
            'module_inflation': '--mod-inflation',
            'mod_significance': '--mod-sig',
            'module_min_shared': '--mod-shared-min',
            'link_significance': '--link-sig',
            'link_proportion': '--link-prop'
        }

        bool_args = ['optimize', 'permissive']

        # Should create build_command?
        command = 'vcontact2 --output-dir outdir'
        # Binaries
        command += ' --diamond-bin /usr/local/bin/diamond --c1-bin /usr/local/bin/cluster_one-1.0.jar'

        for param, cmd in mappings.items():
            command += ' {} {}'.format(cmd, params[param])

        self._run_command(command)

        report = self._generate_report(params)

        return report

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def genome_to_inputs(self, genome):
        """
        genome_to_inputs: convert genome annotation data (~json) to file inputs required by vConTACT
        :param genome:
        :return:
        """

        records = []
        gene2genome = OrderedDict()

        genome_data = genome['genomes'][0]

        for item in genome_data['data']['features']:
            if 'id' not in item:
                print('This feature does not have a valid id')
                continue
            elif 'dna_sequence' not in item or 'protein_translation' not in item:
                print('This feature {} does not have a valid DNA sequence.'.format(item['id']))
                continue
            else:
                # Create FASTA file
                if item['type'] == 'gene':
                    desc = (item['functions'] if item.get('functions', None)
                                              else item.get('function', ''))
                    gene_record = SeqRecord(Seq(item['protein_translation']), id=item['id'],
                                            description=desc)
                    records.append(gene_record)

                    # Build gene2genome
                    gene2genome.update({
                        item['id']: {
                            # 'contig_id': genome_data['data']['contig_ids'][0],
                            'contig_id': item['location'][0][0],
                            'protein_id': item['id'],
                            'keywords': item.get('function', '')
                        }
                    })

        return gene2genome, records

    def write_inputs(self, mapping, sequences):

        fasta_for_proteins_fp = os.path.join(self.scratch, 'vConTACT_proteins.fasta')
        with open(fasta_for_proteins_fp, 'w') as fasta_for_proteins_fh:
            SeqIO.write(sequences, fasta_for_proteins_fh, 'fasta')

        genes_to_genomes_mapping_fp = os.path.join(self.scratch, 'vConTACT_gene2genome.csv')
        with open(genes_to_genomes_mapping_fp, 'w') as genes_to_genomes_mapping_fh:
            fields = ['contig_id', 'protein_id', 'keywords']
            writer = csv.DictWriter(genes_to_genomes_mapping_fh, fieldnames=fields)
            writer.writeheader()

            for gene in mapping.keys():
                writer.writerow(mapping[gene])

        return genes_to_genomes_mapping_fp, fasta_for_proteins_fp

    def _generate_report(self, params):
        """
        _generate_report: generate summary report

        This will contain ALL the logic to generate the report, including areas that should/will be re-factored later

        """

        # Instantiate the DataFileUtil client
        self.dfu = dfu(self.callback_url)

        # Get filepath of summary file
        summary_fp = os.path.join(os.getcwd(), 'outdir', 'genome_by_genome_overview.csv')

        summary_df = pd.read_csv(summary_fp, header=0, index_col=0)
        # the stray quote in classes is a known hack to smuggle an id attribute
        # into the <table> tag that pandas generates
        html = summary_df.to_html(index=False, classes='my_class table-striped" id = "my_id')

        # written to an HTML file further below
        direct_html = html_template.substitute(html_table=html)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(' style="text-align: right;"', '').replace('thead>',
                                                                                        'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = direct_html[:start_pos + 8] + '\n' + new_text + direct_html[start_pos + 8:]

        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)
        result_fp = os.path.join(output_dir, 'index.html')

        with open(result_fp, 'w') as result_fh:
            result_fh.write(final_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id': report_shock_id,
            'name': os.path.basename(result_fp),
            'label': os.path.basename(result_fp),
            'description': 'HTML summary report for vConTACT2'
        }]

        report_params = {'message': 'Basic message to show in the report',
                         'workspace_name': params['workspace_name'],
                         'html_links': html_report,
                         'direct_html_link_index': 0,
                         'report_object_name': 'vConTACT_report_{}'.format(str(uuid.uuid4())),
                         # Don't use until have files to attach to report
                         # 'file_links': [{}],
                         # Don't use until data objects that are created as result of running app
                         # 'objects_created': [{'ref': matrix_obj_ref,
                         #                      'description': 'Imported Matrix'}],
                         }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        # https://stackoverflow.com/a/600612/643675
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
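# A minimal standalone sketch of the gene2genome mapping built in run_vcontact
# above: Prodigal names each predicted protein '<contig>_<gene index>', so
# rsplit('_', 1) recovers the parent contig id. The protein id below is an
# assumed example value.
protein_id = 'NODE_1_length_5000_7'
row = {
    'protein_id': protein_id,
    'contig_id': protein_id.rsplit('_', 1)[0],  # -> 'NODE_1_length_5000'
    'keywords': 'None'
}
print(row)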
Example #27
0
    def stage_input(self, input_ref, fasta_file_extension):
        '''
        Stage input based on an input data reference for CheckM

        input_ref can be a reference to an Assembly, BinnedContigs, or (not yet implemented) a Genome

        This method creates a directory in the scratch area with the set of Fasta files, names
        will have the fasta_file_extension parameter tacked on.

            ex:

            staged_input = stage_input('124/15/1', 'fna')

            staged_input
            {"input_dir": '...'}
        '''
        # config
        #SERVICE_VER = 'dev'
        SERVICE_VER = 'release'
        [OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
         WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I] = range(11)  # object_info tuple
        ws = Workspace(self.ws_url)

        # 1) generate a folder in scratch to hold the input
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'bins_' + suffix)
        all_seq_fasta = os.path.join(self.scratch, 'all_sequences_' + suffix + '.' + fasta_file_extension)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)


        # 2) based on type, download the files
        obj_name = self.get_data_obj_name (input_ref)
        type_name = self.get_data_obj_type (input_ref)

        # auClient
        try:
            auClient = AssemblyUtil(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate auClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))

        # setAPI_Client
        try:
            #setAPI_Client = SetAPI (url=self.callbackURL, token=self.ctx['token'])  # for SDK local.  local doesn't work for SetAPI
            setAPI_Client = SetAPI (url=self.serviceWizardURL, token=self.ctx['token'])  # for dynamic service
        except Exception as e:
            raise ValueError('Unable to instantiate setAPI_Client with serviceWizardURL: '+ self.serviceWizardURL +' ERROR: ' + str(e))

        # mguClient
        try:
            mguClient = MetagenomeUtils(self.callbackURL, token=self.ctx['token'], service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError('Unable to instantiate mguClient with callbackURL: '+ self.callbackURL +' ERROR: ' + str(e))


        # Standard Single Assembly
        #
        if type_name in ['KBaseGenomeAnnotations.Assembly', 'KBaseGenomes.ContigSet']:
            # create file data
            filename = os.path.join(input_dir, obj_name + '.' + fasta_file_extension)
            auClient.get_assembly_as_fasta({'ref': input_ref, 'filename': filename})
            if not os.path.isfile(filename):
                raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
            # make sure fasta file isn't empty
            min_fasta_len = 1
            if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # AssemblySet
        #
        elif type_name == 'KBaseSets.AssemblySet':

            # read assemblySet
            try:
                assemblySet_obj = setAPI_Client.get_assembly_set_v1 ({'ref':input_ref, 'include_item_info':1})
            except Exception as e:
                raise ValueError('Unable to get object from workspace: (' + input_ref +')' + str(e))
            assembly_refs = []
            assembly_names = []
            for assembly_item in assemblySet_obj['data']['items']:
                this_assembly_ref = assembly_item['ref']
                # assembly obj info
                try:
                    this_assembly_info = ws.get_object_info_new ({'objects':[{'ref':this_assembly_ref}]})[0]
                    this_assembly_name = this_assembly_info[NAME_I]
                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' + this_assembly_ref +'): ' + str(e))
                assembly_refs.append(this_assembly_ref)
                assembly_names.append(this_assembly_name)

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(assembly_refs):
                this_name = assembly_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Binned Contigs
        #
        elif type_name == 'KBaseMetagenomes.BinnedContigs':

            # download the bins as fasta and set the input folder name
            bin_file_dir = mguClient.binned_contigs_to_file({'input_ref': input_ref, 'save_to_shock': 0})['bin_file_directory']
            os.rename(bin_file_dir, input_dir)
            self.set_fasta_file_extensions(input_dir, fasta_file_extension)
            # make sure each fasta file isn't empty
            for (dirpath, dirnames, filenames) in os.walk(input_dir):
                for fasta_file in filenames:
                    fasta_path = os.path.join (input_dir,fasta_file)
                    min_fasta_len = 1
                    if not self.fasta_seq_len_at_least(fasta_path, min_fasta_len):
                        raise ValueError('Binned Assembly is empty for fasta_path: '+str(fasta_path))
                break

        # Genome and GenomeSet
        #
        elif type_name == 'KBaseGenomes.Genome' or type_name == 'KBaseSearch.GenomeSet':
            genome_obj_names = []
            genome_sci_names = []
            genome_assembly_refs = []

            if type_name == 'KBaseGenomes.Genome':
                genomeSet_refs = [input_ref]
            else:  # get genomeSet_refs from GenomeSet object
                genomeSet_refs = []
                try:
                    genomeSet_object = ws.get_objects2({'objects':[{'ref':input_ref}]})['data'][0]['data']
                except Exception as e:
                    raise ValueError('Unable to fetch '+str(input_ref)+' object from workspace: ' + str(e))
                    #to get the full stack trace: traceback.format_exc()

                # iterate through genomeSet members
                for genome_id in genomeSet_object['elements'].keys():
                    if 'ref' not in genomeSet_object['elements'][genome_id] or \
                       genomeSet_object['elements'][genome_id]['ref'] == None or \
                       genomeSet_object['elements'][genome_id]['ref'] == '':
                        raise ValueError('genome_ref not found for genome_id: '+str(genome_id)+' in genomeSet: '+str(input_ref))
                    else:
                        genomeSet_refs.append(genomeSet_object['elements'][genome_id]['ref'])

            # genome obj data
            for i,this_input_ref in enumerate(genomeSet_refs):
                try:
                    objects = ws.get_objects2({'objects':[{'ref':this_input_ref}]})['data']
                    genome_obj = objects[0]['data']
                    genome_obj_info = objects[0]['info']
                    genome_obj_names.append(genome_obj_info[NAME_I])
                    genome_sci_names.append(genome_obj['scientific_name'])
                except Exception as e:
                    raise ValueError ("unable to fetch genome: "+this_input_ref+": "+str(e))

                # Get genome_assembly_ref
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    raise ValueError (msg)
                    continue
                elif 'assembly_ref' in genome_obj and genome_obj['assembly_ref'] != None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING assembly_ref: "+str(genome_obj['assembly_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj['contigset_ref'] != None:
                    msg = "Genome "+genome_obj_names[i]+" (ref:"+input_ref+") "+genome_sci_names[i]+" USING contigset_ref: "+str(genome_obj['contigset_ref'])
                    print (msg)
                    genome_assembly_refs.append(genome_obj['contigset_ref'])

            # create file data (name for file is what's reported in results)
            for ass_i,assembly_ref in enumerate(genome_assembly_refs):
                this_name = genome_obj_names[ass_i]
                filename = os.path.join(input_dir, this_name + '.' + fasta_file_extension)
                auClient.get_assembly_as_fasta({'ref': assembly_ref, 'filename': filename})
                if not os.path.isfile(filename):
                    raise ValueError('Error generating fasta file from an Assembly or ContigSet with AssemblyUtil')
                # make sure fasta file isn't empty
                min_fasta_len = 1
                if not self.fasta_seq_len_at_least(filename, min_fasta_len):
                    raise ValueError('Assembly or ContigSet is empty in filename: '+str(filename))

        # Unknown type slipped through
        #
        else:
            raise ValueError('Cannot stage fasta file input directory from type: ' + type_name)


        # create summary fasta file with all bins
        self.cat_fasta_files(input_dir, fasta_file_extension, all_seq_fasta)

        return {'input_dir': input_dir, 'folder_suffix': suffix, 'all_seq_fasta': all_seq_fasta}
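# stage_input above relies on helpers such as fasta_seq_len_at_least that are
# not shown in this example; a plausible minimal sketch (an assumption, not
# the original code) is given below as a standalone function, though the app
# calls it as a method.
def fasta_seq_len_at_least(fasta_path, min_fasta_len=1):
    """Return True once the FASTA file holds at least min_fasta_len residues."""
    seen = 0
    with open(fasta_path) as fasta_fh:
        for line in fasta_fh:
            if line.startswith('>'):
                continue
            seen += len(line.strip())
            if seen >= min_fasta_len:
                return True
    return False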
Example #28
0
    def download_long(self, console, warnings, token, wsname, lib,
                      min_long_read_length):
        try:
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            obj_id = {'ref': lib if '/' in lib else (wsname + '/' + lib)}
            lib_obj_info = wsClient.get_object_info_new({'objects':
                                                         [obj_id]})[0]
            lib_obj_type = lib_obj_info[TYPE_I]
            lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                  lib_obj_type)  # remove trailing version
            lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
            if lib_obj_type == 'KBaseGenomes.ContigSet' or lib_obj_type == 'KBaseGenomeAnnotations.Assembly':
                # download using assembly util / data file util
                self.log(console,
                         "Getting long reads (from contigs object).\n")
                auClient = AssemblyUtil(url=self.callbackURL, token=token)
                dfuClient = DataFileUtil(url=self.callbackURL, token=token)
                contigFile = auClient.get_assembly_as_fasta({
                    'ref': lib_ref
                }).get('path')
                long_reads_path = dfuClient.unpack_file(
                    {'file_path': contigFile})['file_path']
                self.log(
                    warnings,
                    "Warning:  Long reads are in FASTA format, so short read check was not performed."
                )

            else:
                ruClient = ReadsUtils(url=self.callbackURL, token=token)
                self.log(console,
                         "Getting long reads (from reads library object).\n")
                result = ruClient.download_reads({
                    'read_libraries': [lib_ref],
                    'interleaved': 'false'
                })
                long_reads_path = result['files'][lib_ref]['files']['fwd']
                [n_reads, n_reads_short
                 ] = self.filter_short_fastq(console, long_reads_path,
                                             min_long_read_length)
                if (n_reads_short > 0):
                    self.log(
                        warnings, "Warning:  Of " + str(n_reads) +
                        " long reads, " + str(n_reads_short) +
                        " are shorter than " + str(min_long_read_length) +
                        "; consider using the filtlong app to filter out shorter reads."
                    )

        except Exception as e:
            raise ValueError('Unable to download long reads\n' + str(e))
        return long_reads_path
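
    # filter_short_fastq is called above but not shown in this example; a
    # plausible minimal sketch (an assumption, not the original code) follows.
    # 'console' is accepted only to mirror the caller's signature.
    def filter_short_fastq(self, console, fastq_path, min_len):
        """Count total reads and reads shorter than min_len (4-line FASTQ)."""
        n_reads = 0
        n_reads_short = 0
        with open(fastq_path) as fastq_fh:
            for i, line in enumerate(fastq_fh):
                if i % 4 == 1:  # the sequence line of each FASTQ record
                    n_reads += 1
                    if len(line.strip()) < min_len:
                        n_reads_short += 1
        return [n_reads, n_reads_short]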

    def run_rmrContigFilter(self, ctx, params):
        """
        Example app which filters contigs in an assembly using a minimum contig length
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_rmrContigFilter

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_rmrContigFilter function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {
                'path': filtered_fasta_file
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            fasta_file['assembly_name']
        })

        # Step 4b - Build html report
        # create html string
        # write string to file to self.shared_folder
        # upload to shock
        # send to report

        html_header = "<!DOCTYPE html><html><head><meta charset=\"UTF-8\"><title>title</title></head><body><table>"
        html_footer = "</table></body></html>"

        tableentries = "<tr><th>ID</th><th>A %</th><th>C %</th><th>T %</th><th>G %</th></tr>"
        for contig in good_contigs:
            Acount = contig.seq.upper().count('A')
            Ccount = contig.seq.upper().count('C')
            Tcount = contig.seq.upper().count('T')
            Gcount = contig.seq.upper().count('G')
            total = Acount + Ccount + Tcount + Gcount
            if total == 0:  # guard: skip contigs with no unambiguous bases
                continue

            Aper = 100 * (Acount / total)
            Cper = 100 * (Ccount / total)
            Gper = 100 * (Gcount / total)
            Tper = 100 * (Tcount / total)

            tmprow = "<tr><td>" + contig.id + "</td><td>" + str(round(
                Aper,
                2)) + "</td><td>" + str(round(Cper, 2)) + "</td><td>" + str(
                    round(Tper, 2)) + "</td><td>" + str(round(
                        Gper, 2)) + "</td></tr>"

            tableentries += tmprow

        # Create the html string
        html_str = html_header + tableentries + html_footer

        # Write the html string to a file in the shared folder
        html_file_dir = os.path.join(self.shared_folder, 'html')
        if not os.path.isdir(html_file_dir):
            os.mkdir(html_file_dir)
        html_file_path = os.path.join(html_file_dir, 'output_table.html')
        with open(html_file_path, "w") as html_file:
            html_file.write(html_str)
        """
        Will try to not use shock first
        # Upload the html file to shock
        dfu = DataFileUtil(self.callback_url)

        try:
            shock_html_upload = dfu.file_to_shock({'file_path': html_file_dir, 'make_handle': 0, 'pack':'zip'})
        except:
            raise ValueError('Unable to upload html file to shock with DataFileUtil')
        """

        # Step 5 - Build a Report and return
        """
        Old Report .create method:
        https://github.com/kbaseapps/KBaseReportPy/blob/master/lib/KBaseReportPy/KBaseReportPyImpl.py

        reportObj = {
            'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})
        """

        # New report .create_extended_report
        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'message':
            'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
            str(n_total),
            'direct_html':
            None,
            'direct_html_link_index':
            0,
            'file_links': [],
            #'html_links': [{'shock-id': shock_html_upload['shock_id'], 'name': 'output-table.html', 'label': 'contig table'}],
            'html_links': [{
                'path': html_file_dir,
                'name': 'output_table.html',
                'description': 'HTML report for contig filtering'
            }],
            'workspace_name':
            params['workspace_name'],
        }

        report = KBaseReport(self.callback_url)
        report_info = report.create_extended_report(reportObj)

        # STEP 6: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        logging.info('returning:' + pformat(output))

        #END run_rmrContigFilter

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_rmrContigFilter return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def run_rmrContigFilter_max(self, ctx, params):
        """
        New app which filters contigs in an assembly using both a minimum and a maximum contig length
        :param params: instance of type "rmrContigFiltermaxinput" ->
           structure: parameter "output_workspace" of String, parameter
           "assembly_input_ref" of type "data_obj_ref", parameter
           "output_assembly_name" of String, parameter "min_length" of Long,
           parameter "max_length" of Long, parameter "report_ref" of String,
           parameter "report_name" of String
        :returns: instance of type "ReportResultsmax" -> structure: parameter
           "objNameOrId" of type "assembly_ref", parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_rmrContigFilter_max

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_rmrContigFilter_max function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'output_workspace' not in params:
            raise ValueError(
                'Parameter output_workspace is not set in input arguments')
        workspace_name = params['output_workspace']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except (ValueError, TypeError):
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')
        if 'max_length' not in params:
            raise ValueError(
                'Parameter max_length is not set in input arguments')
        max_length_orig = params['max_length']
        max_length = None
        try:
            max_length = int(max_length_orig)
        except (ValueError, TypeError):
            raise ValueError(
                'Cannot parse integer from max_length parameter (' +
                str(max_length_orig) + ')')
        if max_length < 0:
            raise ValueError('max_length parameter cannot be negative (' +
                             str(max_length) + ')')
        if min_length >= max_length:
            raise ValueError(
                'min_length (' + str(min_length) +
                ') must be less than max_length (' + str(max_length) + ')')
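
        # Hedged sketch, not used by this app: a hypothetical local helper
        # capturing the parse/validate pattern repeated above (the name
        # _require_non_negative_int and its shape are illustrative only).
        def _require_non_negative_int(parms, key):
            if key not in parms:
                raise ValueError('Parameter ' + key +
                                 ' is not set in input arguments')
            try:
                value = int(parms[key])
            except (ValueError, TypeError):
                raise ValueError('Cannot parse integer from ' + key +
                                 ' parameter (' + str(parms[key]) + ')')
            if value < 0:
                raise ValueError(key + ' parameter cannot be negative (' +
                                 str(value) + ')')
            return value
        # e.g. min_length = _require_non_negative_int(params, 'min_length')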

        # Step 2 - Download the input data as a FASTA file.
        # The AssemblyUtil module downloads a FASTA file from our Assembly
        # data object; the return value gives us the path to the file that
        # was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Perform the filter operation, saving the good contigs to a new FASTA file.
        # BioPython's SeqIO handles both parsing the input FASTA and writing the output.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if min_length <= len(record.seq) <= max_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        # save_assembly_from_fasta returns the reference to the new Assembly object
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': params['output_assembly_name']
        })

        # Step 5 - Build a Report and return
        report = KBaseReport(self.callback_url)

        # The SDK tutorial's plain-text report used report.create(); here we
        # call report.create_extended_report() instead, so that the new
        # assembly is listed as a created object (and rendered in the
        # Narrative's assembly viewer) alongside the report message.
        report_info = report.create_extended_report({
            'message': 'Filtered Assembly to ' + str(n_remaining) +
                       ' contigs out of ' + str(n_total),
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'workspace_name': params['output_workspace']
        })

        # Step 6 - Construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'objNameOrId': params['output_assembly_name'],
            'wsNameOrId': params['output_workspace']
        }

        logging.info('Returning: ' + pformat(output))

        #END run_rmrContigFilter_max

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_rmrContigFilter_max return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
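
    # Hedged usage sketch (hypothetical): following the kb-sdk test-template
    # conventions, where `impl` is an instance of this Impl class, `ctx` is a
    # context dict, and the refs/names below are placeholders:
    #
    #   output = impl.run_rmrContigFilter_max(ctx, {
    #       'output_workspace': 'my_workspace',
    #       'assembly_input_ref': '123/4/5',
    #       'output_assembly_name': 'filtered.Assembly',
    #       'min_length': 1000,
    #       'max_length': 100000
    #   })[0]
    #
    # output['report_ref'] then identifies the extended report created above.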