コード例 #1
0
    def import_file(self, ctx, params):
        """
        Validate the input params, upload a Genome object built from the
        given FASTA + GFF files, and create a KBase report for it.

        Returns a dict with 'genome_ref' (ws/obj/ver reference string),
        'report_name' and 'report_ref'.
        """
        # Parameter validation is delegated to the helper; it raises on bad input.
        self.validate_params(params)

        # Fill in any optional parameter the caller omitted with its default.
        defaults = {
            'taxon_wsname': 'ReferenceTaxons',
            'scientific_name': "unknown_taxon",
            'taxon_reference': None,
            'source': 'User',
            'release': None,
            'type': 'User upload'
        }
        for key, value in defaults.items():
            params.setdefault(key, value)

        # Perform the actual upload.
        result = upload_genome(
            shock_service_url=self.cfg['shock-url'],
            handle_service_url=self.cfg['handle-service-url'],
            workspace_service_url=self.cfg['workspace-url'],
            callback_url=os.environ['SDK_CALLBACK_URL'],
            input_fasta_file=params["fasta_file"],
            input_gff_file=params["gff_file"],
            workspace_name=params['workspace_name'],
            core_genome_name=params['genome_name'],
            scientific_name=params['scientific_name'],
            taxon_wsname=params['taxon_wsname'],
            taxon_reference=params['taxon_reference'],
            source=params['source'],
            release=params['release'],
            genome_type=params['type'])

        # Create a report that points at the newly created Genome object.
        output_data_ref = "{}/{}".format(params['workspace_name'],
                                         params['genome_name'])
        report_client = KBaseReport(os.environ['SDK_CALLBACK_URL'])
        report_info = report_client.create({
            'report': {
                'objects_created': [{'ref': output_data_ref,
                                     'description': 'KBase Genome object'}],
                'text_message': result['report_string'],
            },
            'workspace_name': params['workspace_name'],
        })

        # Assemble the genome reference (wsid/objid/version) and report handles.
        info = result['genome_info']
        genome_ref = '/'.join([str(info[6]), str(info[0]), str(info[4])])
        return {
            'genome_ref': genome_ref,
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
        }
コード例 #2
0
    def filter_contigs(self, ctx, params):
        """
        Filter an Assembly's contigs by a minimum length.

        :param params: dict ("FilterContigsParams") with keys:
            assembly_input_ref - reference string to the source Assembly object
            workspace_name - workspace in which to save the filtered Assembly
            min_length - minimum contig length to keep (non-negative integer)
        :returns: single-element list holding a dict ("FilterContigsResults")
            with keys report_name, report_ref, assembly_output (reference to
            the filtered Assembly), n_initial_contigs, n_contigs_removed,
            n_contigs_remaining
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN filter_contigs

        # Anything printed to stdout/stderr shows up in the App log.
        print('Starting Filter Contigs function. Params=')
        pprint(params)

        # --- Step 1: validate parameters -----------------------------------
        # Advanced users and other SDK modules can call this method directly,
        # bypassing Narrative-side validation, so never trust the input.
        print('Validating parameters.')
        for required in ('workspace_name', 'assembly_input_ref', 'min_length'):
            if required not in params:
                raise ValueError(
                    'Parameter ' + required + ' is not set in input arguments')
        workspace_name = params['workspace_name']
        assembly_input_ref = params['assembly_input_ref']
        min_length_orig = params['min_length']
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # --- Step 2: download the Assembly as a FASTA file -----------------
        # AssemblyUtil returns the path of the file it wrote.
        print('Downloading Assembly data as a Fasta file.')
        assembly_util = AssemblyUtil(self.callback_url)
        fasta_file = assembly_util.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # --- Step 3: keep only contigs of at least min_length --------------
        kept_records = []
        n_total = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                kept_records.append(record)
        n_remaining = len(kept_records)

        print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' +
              str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(kept_records, filtered_fasta_file, 'fasta')

        # --- Step 4: save the filtered Assembly back to the workspace ------
        print('Uploading filtered Assembly data.')
        new_assembly = assembly_util.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name'],
        })

        # --- Step 5: build the report --------------------------------------
        report_obj = {
            'objects_created': [{'ref': new_assembly,
                                 'description': 'Filtered contigs'}],
            'text_message': ('Filtered Assembly to ' + str(n_remaining) +
                             ' contigs out of ' + str(n_total)),
        }
        report_info = KBaseReport(self.callback_url).create({
            'report': report_obj,
            'workspace_name': params['workspace_name'],
        })

        # --- Step 6: construct the structured output -----------------------
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining,
        }
        print('returning:' + pformat(output))

        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
コード例 #3
0
    def filter_contigs(self, ctx, params):
        """
        Filter an Assembly's contigs by a minimum length.

        :param params: dict with keys:
            workspace_name - workspace in which to save the filtered Assembly
            assembly_input_ref - reference to the source Assembly object
            min_length - minimum contig length to keep (non-negative integer)
        :returns: single-element list holding a dict with keys report_name,
            report_ref, assembly_output, n_initial_contigs, n_contigs_removed,
            n_contigs_remaining
        :raises ValueError: if a required parameter is missing or min_length
            is not a non-negative integer
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs

        # Print statements to stdout/stderr are captured and available as the App log
        print('Starting Filter Contigs function. Params=')
        pprint(params)

        # Step 1 - Parse/examine the parameters and catch any errors.
        # Advanced users or other SDK modules can call this function directly,
        # bypassing Narrative-side validation, so everything is checked here.
        print('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError('Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError('Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError('Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError('Cannot parse integer from min_length parameter (' + str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' + str(min_length) + ')')

        # Step 2 - Download the input Assembly as a FASTA file.
        # AssemblyUtil returns the path of the file it created.
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

        # Step 3 - Keep only the contigs that meet the length threshold.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        print('Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the filtered Assembly back to the workspace.
        print('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({'file': {'path': filtered_fasta_file},
                                                              'workspace_name': workspace_name,
                                                              'assembly_name': fasta_file['assembly_name']
                                                              })

        # Step 5 - Build a report describing what was created.
        reportObj = {
            'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
            'text_message': 'Filtered Assembly to ' + str(n_remaining) + ' contigs out of ' + str(n_total)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})

        # Step 6 - Construct the output to send back.
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref'],
                  'assembly_output': new_assembly,
                  'n_initial_contigs': n_total,
                  'n_contigs_removed': n_total - n_remaining,
                  'n_contigs_remaining': n_remaining
                  }
        print('returning:' + pformat(output))

        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
コード例 #4
0
    def run_megahit(self, ctx, params):
        """
        Run the MEGAHIT assembler via exec_megahit() and report the results.

        :param params: instance of type "MegaHitParams" (run_megahit() ** ** 
           @optional megahit_parameter_preset **     @optional
           min_contig_len) -> structure: parameter "workspace_name" of
           String, parameter "input_reads_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "combined_assembly_flag" of Long, parameter
           "megahit_parameter_preset" of String, parameter "min_contig_len"
           of Long, parameter "kmer_params" of type "Kmer_Params" (Kmer
           Params **     @optional min_count **     @optional k_min **    
           @optional k_max **     @optional k_step **     @optional k_list)
           -> structure: parameter "min_count" of Long, parameter "k_min" of
           Long, parameter "k_max" of Long, parameter "k_step" of Long,
           parameter "k_list" of list of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        :raises ValueError: if a required parameter is missing or None
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        console = []
        self.log(console, 'Running run_megahit() with params=')
        self.log(console, "\n" + pformat(params))

        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'

        ### STEP 1: basic parameter checks + parsing
        required_params = [
            'workspace_name', 'input_reads_ref', 'output_contigset_name'
        ]
        for required_param in required_params:
            # 'is None' rather than '== None': identity is the PEP 8-mandated
            # check for None and cannot be fooled by an overridden __eq__.
            if required_param not in params or params[required_param] is None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 2: call exec_megahit() - input params are the same, so just pass through
        exec_megahit_output = self.exec_megahit(ctx, params)[0]

        ### STEP 3: save the report
        reportObj = {
            'objects_created': [],
            'text_message': exec_megahit_output['report_text']
        }
        for obj_ref in exec_megahit_output['output_contigset_refs']:
            reportObj['objects_created'].append({
                'ref': obj_ref,
                'description': 'Assembled contigs'
            })

        reportClient = KBaseReport(self.callbackURL,
                                   token=ctx['token'],
                                   service_ver=SERVICE_VER)
        report_info = reportClient.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        ### STEP 4: construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
コード例 #5
0
    def remove_adapters(self, ctx, params):
        """
        Run adapter trimming via exec_remove_adapters() and create a report.

        :param params: instance of type "RemoveAdaptersParams" -> structure:
           parameter "output_workspace" of String, parameter
           "output_object_name" of String, parameter "input_reads" of type
           "ws_ref" (@ref ws), parameter "five_prime" of type
           "FivePrimeOptions" (unfortunately, we have to name the fields
           uniquely between 3' and 5' options due to the current
           implementation of grouped parameters) -> structure: parameter
           "adapter_sequence_5P" of String, parameter "anchored_5P" of type
           "boolean" (@range (0, 1)), parameter "three_prime" of type
           "ThreePrimeOptions" -> structure: parameter "adapter_sequence_3P"
           of String, parameter "anchored_3P" of type "boolean" (@range (0,
           1)), parameter "error_tolerance" of Double, parameter
           "min_overlap_length" of Long, parameter "min_read_length" of Long,
           parameter "discard_untrimmed" of type "boolean" (@range (0, 1))
        :returns: instance of type "RemoveAdaptersResult" -> structure:
           parameter "report_ref" of String, parameter "output_reads_ref" of
           String
        :raises ValueError: if a required parameter is missing/empty, or if
           exec_remove_adapters() produced no report or no output object
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN remove_adapters
        console = []
        self.log(console, 'Running remove_adapters() with parameters: ')
        self.log(console, "\n" + pformat(params) + "\n")
        self.log(console, "-------------------------------------------\n")

        token = ctx['token']
        wsClient = workspaceService(self.config['workspace-url'], token=token)
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'

        # param checks — 'is None' instead of '== None' per PEP 8
        required_params = [
            'output_workspace', 'input_reads', 'output_object_name',
            'min_read_length'
        ]
        for arg in required_params:
            if arg not in params or params[arg] is None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # load provenance (mutates ctx['provenance'][0] when present)
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        provenance[0]['input_ws_objects'] = [str(params['input_reads'])]

        # RUN
        exec_remove_adapters_retVal = self.exec_remove_adapters(ctx, params)[0]

        # build report
        #
        reportObj = {'objects_created': [], 'text_message': ''}

        # text report — a missing 'report' key is the only expected failure
        # here, so catch KeyError specifically instead of a bare except that
        # would also swallow unrelated errors (even KeyboardInterrupt).
        try:
            reportObj['text_message'] = exec_remove_adapters_retVal['report']
        except KeyError:
            raise ValueError("no report generated by exec_remove_adapters()")

        # output object
        if exec_remove_adapters_retVal['output_reads_ref'] is not None:
            reportObj['objects_created'].append({
                'ref':
                exec_remove_adapters_retVal['output_reads_ref'],
                'description':
                'Post Cutadapt Reads'
            })
        else:
            raise ValueError("no output generated by exec_remove_adapters()")

        # save report object
        report = KBaseReport(self.config['SDK_CALLBACK_URL'],
                             token=ctx['token'],
                             service_ver=SERVICE_VER)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': params['output_workspace']
        })

        result = {
            'output_reads_ref':
            exec_remove_adapters_retVal['output_reads_ref'],
            'report_ref': report_info['ref'],
            'report_name': report_info['name']
        }
        #END remove_adapters

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method remove_adapters return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
コード例 #6
0
    def filter_contigs(self, ctx, params):
        """
        Main method
        :param params: instance of type "ContigFilterParams" (Input parameter
           types) -> structure: parameter "workspace_name" of String,
           parameter "assembly_ref" of String, parameter "min_length" of Long
        :returns: instance of type "ContigFilterResults" (Output result
           types) -> structure: parameter "report_name" of String, parameter
           "report_ref" of String, parameter "filtered_assembly_ref" of
           String, parameter "n_total" of Long, parameter "n_remaining" of
           Long
        :raises ValueError: if a required parameter is missing or invalid
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs
        # Validate required parameters before doing any work.
        for name in ['min_length', 'assembly_ref', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not isinstance(params['min_length'], int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        # NOTE(review): 'basestring' is Python 2 only; switch to 'str' when
        # this module is ported to Python 3.
        if not isinstance(params['assembly_ref'], basestring) or not len(params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')

        # print() with a single argument is valid in both Python 2 and 3;
        # the old 'print "..."' statements were Python-2-only syntax.
        print("params['min_length']=%s, params['assembly_ref']=%s" % (params['min_length'], params['assembly_ref']))
        # (fixed garbled label: previously "params['params['workspace_name']")
        print("params['workspace_name']=%s" % (params['workspace_name']))
        print("self.callback_url=%s" % self.callback_url)
        print("self.scratch=%s" % self.scratch)
        print("config = ")
        pprint.pprint(self.config)

        ###############
        # Download ref
        ##############
        assembly_util = AssemblyUtil(self.callback_url)
        # 'fasta' renamed from 'file', which shadowed the Python 2 builtin.
        fasta = assembly_util.get_assembly_as_fasta({'ref': params['assembly_ref']})
        print("assembly fasta file = ")
        pprint.pprint(fasta)

        ###################################
        # Real business - filter the contig
        ###################################
        parsed_assembly = SeqIO.parse(fasta['path'], 'fasta')
        min_length = params['min_length']
        # Keep a list of contigs greater than min_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs over the min_length
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        ##################
        # Output
        ##################
        workspace_name = params['workspace_name']
        filtered_path = os.path.join(self.scratch, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {
                'path': filtered_path
            },
            'workspace_name': workspace_name,
            'assembly_name': fasta['assembly_name']
        })

        ################
        # Reporting
        ################
        # (fixed report text: a stray 's' previously produced
        #  "Filtered assembly to <n>s contigs out of <m>")
        text_message = "".join([
            'Filtered assembly to ', str(n_remaining),
            ' contigs out of ', str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [
                {'ref': new_ref, 'description': 'Filtered contigs'}
            ],
            'text_message': text_message
        }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': workspace_name
        })
        # Return the report reference and name in our results
        returnVal = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }

        ###############
        # BBtools test
        ###############
        bbtools = BBTools(self.callback_url, service_ver='beta')

        # set up input files
        print("file['path'] = ")
        print(fasta['path'])
        rqc_filter_input = {
            "reads_file": fasta['path']  # local FASTA path from AssemblyUtil
        }

        # set up parameters (example below, there are many more options, see BBTools.spec)
        rqc_filter_params = {
            "qtrim": "rl",
            "maxns": 3,
            "minlength": 40,
            "maxmem": 5
        }

        # run the local RQCFilter function
        result = bbtools.run_RQCFilter_local(rqc_filter_input, rqc_filter_params)
        print("result = ")
        pprint.pprint(result)
        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_contigs return value returnVal is not type dict as required.')

        # return the results
        return [returnVal]
コード例 #7
0
File: kb_PRINSEQImpl.py  Project: nlharris/kb_PRINSEQ
    def execReadLibraryPRINSEQ(self, ctx, input_params):
        """
        :param input_params: instance of type "inputPRINSEQ" (execPRINSEQ and
           execReadLibraryPRINSEQ input input_reads_ref : may be
           KBaseFile.PairedEndLibrary or KBaseFile.SingleEndLibrary output_ws
           : workspace to write to output_reads_name : obj_name to create
           lc_method : Low complexity method - value must be "dust" or
           "entropy" lc_entropy_threshold : Low complexity threshold - Value
           must be an integer between 0 and 100. Note a higher
           lc_entropy_threshold in entropy is more stringent.
           lc_dust_threshold : Low complexity threshold - Value must be an
           integer between 0 and 100. Note a lower lc_entropy_threshold is
           less stringent with dust) -> structure: parameter
           "input_reads_ref" of type "data_obj_ref", parameter "output_ws" of
           type "workspace_name" (Common Types), parameter
           "output_reads_name" of type "data_obj_name", parameter "lc_method"
           of String, parameter "lc_entropy_threshold" of Long, parameter
           "lc_dust_threshold" of Long
        :returns: instance of type "outputReadLibraryExecPRINSEQ" ->
           structure: parameter "output_filtered_ref" of type "data_obj_ref",
           parameter "output_unpaired_fwd_ref" of type "data_obj_ref",
           parameter "output_unpaired_rev_ref" of type "data_obj_ref",
           parameter "report" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN execReadLibraryPRINSEQ
        console = []
        #        self.log(console, 'Running execTrimmomatic with parameters: ')
        #        self.log(console, "\n"+pformat(input_params))
        report = ''
        returnVal = dict()
        #        retVal['output_filtered_ref'] = None
        #        retVal['output_unpaired_fwd_ref'] = None
        #        retVal['output_unpaired_rev_ref'] = None

        token = ctx['token']
        wsClient = workspaceService(self.ws_url, token=token)
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # param checks
        required_params = ['input_reads_ref', 'output_ws', 'lc_method']
        # output reads_name is optional. If not set will use old_objects name
        for required_param in required_params:
            if required_param not in input_params or input_params[
                    required_param] is None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        if (input_params['lc_method'] != 'dust') and (input_params['lc_method']
                                                      != 'entropy'):
            raise ValueError(
                "lc_method (low complexity method) must be 'dust' or 'entropy', "
                + "it is currently set to : " + input_params['lc_method'])

        if not ('lc_entropy_threshold' in input_params
                or 'lc_dust_threshold' in input_params):
            raise ValueError(
                ("A low complexity threshold needs to be " +
                 "entered for {}".format(input_params['lc_method'])))
        elif input_params['lc_method'] == 'dust':
            if 'lc_dust_threshold' not in input_params:
                raise ValueError(
                    ("A low complexity threshold needs to be " +
                     "entered for {}".format(input_params['lc_method'])))
            else:
                lc_threshold = input_params['lc_dust_threshold']
        else:
            if 'lc_entropy_threshold' not in input_params:
                raise ValueError(
                    ("A low complexity threshold needs to be " +
                     "entered for {}".format(input_params['lc_method'])))
            else:
                lc_threshold = input_params['lc_entropy_threshold']

        if (lc_threshold < 0.0) or (lc_threshold > 100.0):
            raise ValueError((
                "The threshold for {} must be between 0 and 100, it is currently "
                + "set to : {}").format(input_params['lc_method'],
                                        lc_threshold))
        reportObj = {'objects_created': [], 'text_message': ''}

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [
            str(input_params['input_reads_ref'])
        ]

        # GET THE READS OBJECT
        # Determine whether read library or read set is input object
        #
        try:
            # object_info tuple
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': input_params['input_reads_ref']
                }]})[0]
            input_reads_obj_type = input_reads_obj_info[TYPE_I]
            # input_reads_obj_version = input_reads_obj_info[VERSION_I]
            # this is object version, not type version

        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                str(input_params['input_reads_ref']) + ')' + str(e))

        # self.log (console, "B4 TYPE: '" +
        #           str(input_reads_obj_type) +
        #           "' VERSION: '" + str(input_reads_obj_version)+"'")
        # remove trailing version
        input_reads_obj_type = re.sub('-[0-9]+\.[0-9]+$', "",
                                      input_reads_obj_type)
        # self.log (console, "AF TYPE: '"+str(input_reads_obj_type)+"' VERSION: '" +
        # str(input_reads_obj_version)+"'")

        # maybe add below later "KBaseSets.ReadsSet",
        acceptable_types = [
            "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary",
            "KBaseAssembly.SingleEndLibrary", "KBaseFile.SingleEndLibrary"
        ]
        if input_reads_obj_type not in acceptable_types:
            raise ValueError("Input reads of type: '" + input_reads_obj_type +
                             "'.  Must be one of " +
                             ", ".join(acceptable_types))

        if input_reads_obj_type in [
                "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary"
        ]:
            read_type = 'PE'
        elif input_reads_obj_type in [
                "KBaseFile.SingleEndLibrary", "KBaseAssembly.SingleEndLibrary"
        ]:
            read_type = 'SE'

        # Instatiate ReadsUtils
        try:
            readsUtils_Client = ReadsUtils(url=self.callback_url,
                                           token=ctx['token'])  # SDK local
            self._log(None, 'Starting Read File(s) Download')
            readsLibrary = readsUtils_Client.download_reads({
                'read_libraries': [input_params['input_reads_ref']],
                'interleaved':
                'false'
            })
            self._log(None, 'Completed Read File(s) Downloading')
        except Exception as e:
            raise ValueError(
                ('Unable to get read library object from workspace: ({})\n'
                 ).format(str(input_params['input_reads_ref']), str(e)))

        # get WS metadata to get obj_name
        ws = workspaceService(self.ws_url)
        try:
            info = ws.get_object_info_new(
                {'objects': [{
                    'ref': input_params['input_reads_ref']
                }]})[0]
        except workspaceService as wse:
            self._log(console, 'Logging workspace exception')
            self._log(str(wse))
            raise

        #determine new object base name
        new_object_name = info[1]
        if ('output_reads_name' in input_params
                and input_params['output_reads_name'] != ''
                and input_params['output_reads_name'] is not None):
            new_object_name = input_params['output_reads_name']

        # MAKE A DIRECTORY TO PUT THE READ FILE(S)
        # create the output directory and move the file there
        # PUT FILES INTO THE DIRECTORY
        # Sanitize the file names
        tempdir = tempfile.mkdtemp(dir=self.scratch)
        export_dir = os.path.join(tempdir, info[1])
        os.makedirs(export_dir)

        if read_type == 'PE':
            # IF PAIRED END, potentially 6 files created
            # one of each for the two directions(good(paired), good_singletons, bad)
            # Take the good paired and (re)upload new reads object.
            # We throwout the bad reads

            input_files_info = self._setup_pe_files(readsLibrary, export_dir,
                                                    input_params)

            # RUN PRINSEQ with user options (lc_method and lc_threshold)
            cmd = (
                "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} "
                "-fastq2 {} -out_format 3 -lc_method {} "
                "-lc_threshold {}").format(
                    input_files_info["fastq_file_path"],
                    input_files_info["fastq2_file_path"],
                    input_params['lc_method'], lc_threshold)
            print "Command to be run : " + cmd
            args = shlex.split(cmd)
            perl_script = subprocess.Popen(args,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            output = perl_script.communicate()
            found_results = False
            file_names_dict = dict()
            for element in output:
                if "Input and filter stats:" in element:
                    found_results = True
                    element_parts = element.split("Input and filter stats:")
                    # PRINSEQ OUTPUT
                    report = "Input and filter stats:{}".format(
                        element_parts[1])
                    reportObj['text_message'] = report
                    read_files_list = os.listdir(export_dir)

                    # proc = subprocess.Popen(['ls', '-l', export_dir], stdout=subprocess.PIPE)
                    # proc_output = proc.stdout.read()
                    # print "PROC OUTPUT : " + proc_output

                    for read_filename in read_files_list:
                        file_direction = None
                        print "Read File : {}".format(read_filename)
                        # determine if forward(fastq) or reverse(fastq2) file
                        if input_files_info["fastq_filename"] in read_filename:
                            file_direction = "fwd"
                        elif input_files_info[
                                "fastq2_filename"] in read_filename:
                            file_direction = "rev"
                        if file_direction is not None:
                            # determine good singleton or good part of a pair.
                            print "TEST: {}_prinseq_good_".format(
                                input_files_info["fastq_filename"])
                            if ("{}_prinseq_good_singletons".format(
                                    input_files_info["fastq_filename"])
                                    in read_filename
                                    or "{}_prinseq_good_singletons".format(
                                        input_files_info["fastq2_filename"])
                                    in read_filename):
                                # Unpaired singletons that need to be
                                # saved as a new single end reads object
                                file_names_dict["{}_good_singletons".format(file_direction)] = \
                                    os.path.join(export_dir, read_filename)
                            elif ("{}_prinseq_good_".format(
                                    input_files_info["fastq_filename"])
                                  in read_filename
                                  or "{}_prinseq_good_".format(
                                      input_files_info["fastq2_filename"])
                                  in read_filename):
                                file_names_dict["{}_good_pair".format(file_direction)] = \
                                    os.path.join(export_dir, read_filename)
                    if (('fwd_good_pair' in file_names_dict)
                            and ('rev_good_pair' in file_names_dict)):
                        self._log(None, 'Saving new Paired End Reads')
                        returnVal['filtered_paired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': new_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                                file_names_dict['fwd_good_pair'],
                                                            'rev_file':
                                                                file_names_dict['rev_good_pair']
                                                            }
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['filtered_paired_end_ref'],
                            'description':
                            'Filtered Paired End Reads',
                            'object_name':
                            new_object_name
                        })
                        print "REFERENCE : " + str(
                            returnVal['filtered_paired_end_ref'])
                    else:
                        reportObj['text_message'] += \
                            "\n\nNo good matching pairs passed low complexity filtering.\n" + \
                            "Consider loosening the threshold value.\n"
                    if 'fwd_good_singletons' in file_names_dict:
                        self._log(None, 'Saving new Forward Unpaired Reads')
                        fwd_object_name = "{}_fwd_singletons".format(
                            new_object_name)
                        returnVal['output_filtered_fwd_unpaired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': fwd_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                            file_names_dict['fwd_good_singletons']}
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['output_filtered_fwd_unpaired_end_ref'],
                            'description':
                            'Filtered Forward Unpaired End Reads',
                            'object_name':
                            fwd_object_name
                        })
                        print "REFERENCE : " + \
                            str(returnVal['output_filtered_fwd_unpaired_end_ref'])
                    if 'rev_good_singletons' in file_names_dict:
                        self._log(None, 'Saving new Reverse Unpaired Reads')
                        rev_object_name = "{}_rev_singletons".format(
                            new_object_name)
                        returnVal['output_filtered_rev_unpaired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': rev_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                            file_names_dict['rev_good_singletons']}
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['output_filtered_rev_unpaired_end_ref'],
                            'description':
                            'Filtered Reverse Unpaired End Reads',
                            'object_name':
                            rev_object_name
                        })
                        print "REFERENCE : " + \
                            str(returnVal['output_filtered_rev_unpaired_end_ref'])
                    if len(reportObj['objects_created']) > 0:
                        reportObj['text_message'] += "\nOBJECTS CREATED :\n"
                        for obj in reportObj['objects_created']:
                            reportObj['text_message'] += "{} : {}\n".format(
                                obj['object_name'], obj['description'])
                    else:
                        reportObj['text_message'] += \
                            "\nFiltering filtered out all reads. No objects made.\n"
            if not found_results:
                raise Exception('Unable to execute PRINSEQ, Error: {}'.format(
                    str(output)))
            print "FILES DICT : {}".format(str(file_names_dict))
            print "REPORT OBJECT :"
            print str(reportObj)

        elif read_type == 'SE':
            # Download reads Libs to FASTQ files
            # IF SINGLE END INPUT 2 files created (good and bad)
            # Take good and (re)upload new reads object
            input_fwd_file_path = \
                readsLibrary['files'][input_params['input_reads_ref']]['files']['fwd']
            fastq_filename = self._sanitize_file_name(
                os.path.basename(input_fwd_file_path))
            fastq_file_path = os.path.join(export_dir, fastq_filename)
            shutil.move(input_fwd_file_path, fastq_file_path)

            # RUN PRINSEQ with user options (lc_method and lc_threshold)
            cmd = (
                "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} "
                "-out_format 3 -lc_method {} "
                "-lc_threshold {}").format(fastq_file_path,
                                           input_params['lc_method'],
                                           lc_threshold)
            print "Command to be run : " + cmd
            args = shlex.split(cmd)
            print "ARGS:  " + str(args)
            perl_script = subprocess.Popen(args,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            output = perl_script.communicate()
            print "OUTPUT: " + str(output)
            found_results = False
            found_se_filtered_file = False
            file_names_dict = dict()
            for element in output:
                if "Input and filter stats:" in element:
                    found_results = True
                    element_parts = element.split("Input and filter stats:")
                    # PRINSEQ OUTPUT
                    report = "Input and filter stats:{}".format(
                        element_parts[1])
                    reportObj['text_message'] = report
                    read_files_list = os.listdir(export_dir)

                    for read_filename in read_files_list:
                        print "Early Read File : {}".format(read_filename)

                    for read_filename in read_files_list:
                        print "Read File : {}".format(read_filename)
                        if ("{}_prinseq_good_".format(fastq_filename)
                                in read_filename):
                            #Found Good file. Save the Reads objects
                            self._log(None, 'Saving Filtered Single End Reads')
                            returnVal['output_filtered_single_end_ref'] = \
                                readsUtils_Client.upload_reads({'wsname':
                                                                str(input_params['output_ws']),
                                                                'name': new_object_name,
                                                                'source_reads_ref':
                                                                input_params['input_reads_ref'],
                                                                'fwd_file':
                                                                    os.path.join(export_dir,
                                                                                 read_filename)}
                                                               )['obj_ref']
                            reportObj['objects_created'].append({
                                'ref':
                                returnVal['output_filtered_single_end_ref'],
                                'description':
                                'Filtered Single End Reads'
                            })
                            print "REFERENCE : " + str(
                                returnVal['output_filtered_single_end_ref'])
                            found_se_filtered_file = True
                            break
            if not found_se_filtered_file:
                reportObj['text_message'] += \
                    "\n\nNone of the reads passed low complexity filtering.\n" + \
                    "Consider loosening the threshold value.\n"
            if not found_results:
                raise Exception('Unable to execute PRINSEQ, Error: {}'.format(
                    str(output)))
            print "FILES DICT : {}".format(str(file_names_dict))
            print "REPORT OBJECT :"
            print str(reportObj)

        # save report object
        #
        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report = KBaseReport(self.callback_url, token=ctx['token'], service_ver=SERVICE_VER)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': input_params['output_ws']
        })

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END execReadLibraryPRINSEQ

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method execReadLibraryPRINSEQ return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
コード例 #8
0
    def predict_amr_phenotype(self, ctx, params):
        """
        The AMR prediction function specification
        :param params: instance of type "AMRPredictionParams" (Structure of
           input data for AMR prediction) -> structure: parameter
           "assembly_input_ref" of type "assembly_ref", parameter "species"
           of String, parameter "workspace_name" of String
        :returns: instance of type "AMRPredictionResults" (Structure of
           output of AMR prediction) -> structure: parameter "report_name" of
           String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN predict_amr_phenotype

        # Input validation
        for name in ['assembly_input_ref', 'species', 'workspace_name']:
            if name not in params:
                raise ValueError('Parameter "' + name + '" is required but missing')
        if not (isinstance(params['assembly_input_ref'], string_types) or isinstance(params['assembly_input_ref'], list)) or not len(params['assembly_input_ref']):
            raise ValueError('Pass in a valid assembly reference string(s)')

        # Extract params
        if not isinstance(params["assembly_input_ref"], list):
            assemblies = [params["assembly_input_ref"]]
        else:
            assemblies = params["assembly_input_ref"]
        species = params["species"]

        # Get models for species
        scm_models = self.get_models_by_algorithm_and_species("scm", species)
        cart_models = self.get_models_by_algorithm_and_species("cart", species)

        # Process assemblies
        predictions = {}
        assembly_util = AssemblyUtil(self.callback_url)
        
        for assembly_ref in assemblies:
            
            assembly_predictions = {}

            # Get the fasta file path and other info
            assembly = assembly_util.get_assembly_as_fasta({'ref': assembly_ref})

            # Extract the k-mers
            kmers = self.extract_kmers(assembly["path"], k=31)
            print "Kmers --", assembly["assembly_name"], ":", len(kmers)

            # Make predictions (SCM)
            print "SCM models"
            assembly_predictions["scm"] = {}
            for antibiotic, model in scm_models.iteritems():
                p = model.predict(kmers)
                assembly_predictions["scm"][antibiotic] = {}
                assembly_predictions["scm"][antibiotic]["label"] = p[0]
                assembly_predictions["scm"][antibiotic]["why"] = p[1]

            # Make predictions (CART)
            print "CART models"
            assembly_predictions["cart"] = {}
            for antibiotic, model in cart_models.iteritems():
                p = model.predict(kmers)
                assembly_predictions["cart"][antibiotic] = {}
                assembly_predictions["cart"][antibiotic]["label"] = p[0]
                assembly_predictions["cart"][antibiotic]["why"] = p[1]

            predictions[assembly["assembly_name"]] = assembly_predictions
            del assembly_predictions

        # Generate report
        text_message = "This is a test report for kover amr (text)"

        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [],
            'text_message': text_message,
            'direct_html': generate_html_prediction_report(predictions, species)
        }

        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': params['workspace_name'],
            'file_links': generate_csv_prediction_report(predictions, species, self.scratch)
        })

        output = {
            'report_ref': report['ref'],
            'report_name': report['name']
        }

        #END predict_amr_phenotype

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method predict_amr_phenotype return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
コード例 #9
0
ファイル: MEGAHITImpl.py プロジェクト: msneddon/kb_megahit
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT.  Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           min_k - minimum kmer size (<= 127), must be odd number, default 21
           max_k - maximum kmer size (<= 127), must be odd number, default 99
           k_step - increment of kmer size of each iteration (<= 28), must be
           even number, default 10 k_list - list of kmer size (all must be
           odd, in the range 15-127, increment <= 28); override `--k-min',
           `--k-max' and `--k-step' min_contig_length - minimum length of
           contigs to output, default 200 @optional megahit_parameter_preset
           @optional min_count @optional k_min @optional k_max @optional
           k_step @optional k_list @optional min_contig_len) -> structure:
           parameter "workspace_name" of String, parameter "read_library_ref"
           of String, parameter "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_len" of Long
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        for required in ('workspace_name', 'read_library_ref',
                         'output_contigset_name'):
            if required not in params:
                raise ValueError(required + ' parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command; we only support PE reads, so -1/-2 are fixed
        megahit_cmd = [self.MEGAHIT, '-1', fwd, '-2', rev]

        # if a preset is defined, use that
        if params.get('megahit_parameter_preset'):
            megahit_cmd.append('--presets')
            megahit_cmd.append(params['megahit_parameter_preset'])

        # forward the optional scalar tuning parameters (skipped when absent
        # or falsy, matching the original per-parameter checks)
        for key, flag in (('min_count', '--min-count'),
                          ('k_min', '--k-min'),
                          ('k_max', '--k-max'),
                          ('k_step', '--k-step'),
                          ('min_contig_len', '--min-contig-len')):
            if params.get(key):
                megahit_cmd.append(flag)
                megahit_cmd.append(str(params[key]))

        # k_list is a list of ints and must be joined with commas
        if params.get('k_list'):
            megahit_cmd.append('--k-list')
            megahit_cmd.append(','.join(str(k_val) for k_val in params['k_list']))

        # set the output location (unique per run via a ms-resolution timestamp)
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            raise ValueError('Error running MEGAHIT, return code: ' +
                             str(retcode) + '\n')

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
                                                                'file': {'path': output_contigs},
                                                                'workspace_name': params['workspace_name'],
                                                                'assembly_name': params['output_contigset_name']
                                                                })


        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = [len(seq_record.seq)
                   for seq_record in SeqIO.parse(output_contigs, 'fasta')]

        # guard against an empty assembly: the average and histogram below
        # would otherwise raise ZeroDivisionError / fail on an empty array
        if not lengths:
            raise ValueError('MEGAHIT produced no contigs; try loosening the '
                             'filtering parameters (e.g. min_contig_len)')

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        reportObj = {
            'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
            'text_message': report
        }
        report = KBaseReport(self.callbackURL)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})

        # STEP 6: contruct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
コード例 #10
0
    def import_file(self, params):
        """
        Import a genome from staged FASTA + GFF files, save it as a KBase
        Genome object, and create a report.

        :param params: dict with at least 'workspace_name', 'genome_name',
            plus the fields consumed by _validate_import_file_params /
            _set_parsed_params ('scientific_name', 'taxon_wsname',
            'taxon_reference', 'source', 'type', 'release', ...)
        :returns: dict with 'genome_ref', 'genome_info', 'report_name',
            'report_ref'
        """
        # 1) validate parameters
        self._validate_import_file_params(params)

        # 2) construct the input directory staging area (unique per call so
        #    concurrent imports cannot collide)
        input_directory = os.path.join(self.cfg.sharedFolder,
                                       'fast_gff_upload_' + str(uuid.uuid4()))
        os.makedirs(input_directory)

        try:
            file_paths = self._stage_input(params, input_directory)

            # 3) extract out the parameters
            params = self._set_parsed_params(params)

            # 4) do the upload
            result = self.upload_genome(
                shock_service_url=self.cfg.shockURL,
                handle_service_url=self.cfg.handleURL,
                workspace_service_url=self.cfg.workspaceURL,
                callback_url=self.cfg.callbackURL,
                input_fasta_file=file_paths["fasta_file"],
                input_gff_file=file_paths["gff_file"],
                workspace_name=params['workspace_name'],
                core_genome_name=params['genome_name'],
                scientific_name=params['scientific_name'],
                taxon_wsname=params['taxon_wsname'],
                taxon_reference=params['taxon_reference'],
                source=params['source'],
                genome_type=params['type'],
                release=params['release'])

            # 5) generate report
            output_data_ref = params['workspace_name'] + "/" + params['genome_name']
            reportObj = {
                'objects_created': [{
                    'ref': output_data_ref,
                    'description': 'KBase Genome object'
                }],
                'text_message':
                result['report_string']
            }

            # use the configured callback URL for consistency with
            # upload_genome above (was os.environ['SDK_CALLBACK_URL'])
            reportClient = KBaseReport(self.cfg.callbackURL)
            report_info = reportClient.create({
                'report': reportObj,
                'workspace_name': params['workspace_name']
            })
        finally:
            # 6) always clear the temp staging directory, even when staging,
            #    upload, or report creation fails (previously leaked on error)
            shutil.rmtree(input_directory)

        # 7) return the result; object_info tuple: [0]=objid, [4]=version,
        #    [6]=wsid, so this builds a wsid/objid/version reference
        info = result['genome_info']
        details = {
            'genome_ref':
            str(info[6]) + '/' + str(info[0]) + '/' + str(info[4]),
            'genome_info': info,
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        return details
コード例 #11
0
ファイル: NGSUtilsImpl.py プロジェクト: msneddon/ngsutils
    def fastqutils_stats(self, ctx, params):
        """
        :param params: instance of type "FastqUtilsStatsParams" -> structure:
           parameter "workspace_name" of type "workspace_name" (A string
           representing a workspace name.), parameter "read_library_ref" of
           type "read_library_ref" (A string representing a ContigSet id.)
        :returns: instance of type "FastqUtilsStatsResult" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN fastqutils_stats

        print('Running fastqutils_stats with params=')
        print(pformat(params))

        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')

        # Get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL, token=ctx['token'])
        reads = ru.download_reads(reads_params)['files']
        files = [reads[input_ref]['files']['fwd']]
        if reads[input_ref]['files']['rev']:
            files.append(reads[input_ref]['files']['rev'])
        print('running on files:')
        for f in files:
            print(f)

        # base command; each file gets its own fresh copy below
        stats_cmd = [self.FASTQUTILS, 'stats']

        report = ''
        for f in files:
            # BUG FIX: the original did `cmd = stats_cmd`, aliasing the
            # list, so every iteration appended to the same object and the
            # second run was invoked with both file paths. Build a fresh
            # list per file instead.
            cmd = stats_cmd + [f]

            report += '============== ' + f + ' ==============\n'
            print('running: ' + ' '.join(cmd))
            # universal_newlines=True makes stdout yield str (not bytes)
            # on Python 3, so the string concatenation below works.
            p = subprocess.Popen(cmd,
                                 cwd=self.scratch,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=False,
                                 universal_newlines=True)

            # stream tool output into the report while echoing to the log
            while True:
                line = p.stdout.readline()
                if not line:
                    break
                report += line
                print(line.replace('\n', ''))

            p.stdout.close()
            p.wait()
            report += "\n\n"
            print('return code: ' + str(p.returncode))
            if p.returncode != 0:
                raise ValueError('Error running ' + self.FASTQUTILS + ', return code: ' + str(p.returncode))


        reportObj = {
            'objects_created': [],
            'text_message': report
        }
        report = KBaseReport(self.callbackURL)
        report_info = report.create({'report': reportObj, 'workspace_name': params['workspace_name']})
        returnVal = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END fastqutils_stats

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method fastqutils_stats return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
    def filter_contigs(self, ctx, workspace_name, contigset, minimum):
        """
        :param workspace_name: instance of String
        :param contigset: instance of String
        :param minimum: instance of Long
        :returns: instance of type "FilterContigResults" -> structure:
           parameter "report_name" of String, parameter "report_ref" of
           String, parameter "assembly_ref" of String, parameter
           "contig_count" of Long, parameter "filtered_contig_count" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN filter_contigs
        print(workspace_name)
        print(contigset)
        print(minimum)

        def perform_filter(min_length, contigs):
            """Count all contigs and collect those strictly longer than
            min_length, preserving input order.

            Returns a named tuple (total_count, filtered_count,
            filtered_set) where filtered_set is an ordered list of the
            passing SeqRecords.
            """
            result_type = namedtuple(
                'filter_result',
                ['total_count', 'filtered_count', 'filtered_set'])
            total_count = 0
            filtered_count = 0
            # BUG FIX: the original accumulated passing contigs in a set,
            # which discards input order and makes the written FASTA
            # nondeterministic; a list preserves the source ordering with
            # identical counts.
            filtered = []
            for contig in contigs:
                # NOTE(review): strictly greater-than, so contigs exactly
                # min_length long are dropped — confirm this is intended.
                if len(contig) > min_length:
                    filtered_count += 1
                    filtered.append(contig)
                total_count += 1
            return result_type(total_count, filtered_count, filtered)

        print('about to get fasta')
        fasta_file = self.dfu.get_assembly_as_fasta({'ref': contigset})
        print('got fasta')
        # SeqIO.parse is a lazy iterator; perform_filter consumes it once
        contigs = SeqIO.parse(fasta_file['path'], 'fasta')
        filtered_file = os.path.join(self.scratch, 'filtered.fasta')
        filtered = perform_filter(minimum, contigs)
        SeqIO.write(filtered.filtered_set, filtered_file, 'fasta')

        # save the filtered contigs back under the original assembly name
        new_assembly = self.dfu.\
            save_assembly_from_fasta({'file': {'path': filtered_file},
                                      'workspace_name': workspace_name,
                                      'assembly_name': fasta_file['assembly_name']
                                      })

        reportObj = {
            'objects_created': [{
                'ref': new_assembly,
                'description': 'Filtered contigs'
            }],
            'text_message':
            'Filtered Assembly to ' + str(filtered.filtered_count) +
            ' contigs out of ' + str(filtered.total_count)
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': workspace_name
        })

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'contig_count': filtered.total_count,
            'filtered_contig_count': filtered.filtered_count
        }
        #END filter_contigs

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method filter_contigs return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]