    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'SetAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('SetAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = SetAPI(cls.cfg)
        cls.serviceWizardURL = cls.cfg['service-wizard']
        cls.dataPaletteServiceVersion = cls.cfg['datapaletteservice-version']


        # set up data at the class level so that it runs once for all tests,
        # not before each test case (there is no obvious place to do this
        # outside this function)
        suffix = int(time.time() * 1000)
        wsName = "test_SetAPI_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': wsName})
        cls.wsName = wsName

        # copy test file to scratch area
        fq_filename = "interleaved.fastq"
        fq_path = os.path.join(cls.cfg['scratch'], fq_filename)
        shutil.copy(os.path.join("data", fq_filename), fq_path)

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        cls.read1ref = ru.upload_reads({
                'fwd_file': fq_path,
                'sequencing_tech': 'tech1',
                'wsname': wsName,
                'name': 'reads1',
                'interleaved':1
            })['obj_ref']
        cls.read2ref = ru.upload_reads({
                'fwd_file': fq_path,
                'sequencing_tech': 'tech2',
                'wsname': wsName,
                'name': 'reads2',
                'interleaved':1
            })['obj_ref']
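
    # The matching teardown is not shown in this excerpt; a minimal sketch,
    # assuming only the cls.wsClient and cls.wsName attributes set above:
    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, 'wsName'):
            # delete the temporary test workspace created in setUpClass
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')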
    @classmethod
    def upload_test_reads(cls):
        """
        Seed initial SE and PE reads objects for testing filtering
        """
        header = dict()
        header["Authorization"] = "Oauth {0}".format(cls.token)
        # readsUtils_Client = ReadsUtils(url=self.callback_url, token=ctx['token'])  # SDK local
        readsUtils_Client = ReadsUtils(os.environ['SDK_CALLBACK_URL'],
                                       token=cls.token)

        temp_nodes = []
        fwdtf = 'small_forward.fq'
        revtf = 'small_reverse.fq'
        fwdtarget = os.path.join(cls.scratch, fwdtf)
        revtarget = os.path.join(cls.scratch, revtf)
        print "CWD: " + str(os.getcwd())
        shutil.copy('/kb/module/test/data/' + fwdtf, fwdtarget)
        shutil.copy('/kb/module/test/data/' + revtf, revtarget)

        # Upload single end reads
        cls.se_reads_reference = \
            readsUtils_Client.upload_reads({'wsname': cls.getWsName(),
                                            'name': "se_reads",
                                            'sequencing_tech': 'Illumina',
                                            'fwd_file': fwdtarget}
                                           )['obj_ref']

        se_data = cls.dfu.get_objects(
            {'object_refs':
             [cls.getWsName() + '/se_reads']})['data'][0]['data']

        temp_nodes.append(se_data['lib']['file']['id'])

        # Upload paired end reads
        cls.pe_reads_reference = \
            readsUtils_Client.upload_reads({'wsname': cls.getWsName(),
                                            'name': "pe_reads",
                                            'sequencing_tech': 'Illumina',
                                            'fwd_file': fwdtarget,
                                            'rev_file': revtarget,
                                            'insert_size_mean': 42,
                                            'insert_size_std_dev': 10,
                                            }
                                           )['obj_ref']
        pe_data = cls.dfu.get_objects(
            {'object_refs':
             [cls.getWsName() + '/pe_reads']})['data'][0]['data']
        temp_nodes.append(pe_data['lib1']['file']['id'])

        return temp_nodes
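
    # The shock node ids collected in temp_nodes above are normally removed
    # during teardown. A minimal cleanup sketch, assuming a cls.shockURL
    # attribute and the same Oauth header built at the top of upload_test_reads
    # (requests is assumed to be imported at module level):
    @classmethod
    def delete_shock_nodes(cls, node_ids):
        header = {'Authorization': 'Oauth {0}'.format(cls.token)}
        for node_id in node_ids:
            # DELETE /node/<id> removes the node from the shock data store
            requests.delete(cls.shockURL + '/node/' + node_id,
                            headers=header, allow_redirects=True)
            print('Deleted shock node ' + node_id)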
Example #3
    def loadPairedEndReads(self):
        if hasattr(self.__class__, 'pairedEndLibInfo'):
            return self.__class__.pairedEndLibInfo
        # 1) upload files to shock
        shared_dir = "/kb/module/work/tmp"
        forward_data_file = '../work/testReads/small.forward.fq'
        forward_file = os.path.join(shared_dir,
                                    os.path.basename(forward_data_file))
        shutil.copy(forward_data_file, forward_file)
        reverse_data_file = '../work/testReads/small.reverse.fq'
        reverse_file = os.path.join(shared_dir,
                                    os.path.basename(reverse_data_file))
        shutil.copy(reverse_data_file, reverse_file)

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        pe_reads_ref = ru.upload_reads({
            'fwd_file': forward_file,
            'rev_file': reverse_file,
            'sequencing_tech': 'artificial reads',
            'interleaved': 0,
            'wsname': self.getWsName(),
            'name': 'test_pe_reads'
        })['obj_ref']

        self.__class__.pe_reads_ref = pe_reads_ref
        print('Loaded PairedEndReads: ' + pe_reads_ref)
        new_obj_info = self.wsClient.get_object_info_new(
            {'objects': [{
                'ref': pe_reads_ref
            }]})
        self.__class__.pairedEndLibInfo = new_obj_info[0]
        pprint(pformat(new_obj_info))
        #return new_obj_info[0]
        return pe_reads_ref
    def test_fastqc_app(self):
        # create ws, and load test reads
        wsName = self.getWsName()
        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        input_file_ref = ru.upload_reads({
            'fwd_file': self.small_fq_test_file2,
            'sequencing_tech': 'tech1',
            'wsname': wsName,
            'name': 'reads1',
            'interleaved': 1
        })['obj_ref']

        input_params = {'input_ws': wsName, 'input_file_ref': input_file_ref}
        output = self.getImpl().runFastQC(self.getContext(), input_params)[0]
        self.assertIn('report_name', output)
        self.assertIn('report_ref', output)
        #        pprint(output)

        report = self.getWsClient().get_objects2(
            {'objects': [{
                'ref': output['report_ref']
            }]})['data'][0]['data']
        #        pprint(report)

        self.assertIn('direct_html', report)
        self.assertIn('file_links', report)
        self.assertIn('html_links', report)
        self.assertIn('objects_created', report)
        self.assertIn('text_message', report)
Example #5
    def upload_fastq(self, ctx, params):
        """
        :param params: instance of type "UploadFastqParams" (testing
           invocation of ReadsUtils) -> structure: parameter "fwd_id" of
           String, parameter "wsid" of Long, parameter "wsname" of String,
           parameter "objid" of Long, parameter "name" of String, parameter
           "rev_id" of String, parameter "sequencing_tech" of String
        :returns: instance of type "UploadFastqObjref"
        """
        # ctx is the context object
        # return variables are: objref
        #BEGIN upload_fastq
        print("hai this is upload_fastq here, params are")
        pprint.pprint(params)
        ReadsUtils_instance = ReadsUtils(url=self.callbackURL,
                                         token=ctx['token'],
                                         service_ver='dev')
        print("got ReadsUtilsinstance")
        method_retVal = ReadsUtils_instance.upload_reads(params)
        print("back from ReadsUtils_instance.upload_reads")
        pprint(method_retVal)
        objref = "Vooch"
        #END upload_fastq

        # At some point might do deeper type checking...
        if not isinstance(objref, basestring):
            raise ValueError('Method upload_fastq return value ' +
                             'objref is not type basestring as required.')
        # return the results
        return [objref]
Example #6
    def _package_result(self, output_file, output_name, ws_name_or_id,
                        data_info, report):
        upload_params = {'fwd_file': output_file, 'name': output_name}

        if str(ws_name_or_id).isdigit():
            upload_params['wsid'] = int(ws_name_or_id)
        else:
            upload_params['wsname'] = str(ws_name_or_id)

        fields = [
            'sequencing_tech', 'strain', 'source', 'read_orientation_outward',
            'insert_size_mean', 'insert_size_std_dev'
        ]

        if 'input_ref' in data_info and data_info['input_ref'] is not None \
                and data_info['sequencing_tech']:
            upload_params['source_reads_ref'] = data_info['input_ref']
        else:
            for f in fields:
                if f in data_info:
                    upload_params[f] = data_info[f]
            if 'single_genome' in data_info:
                if data_info['single_genome'] == 'true':
                    upload_params['single_genome'] = 1
                elif data_info['single_genome'] == 'false':
                    upload_params['single_genome'] = 0
            if 'sequencing_tech' not in upload_params:
                upload_params['sequencing_tech'] = 'unknown'
            if not upload_params['sequencing_tech']:
                upload_params['sequencing_tech'] = 'unknown'

        if data_info['files']['type'] == 'interleaved':
            upload_params['interleaved'] = 1

        ru = ReadsUtils(self.callbackURL)
        result = ru.upload_reads(upload_params)

        # THE REPORT MUST BE CREATED OUTSIDE SO THAT LIBS AND SETS ARE HANDLED
        """
        # create report
        kbreport = KBaseReport(self.callbackURL)
        rep = kbreport.create({
                              'report': {
                                  'text_message': report,
                                  'objects_created': [{
                                      "ref": str(ws_name_or_id) + '/' + upload_params['name'],
                                      "description": ''
                                  }]
                              },
                              "workspace_name": str(ws_name_or_id)
                              })

        return {
            'report_ref': rep['ref'],
            'report_name': rep['name'],
            'output_reads_ref': result['obj_ref']
        }
        """
        return {'report': report, 'output_reads_ref': result['obj_ref']}
Example #7
def load_pe_reads(fwd_file, rev_file):
    """
    Copies from given dir to scratch. Then calls ReadsUtils to upload from scratch.
    """
    callback_url = os.environ['SDK_CALLBACK_URL']
    fwd_file_path = file_to_scratch(fwd_file, overwrite=True)
    rev_file_path = file_to_scratch(rev_file, overwrite=True)
    ru = ReadsUtils(callback_url)
    pe_reads_params = {
        'fwd_file': fwd_file_path,
        'rev_file': rev_file_path,
        'sequencing_tech': 'Illumina',
        'wsname': get_ws_name(),
        'name': 'MyPairedEndLibrary'
    }
    return ru.upload_reads(pe_reads_params)['obj_ref']
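

# file_to_scratch() and get_ws_name() are helpers defined elsewhere in this
# test module. A hypothetical sketch of file_to_scratch, assuming a module-level
# SCRATCH directory constant (names and behavior here are illustrative only):
def file_to_scratch(file_path, overwrite=False):
    """Copy a file into the scratch area and return the new path."""
    dest = os.path.join(SCRATCH, os.path.basename(file_path))
    if overwrite or not os.path.exists(dest):
        shutil.copy(file_path, dest)
    return dest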
Example #8
    def load_reads_file(self, tech, file_fwd, file_rev, target_name):
        """
        Loads FASTQ files as either SingleEndLibrary or PairedEndLibrary. If file_rev is None,
        then we get a single end, otherwise, paired.
        """
        reads_util = ReadsUtils(self.callback_url)
        upload_params = {
            "wsname": self.ws_name,
            "fwd_file": file_fwd,
            "name": target_name,
            "sequencing_tech": tech
        }
        if file_rev is not None:
            upload_params["rev_file"] = file_rev
        reads_ref = reads_util.upload_reads(upload_params)
        return reads_ref["obj_ref"]
    def loadSingleEndReads(self):
        if hasattr(self.__class__, 'se_reads_ref'):
            return self.__class__.se_reads_ref
        # return '23735/2/1'
        fq_path = os.path.join(self.scratch, 'reads_1_se.fq')
        shutil.copy(os.path.join('data', 'reads_1.fq'), fq_path)

        ru = ReadsUtils(self.callback_url)
        se_reads_ref = ru.upload_reads({
            'fwd_file': fq_path,
            'wsname': self.getWsName(),
            'name': 'test_readsSE',
            'sequencing_tech': 'artificial reads'
        })['obj_ref']
        self.__class__.se_reads_ref = se_reads_ref
        print('Loaded SingleEndReads: ' + se_reads_ref)
        return se_reads_ref
Example #10
    def loadSEReads(self, reads_file_path):
        #if hasattr(self.__class__, 'reads_ref'):
        #return self.__class__.reads_ref
        se_reads_name = os.path.basename(reads_file_path)
        fq_path = os.path.join(self.scratch,
                               se_reads_name)  #'star_test_reads.fastq')
        shutil.copy(reads_file_path, fq_path)

        ru = ReadsUtils(self.callback_url)
        reads_ref = ru.upload_reads({
            'fwd_file': fq_path,
            'wsname': self.getWsName(),
            'name': se_reads_name.split('.')[0],
            'sequencing_tech': 'rnaseq reads'
        })['obj_ref']
        #self.__class__.reads_ref = reads_ref
        return reads_ref
    def _upload_reads(self, refid, callbackURL, input_params):
        ref = [refid]
        DownloadReadsParams = {'read_libraries': ref}
        dfUtil = ReadsUtils(callbackURL)
        x = dfUtil.download_reads(DownloadReadsParams)

        uploadReadParams = {}
        fwd_file = x['files'][ref[0]]['files']['fwd']
        otype = x['files'][ref[0]]['files']['otype']
        #case of interleaved
        if (otype == 'interleaved'):
            uploadReadParams = {
                'fwd_file': fwd_file,
                'wsname': input_params['workspace_name'],
                'name': input_params['output'],
                'rev_file': '',
                'sequencing_tech': input_params['sequencing_tech'],
                'single_genome': input_params['single_genome'],
                'interleaved': 1
            }

        #case of separate pair
        if (otype == 'paired'):
            rev_file = x['files'][ref[0]]['files']['rev']
            uploadReadParams = {
                'fwd_file': fwd_file,
                'wsname': input_params['workspace_name'],
                'name': input_params['output'],
                'rev_file': rev_file,
                'sequencing_tech': input_params['sequencing_tech'],
                'single_genome': input_params['single_genome']
            }

        #case of single end
        if (otype == 'single'):
            uploadReadParams = {
                'fwd_file': fwd_file,
                'wsname': input_params['workspace_name'],
                'name': input_params['output'],
                'rev_file': '',
                'sequencing_tech': input_params['sequencing_tech'],
                'single_genome': input_params['single_genome']
            }
        y = dfUtil.upload_reads(uploadReadParams)
        return y['obj_ref']
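
    # The three uploadReadParams dicts above differ only in 'rev_file' and
    # 'interleaved'. A sketch of the same dispatch with one shared dict,
    # assuming the same input_params keys used in _upload_reads:
    def _build_upload_params(self, fwd_file, rev_file, otype, input_params):
        params = {
            'fwd_file': fwd_file,
            'wsname': input_params['workspace_name'],
            'name': input_params['output'],
            'sequencing_tech': input_params['sequencing_tech'],
            'single_genome': input_params['single_genome']
        }
        if otype == 'interleaved':
            params['interleaved'] = 1
        elif otype == 'paired':
            params['rev_file'] = rev_file
        return params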
Example #12
    def getPairedEndLibInfo(self):
        if hasattr(self.__class__, 'pairedEndLibInfo'):
            return self.__class__.pairedEndLibInfo

        # copy the local test file to the shared scratch space so that the ReadsUtils
        # container can see it.
        test_fastq_file_local = 'data/interleaved.fastq'
        test_fastq_file_scratch = os.path.join(self.scratch, os.path.basename(test_fastq_file_local))
        shutil.copy(test_fastq_file_local, test_fastq_file_scratch)

        # call the ReadsUtils library to upload the test data to KBase
        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        paired_end_ref = ru.upload_reads({'fwd_file': test_fastq_file_scratch,
                                          'sequencing_tech': 'artificial reads',
                                          'interleaved': 1, 'wsname': self.getWsName(),
                                          'name': 'test.pe.reads'})['obj_ref']

        # get the object metadata for the new test dataset
        new_obj_info = self.ws.get_object_info_new({'objects': [{'ref': paired_end_ref}]})
        self.__class__.pairedEndLibInfo = new_obj_info[0]
        return new_obj_info[0]
    def getPairedEndLibInfo(self):
        if hasattr(self.__class__, 'pairedEndLibInfo'):
            return self.__class__.pairedEndLibInfo
        # 1) upload files to shock
        shared_dir = "/kb/module/work/tmp"
        forward_data_file = 'data/small.forward.fq'
        forward_file = os.path.join(shared_dir, os.path.basename(forward_data_file))
        shutil.copy(forward_data_file, forward_file)
        reverse_data_file = 'data/small.reverse.fq'
        reverse_file = os.path.join(shared_dir, os.path.basename(reverse_data_file))
        shutil.copy(reverse_data_file, reverse_file)

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        paired_end_ref = ru.upload_reads({'fwd_file': forward_file, 'rev_file': reverse_file,
                                          'sequencing_tech': 'artificial reads',
                                          'interleaved': 0, 'wsname': self.getWsName(),
                                          'name': 'test.pe.reads'})['obj_ref']

        new_obj_info = self.ws.get_object_info_new({'objects': [{'ref': paired_end_ref}]})
        self.__class__.pairedEndLibInfo = new_obj_info[0]
        return new_obj_info[0]
    def test_mash_sketch_valid_reads_ref(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        reads_file_name = 'reads-example.fastq'
        reads_test_path = os.path.join(dir_path, 'data', reads_file_name)
        reads_scratch_path = os.path.join(self.scratch, reads_file_name)
        shutil.copy(reads_test_path, reads_scratch_path)
        reads_utils = ReadsUtils(self.callback_url)
        upload_result = reads_utils.upload_reads({
            'wsname': self.getWsName(),
            'interleaved': 'true',
            'fwd_file': reads_scratch_path,
            'name': 'example-reads',
            'sequencing_tech': 'illumina'
        })
        reads_ref = upload_result['obj_ref']
        params = {'reads_ref': reads_ref, 'paired_ends': True}
        result = self.getImpl().run_mash_sketch(self.getContext(), params)
        output_path = result[0]['sketch_path']
        with open(output_path, 'rb') as output_file:
            num_lines = sum(1 for line in output_file)
        self.assertTrue(os.path.exists(output_path))
        self.assertEqual(num_lines, 25)
    def loadPairedEndReads(self):
        if hasattr(self.__class__, 'pe_reads_ref'):
            return self.__class__.pe_reads_ref
        # return '23735/3/1'
        fq_path1 = os.path.join(self.scratch, 'reads_1.fq')
        shutil.copy(os.path.join('data', 'bt_test_data', 'reads_1.fq'),
                    fq_path1)
        fq_path2 = os.path.join(self.scratch, 'reads_2.fq')
        shutil.copy(os.path.join('data', 'bt_test_data', 'reads_2.fq'),
                    fq_path2)

        ru = ReadsUtils(self.callback_url)
        pe_reads_ref = ru.upload_reads({
            'fwd_file': fq_path1,
            'rev_file': fq_path2,
            'wsname': self.getWsName(),
            'name': 'test_readsPE',
            'sequencing_tech': 'artificial reads'
        })['obj_ref']
        self.__class__.pe_reads_ref = pe_reads_ref
        print('Loaded PairedEndReads: ' + pe_reads_ref)
        return pe_reads_ref
Example #16
    def upload_reads(self, file_path, workspace_name, reads_name, source_reads_upa):
        """
        Upload the given reads file as an interleaved PE reads object.
        """
        if not file_path:
            raise ValueError("file_path must be defined")
        if not os.path.exists(file_path):
            raise ValueError("The given reads file '{}' does not exist".format(file_path))
        if not workspace_name:
            raise ValueError("workspace_name must be defined")
        if not reads_name:
            raise ValueError("reads_name must be defined")

        ru = ReadsUtils(self.callback_url)
        reads_upa = ru.upload_reads({
            "wsname": workspace_name,
            "fwd_file": file_path,
            "name": reads_name,
            "source_reads_ref": source_reads_upa,
            "interleaved": 1
        })["obj_ref"]
        return reads_upa
Example #17
    def _upload_file_path(self, params):
        """
        _upload_file_path: upload fastq file as reads from user's staging area

        params:
        fwd_staging_file_name:
        single-end fastq file name or forward/left paired-end fastq file name
        from user's staging area
        sequencing_tech: sequencing technology
        name: output reads file name
        workspace_name: workspace name/ID that reads will be stored to

        optional params:
        rev_staging_file_name: reverse/right paired-end fastq file name from user's staging area
        single_genome: whether the reads are from a single genome or a metagenome
        insert_size_mean: mean (average) insert length
        insert_size_std_dev: standard deviation of insert lengths
        read_orientation_outward: whether reads in a pair point outward
        interleaved: whether the reads are interleaved

        """
        log('---> running UploaderUtil._upload_file_path')

        upload_file_params = params

        workspace_name_or_id = params.get('workspace_name')

        if str(workspace_name_or_id).isdigit():
            upload_file_params['wsid'] = int(workspace_name_or_id)
        else:
            upload_file_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(upload_file_params, indent=1)))
        ru = ReadsUtils(self.callback_url)
        result = ru.upload_reads(upload_file_params)

        return result
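
    # A minimal illustration of the params dict _upload_file_path expects.
    # The file names and workspace below are placeholders, not values from the
    # original module; as the docstring above describes, the *_staging_file_name
    # keys refer to files in the user's staging area.
    #
    #   result = self._upload_file_path({
    #       'fwd_staging_file_name': 'reads_R1.fastq',
    #       'rev_staging_file_name': 'reads_R2.fastq',
    #       'sequencing_tech': 'Illumina',
    #       'name': 'my_reads',
    #       'workspace_name': 'my_workspace'
    #   })
    #   reads_ref = result['obj_ref']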
Example #18
    def getPairedEndLibInfo(self):
        input_reads_file = '/kb/module/test/data/small_test_reads.fastq'
        #input_reads_file = '/kb/module/test/data/12040.half_million.fastq'
        shared_dir = "/kb/module/work/tmp"
        input_file = os.path.join(shared_dir,
                                  os.path.basename(input_reads_file))
        shutil.copy(input_reads_file, input_file)

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])

        paired_end_ref = ru.upload_reads({
            'fwd_file': input_file,
            'sequencing_tech': 'artificial reads',
            'interleaved': 1,
            'wsname': self.getWsName(),
            'name': 'test.pe.reads'
        })['obj_ref']

        new_obj_info = self.wsClient.get_object_info_new(
            {'objects': [{
                'ref': paired_end_ref
            }]})
        return new_obj_info[0]
Example #19
def upload_interleaved_reads(callback_url, reads_file, ws_name, reads_obj_name,
                             source_reads_upa):
    """
    callback_url = as usual.
    reads_file = full path to the reads file to upload
    ws_name = the workspace to use for uploading the reads file
    reads_obj_name = the name of the new reads object to save as
    source_reads = if not None, the source UPA for the original reads file.
    """
    # unfortunately, the ReadsUtils only accepts uncompressed fq files- this should
    # be fixed on the KBase side
    dfu = DataFileUtil(callback_url)
    reads_unpacked = dfu.unpack_file({'file_path': reads_file})['file_path']

    ru = ReadsUtils(callback_url)
    new_reads_upa = ru.upload_reads({
        'fwd_file': reads_unpacked,
        'interleaved': 1,
        'wsname': ws_name,
        'name': reads_obj_name,
        'source_reads_ref': source_reads_upa
    })['obj_ref']
    print('saved ' + str(reads_unpacked) + ' to ' + str(new_reads_upa))
    return new_reads_upa
Example #20
    def execReadLibraryPRINSEQ(self, ctx, input_params):
        """
        :param input_params: instance of type "inputPRINSEQ" (execPRINSEQ and
           execReadLibraryPRINSEQ input input_reads_ref : may be
           KBaseFile.PairedEndLibrary or KBaseFile.SingleEndLibrary output_ws
           : workspace to write to output_reads_name : obj_name to create
           lc_method : Low complexity method - value must be "dust" or
           "entropy" lc_entropy_threshold : Low complexity threshold - Value
           must be an integer between 0 and 100. Note a higher
           lc_entropy_threshold in entropy is more stringent.
           lc_dust_threshold : Low complexity threshold - Value must be an
           integer between 0 and 100. Note a lower lc_entropy_threshold is
           less stringent with dust) -> structure: parameter
           "input_reads_ref" of type "data_obj_ref", parameter "output_ws" of
           type "workspace_name" (Common Types), parameter
           "output_reads_name" of type "data_obj_name", parameter "lc_method"
           of String, parameter "lc_entropy_threshold" of Long, parameter
           "lc_dust_threshold" of Long
        :returns: instance of type "outputReadLibraryExecPRINSEQ" ->
           structure: parameter "output_filtered_ref" of type "data_obj_ref",
           parameter "output_unpaired_fwd_ref" of type "data_obj_ref",
           parameter "output_unpaired_rev_ref" of type "data_obj_ref",
           parameter "report" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN execReadLibraryPRINSEQ
        console = []
        #        self.log(console, 'Running execTrimmomatic with parameters: ')
        #        self.log(console, "\n"+pformat(input_params))
        report = ''
        returnVal = dict()
        #        retVal['output_filtered_ref'] = None
        #        retVal['output_unpaired_fwd_ref'] = None
        #        retVal['output_unpaired_rev_ref'] = None

        token = ctx['token']
        wsClient = workspaceService(self.ws_url, token=token)
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # param checks
        required_params = ['input_reads_ref', 'output_ws', 'lc_method']
        # output_reads_name is optional; if not set, the old object's name is used
        for required_param in required_params:
            if required_param not in input_params or input_params[
                    required_param] is None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        if (input_params['lc_method'] != 'dust') and (input_params['lc_method']
                                                      != 'entropy'):
            raise ValueError(
                "lc_method (low complexity method) must be 'dust' or 'entropy', "
                + "it is currently set to : " + input_params['lc_method'])

        if not ('lc_entropy_threshold' in input_params
                or 'lc_dust_threshold' in input_params):
            raise ValueError(
                ("A low complexity threshold needs to be " +
                 "entered for {}".format(input_params['lc_method'])))
        elif input_params['lc_method'] == 'dust':
            if 'lc_dust_threshold' not in input_params:
                raise ValueError(
                    ("A low complexity threshold needs to be " +
                     "entered for {}".format(input_params['lc_method'])))
            else:
                lc_threshold = input_params['lc_dust_threshold']
        else:
            if 'lc_entropy_threshold' not in input_params:
                raise ValueError(
                    ("A low complexity threshold needs to be " +
                     "entered for {}".format(input_params['lc_method'])))
            else:
                lc_threshold = input_params['lc_entropy_threshold']

        if (lc_threshold < 0.0) or (lc_threshold > 100.0):
            raise ValueError((
                "The threshold for {} must be between 0 and 100, it is currently "
                + "set to : {}").format(input_params['lc_method'],
                                        lc_threshold))
        reportObj = {'objects_created': [], 'text_message': ''}

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [
            str(input_params['input_reads_ref'])
        ]

        # GET THE READS OBJECT
        # Determine whether read library or read set is input object
        #
        try:
            # object_info tuple
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': input_params['input_reads_ref']
                }]})[0]
            input_reads_obj_type = input_reads_obj_info[TYPE_I]
            # input_reads_obj_version = input_reads_obj_info[VERSION_I]
            # this is object version, not type version

        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                str(input_params['input_reads_ref']) + ')' + str(e))

        # self.log (console, "B4 TYPE: '" +
        #           str(input_reads_obj_type) +
        #           "' VERSION: '" + str(input_reads_obj_version)+"'")
        # remove trailing version
        input_reads_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                      input_reads_obj_type)
        # self.log (console, "AF TYPE: '"+str(input_reads_obj_type)+"' VERSION: '" +
        # str(input_reads_obj_version)+"'")

        # maybe add below later "KBaseSets.ReadsSet",
        acceptable_types = [
            "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary",
            "KBaseAssembly.SingleEndLibrary", "KBaseFile.SingleEndLibrary"
        ]
        if input_reads_obj_type not in acceptable_types:
            raise ValueError("Input reads of type: '" + input_reads_obj_type +
                             "'.  Must be one of " +
                             ", ".join(acceptable_types))

        if input_reads_obj_type in [
                "KBaseFile.PairedEndLibrary", "KBaseAssembly.PairedEndLibrary"
        ]:
            read_type = 'PE'
        elif input_reads_obj_type in [
                "KBaseFile.SingleEndLibrary", "KBaseAssembly.SingleEndLibrary"
        ]:
            read_type = 'SE'

        # Instantiate ReadsUtils
        try:
            readsUtils_Client = ReadsUtils(url=self.callback_url,
                                           token=ctx['token'])  # SDK local
            self._log(None, 'Starting Read File(s) Download')
            readsLibrary = readsUtils_Client.download_reads({
                'read_libraries': [input_params['input_reads_ref']],
                'interleaved':
                'false'
            })
            self._log(None, 'Completed Read File(s) Downloading')
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: ({})\n{}'.format(
                    str(input_params['input_reads_ref']), str(e)))

        # get WS metadata to get obj_name
        ws = workspaceService(self.ws_url)
        try:
            info = ws.get_object_info_new(
                {'objects': [{
                    'ref': input_params['input_reads_ref']
                }]})[0]
        except Exception as wse:
            self._log(console, 'Logging workspace exception')
            self._log(console, str(wse))
            raise

        #determine new object base name
        new_object_name = info[1]
        if ('output_reads_name' in input_params
                and input_params['output_reads_name'] != ''
                and input_params['output_reads_name'] is not None):
            new_object_name = input_params['output_reads_name']

        # MAKE A DIRECTORY TO PUT THE READ FILE(S)
        # create the output directory and move the file there
        # PUT FILES INTO THE DIRECTORY
        # Sanitize the file names
        tempdir = tempfile.mkdtemp(dir=self.scratch)
        export_dir = os.path.join(tempdir, info[1])
        os.makedirs(export_dir)

        if read_type == 'PE':
            # IF PAIRED END, potentially 6 files created
            # one of each for the two directions(good(paired), good_singletons, bad)
            # Take the good paired and (re)upload new reads object.
            # We throw out the bad reads

            input_files_info = self._setup_pe_files(readsLibrary, export_dir,
                                                    input_params)

            # RUN PRINSEQ with user options (lc_method and lc_threshold)
            cmd = (
                "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} "
                "-fastq2 {} -out_format 3 -lc_method {} "
                "-lc_threshold {}").format(
                    input_files_info["fastq_file_path"],
                    input_files_info["fastq2_file_path"],
                    input_params['lc_method'], lc_threshold)
            print "Command to be run : " + cmd
            args = shlex.split(cmd)
            perl_script = subprocess.Popen(args,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            output = perl_script.communicate()
            found_results = False
            file_names_dict = dict()
            for element in output:
                if "Input and filter stats:" in element:
                    found_results = True
                    element_parts = element.split("Input and filter stats:")
                    # PRINSEQ OUTPUT
                    report = "Input and filter stats:{}".format(
                        element_parts[1])
                    reportObj['text_message'] = report
                    read_files_list = os.listdir(export_dir)

                    # proc = subprocess.Popen(['ls', '-l', export_dir], stdout=subprocess.PIPE)
                    # proc_output = proc.stdout.read()
                    # print "PROC OUTPUT : " + proc_output

                    for read_filename in read_files_list:
                        file_direction = None
                        print "Read File : {}".format(read_filename)
                        # determine if forward(fastq) or reverse(fastq2) file
                        if input_files_info["fastq_filename"] in read_filename:
                            file_direction = "fwd"
                        elif input_files_info[
                                "fastq2_filename"] in read_filename:
                            file_direction = "rev"
                        if file_direction is not None:
                            # determine good singleton or good part of a pair.
                            print "TEST: {}_prinseq_good_".format(
                                input_files_info["fastq_filename"])
                            if ("{}_prinseq_good_singletons".format(
                                    input_files_info["fastq_filename"])
                                    in read_filename
                                    or "{}_prinseq_good_singletons".format(
                                        input_files_info["fastq2_filename"])
                                    in read_filename):
                                # Unpaired singletons that need to be
                                # saved as a new single end reads object
                                file_names_dict["{}_good_singletons".format(file_direction)] = \
                                    os.path.join(export_dir, read_filename)
                            elif ("{}_prinseq_good_".format(
                                    input_files_info["fastq_filename"])
                                  in read_filename
                                  or "{}_prinseq_good_".format(
                                      input_files_info["fastq2_filename"])
                                  in read_filename):
                                file_names_dict["{}_good_pair".format(file_direction)] = \
                                    os.path.join(export_dir, read_filename)
                    if (('fwd_good_pair' in file_names_dict)
                            and ('rev_good_pair' in file_names_dict)):
                        self._log(None, 'Saving new Paired End Reads')
                        returnVal['filtered_paired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': new_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                                file_names_dict['fwd_good_pair'],
                                                            'rev_file':
                                                                file_names_dict['rev_good_pair']
                                                            }
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['filtered_paired_end_ref'],
                            'description':
                            'Filtered Paired End Reads',
                            'object_name':
                            new_object_name
                        })
                        print "REFERENCE : " + str(
                            returnVal['filtered_paired_end_ref'])
                    else:
                        reportObj['text_message'] += \
                            "\n\nNo good matching pairs passed low complexity filtering.\n" + \
                            "Consider loosening the threshold value.\n"
                    if 'fwd_good_singletons' in file_names_dict:
                        self._log(None, 'Saving new Forward Unpaired Reads')
                        fwd_object_name = "{}_fwd_singletons".format(
                            new_object_name)
                        returnVal['output_filtered_fwd_unpaired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': fwd_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                            file_names_dict['fwd_good_singletons']}
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['output_filtered_fwd_unpaired_end_ref'],
                            'description':
                            'Filtered Forward Unpaired End Reads',
                            'object_name':
                            fwd_object_name
                        })
                        print "REFERENCE : " + \
                            str(returnVal['output_filtered_fwd_unpaired_end_ref'])
                    if 'rev_good_singletons' in file_names_dict:
                        self._log(None, 'Saving new Reverse Unpaired Reads')
                        rev_object_name = "{}_rev_singletons".format(
                            new_object_name)
                        returnVal['output_filtered_rev_unpaired_end_ref'] = \
                            readsUtils_Client.upload_reads({'wsname':
                                                            str(input_params['output_ws']),
                                                            'name': rev_object_name,
                                                            'source_reads_ref':
                                                            input_params['input_reads_ref'],
                                                            'fwd_file':
                                                            file_names_dict['rev_good_singletons']}
                                                           )['obj_ref']
                        reportObj['objects_created'].append({
                            'ref':
                            returnVal['output_filtered_rev_unpaired_end_ref'],
                            'description':
                            'Filtered Reverse Unpaired End Reads',
                            'object_name':
                            rev_object_name
                        })
                        print "REFERENCE : " + \
                            str(returnVal['output_filtered_rev_unpaired_end_ref'])
                    if len(reportObj['objects_created']) > 0:
                        reportObj['text_message'] += "\nOBJECTS CREATED :\n"
                        for obj in reportObj['objects_created']:
                            reportObj['text_message'] += "{} : {}\n".format(
                                obj['object_name'], obj['description'])
                    else:
                        reportObj['text_message'] += \
                            "\nFiltering filtered out all reads. No objects made.\n"
            if not found_results:
                raise Exception('Unable to execute PRINSEQ, Error: {}'.format(
                    str(output)))
            print "FILES DICT : {}".format(str(file_names_dict))
            print "REPORT OBJECT :"
            print str(reportObj)

        elif read_type == 'SE':
            # Download reads Libs to FASTQ files
            # IF SINGLE END INPUT 2 files created (good and bad)
            # Take good and (re)upload new reads object
            input_fwd_file_path = \
                readsLibrary['files'][input_params['input_reads_ref']]['files']['fwd']
            fastq_filename = self._sanitize_file_name(
                os.path.basename(input_fwd_file_path))
            fastq_file_path = os.path.join(export_dir, fastq_filename)
            shutil.move(input_fwd_file_path, fastq_file_path)

            # RUN PRINSEQ with user options (lc_method and lc_threshold)
            cmd = (
                "perl /opt/lib/prinseq-lite-0.20.4/prinseq-lite.pl -fastq {} "
                "-out_format 3 -lc_method {} "
                "-lc_threshold {}").format(fastq_file_path,
                                           input_params['lc_method'],
                                           lc_threshold)
            print "Command to be run : " + cmd
            args = shlex.split(cmd)
            print "ARGS:  " + str(args)
            perl_script = subprocess.Popen(args,
                                           stdout=subprocess.PIPE,
                                           stderr=subprocess.PIPE)
            output = perl_script.communicate()
            print "OUTPUT: " + str(output)
            found_results = False
            found_se_filtered_file = False
            file_names_dict = dict()
            for element in output:
                if "Input and filter stats:" in element:
                    found_results = True
                    element_parts = element.split("Input and filter stats:")
                    # PRINSEQ OUTPUT
                    report = "Input and filter stats:{}".format(
                        element_parts[1])
                    reportObj['text_message'] = report
                    read_files_list = os.listdir(export_dir)

                    for read_filename in read_files_list:
                        print "Early Read File : {}".format(read_filename)

                    for read_filename in read_files_list:
                        print "Read File : {}".format(read_filename)
                        if ("{}_prinseq_good_".format(fastq_filename)
                                in read_filename):
                            #Found Good file. Save the Reads objects
                            self._log(None, 'Saving Filtered Single End Reads')
                            returnVal['output_filtered_single_end_ref'] = \
                                readsUtils_Client.upload_reads({'wsname':
                                                                str(input_params['output_ws']),
                                                                'name': new_object_name,
                                                                'source_reads_ref':
                                                                input_params['input_reads_ref'],
                                                                'fwd_file':
                                                                    os.path.join(export_dir,
                                                                                 read_filename)}
                                                               )['obj_ref']
                            reportObj['objects_created'].append({
                                'ref':
                                returnVal['output_filtered_single_end_ref'],
                                'description':
                                'Filtered Single End Reads'
                            })
                            print "REFERENCE : " + str(
                                returnVal['output_filtered_single_end_ref'])
                            found_se_filtered_file = True
                            break
            if not found_se_filtered_file:
                reportObj['text_message'] += \
                    "\n\nNone of the reads passed low complexity filtering.\n" + \
                    "Consider loosening the threshold value.\n"
            if not found_results:
                raise Exception('Unable to execute PRINSEQ, Error: {}'.format(
                    str(output)))
            print "FILES DICT : {}".format(str(file_names_dict))
            print "REPORT OBJECT :"
            print str(reportObj)

        # save report object
        #
        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report = KBaseReport(self.callback_url, token=ctx['token'], service_ver=SERVICE_VER)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': input_params['output_ws']
        })

        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END execReadLibraryPRINSEQ

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method execReadLibraryPRINSEQ return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check fastq_dump result is PE or SE
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path, params):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            self._validate_paired_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            self._validate_single_end_advanced_params(params)
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file, 'rev_file': rev_file}
        return fastq_file_path

    def _validate_single_end_advanced_params(self, params):
        """
        _validate_single_end_advanced_params: validate advanced params for single end reads
        """
        if (params.get('insert_size_mean') or params.get('insert_size_std_dev')
                or params.get('read_orientation_outward')):
            error_msg = 'Advanced params "Mean Insert Size", "St. Dev. of Insert Size" or '
            error_msg += '"Reads Orientation Outward" is Paried End Reads specific'
            raise ValueError(error_msg)

        if 'interleaved' in params:
            del params['interleaved']

    def _validate_paired_end_advanced_params(self, params):
        """
        _validate_paired_end_advanced_params: validate advanced params for paired end reads

        """
        sequencing_tech = params.get('sequencing_tech')

        if sequencing_tech in ['PacBio CCS', 'PacBio CLR']:
            error_msg = 'Sequencing Technology: "PacBio CCS" or "PacBio CLR" '
            error_msg += 'is Single End Reads specific'
            raise ValueError(error_msg)

    def _validate_upload_staging_file_availability(self,
                                                   staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validates file availability in user's staging area

        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #                                         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'],
                                    'import_SRA_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_sra_from_staging(self, params):
        '''
          import_sra_from_staging: import an SRA file from the user's staging area as a reads object

          required params:
          staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          sequencing_tech: sequencing technology
          name: output reads file name
          workspace_name: workspace name/ID of the object

          Optional Params:
          single_genome: whether the reads are from a single genome or a metagenome.
          insert_size_mean: mean (average) insert length
          insert_size_std_dev: standard deviation of insert lengths
          read_orientation_outward: whether reads in a pair point outward

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path, params)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), returnVal['obj_ref'])
        return returnVal
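
    # A hypothetical end-to-end call, assuming 'config' carries the keys used
    # in __init__ above; the parameter values are placeholders for illustration:
    #
    #   importer = ImportSRAUtil(config)
    #   result = importer.import_sra_from_staging({
    #       'staging_file_subdir_path': 'subdir_1/my_reads.sra',
    #       'sequencing_tech': 'Illumina',
    #       'name': 'my_reads_obj',
    #       'workspace_name': 'my_workspace'
    #   })
    #   reads_ref = result['obj_ref']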

    def import_sra_from_web(self, params):
        '''
        import_sra_from_web: import SRA files from web URLs as reads objects

        required params:
        download_type: download type for web source fastq file
                       ('Direct Download', 'FTP', 'DropBox', 'Google Drive')
        workspace_name: workspace name/ID of the object

        sra_urls_to_add: dict of SRA file URLs
            required params:
            file_url: SRA file URL
            sequencing_tech: sequencing technology
            name: output reads file name

            Optional Params:
            single_genome: whether the reads are from a single genome or a metagenome.
            insert_size_mean: mean (average) insert length
            insert_size_std_dev: standard deviation of insert lengths
            read_orientation_outward: whether reads in a pair point outward

        return:
        obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_web\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_web_params(params)

        download_type = params.get('download_type')
        workspace_name = params.get('workspace_name')

        obj_refs = []
        uploaded_files = []

        for sra_url_to_add in params.get('sra_urls_to_add'):
            download_web_file_params = {
                'download_type': download_type,
                'file_url': sra_url_to_add.get('file_url')
            }
            scratch_sra_file_path = self.dfu.download_web_file(
                download_web_file_params).get('copy_file_path')
            log('Downloaded web file to: {}'.format(scratch_sra_file_path))

            fastq_file_path = self._sra_to_fastq(scratch_sra_file_path,
                                                 sra_url_to_add)

            import_sra_reads_params = sra_url_to_add
            import_sra_reads_params.update(fastq_file_path)

            workspace_name_or_id = workspace_name
            if str(workspace_name_or_id).isdigit():
                import_sra_reads_params['wsid'] = int(workspace_name_or_id)
            else:
                import_sra_reads_params['wsname'] = str(workspace_name_or_id)

            log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
                json.dumps(import_sra_reads_params, indent=1)))

            obj_ref = self.ru.upload_reads(import_sra_reads_params).get(
                'obj_ref')
            obj_refs.append(obj_ref)
            uploaded_files.append(sra_url_to_add.get('file_url'))

        return {'obj_refs': obj_refs, 'uploaded_files': uploaded_files}

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params:
                    validates params passed to import_sra_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'sequencing_tech', 'name',
                'workspace_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(
            params.get('staging_file_subdir_path'))

    def validate_import_sra_from_web_params(self, params):
        """
        validate_import_sra_from_web_params:
                    validates params passed to import_sra_from_web method
        """
        # check for required parameters
        for p in ['download_type', 'workspace_name', 'sra_urls_to_add']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        if not isinstance(params.get('sra_urls_to_add'), list):
            raise ValueError('sra_urls_to_add is not type list as required')

        for sra_url_to_add in params.get('sra_urls_to_add'):
            for p in ['file_url', 'sequencing_tech', 'name']:
                if p not in sra_url_to_add:
                    raise ValueError(
                        '"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_refs_list, params):
        """
        generate_report: generate summary report

        obj_refs_list: generated workspace object references (return of import_sra_from_staging/web)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """
        uuid_string = str(uuid.uuid4())

        objects_created = list()
        objects_data = list()

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }
            objects_data.append(self.dfu.get_objects(get_objects_params))

            objects_created.append({
                'ref': obj_ref,
                'description': 'Imported Reads'
            })

        output_html_files = self.generate_html_report(objects_data, params,
                                                      uuid_string)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 460,
            'report_object_name': 'kb_sra_upload_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def generate_html_report(self, reads_objs, params, uuid_string):
        """
        generate_html_report: generate HTML summary report
        """
        log('Start generating html report')
        pprint(params)

        result_file_path = os.path.join(self.scratch, 'report.html')
        html_report = list()
        objects_content = ''

        for index, reads_obj in enumerate(reads_objs):

            idx = str(index)
            reads_data = reads_obj.get('data')[0].get('data')
            reads_info = reads_obj.get('data')[0].get('info')
            reads_ref = str(reads_info[6]) + '/' + str(
                reads_info[0]) + '/' + str(reads_info[4])
            reads_obj_name = str(reads_info[1])

            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template_sra/table_panel.html'),
                    'r') as object_content_file:
                report_template = object_content_file.read()
                report_template = report_template.replace('_NUM', str(idx))
                report_template = report_template.replace(
                    'OBJECT_NAME', reads_obj_name)
                if index == 0:
                    report_template = report_template.replace(
                        'panel-collapse collapse',
                        'panel-collapse collapse in')

            objects_content += report_template
            base_percentages = ''
            for key, val in reads_data.get('base_percentages').iteritems():
                base_percentages += '{}({}%) '.format(key, val)

            reads_overview_data = collections.OrderedDict()

            reads_overview_data['Name'] = '{} ({})'.format(
                reads_obj_name, reads_ref)
            reads_overview_data['Uploaded File'] = params.get(
                'uploaded_files')[index]
            reads_overview_data['Date Uploaded'] = time.strftime("%c")
            reads_overview_data['Number of Reads'] = '{:,}'.format(
                reads_data.get('read_count'))

            reads_type = reads_info[2].lower()
            if 'single' in reads_type:
                reads_overview_data['Type'] = 'Single End'
            elif 'paired' in reads_type:
                reads_overview_data['Type'] = 'Paired End'
            else:
                reads_overview_data['Type'] = 'Unknown'

            reads_overview_data['Platform'] = reads_data.get(
                'sequencing_tech', 'Unknown')

            reads_single_genome = str(
                reads_data.get('single_genome', 'Unknown'))
            if '0' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'No'
            elif '1' in reads_single_genome:
                reads_overview_data['Single Genome'] = 'Yes'
            else:
                reads_overview_data['Single Genome'] = 'Unknown'

            insert_size_mean = params.get('insert_size_mean', 'Not Specified')
            if insert_size_mean is not None:
                reads_overview_data['Insert Size Mean'] = str(insert_size_mean)
            else:
                reads_overview_data['Insert Size Mean'] = 'Not Specified'

            insert_size_std_dev = params.get('insert_size_std_dev',
                                             'Not Specified')
            if insert_size_std_dev is not None:
                reads_overview_data['Insert Size Std Dev'] = str(
                    insert_size_std_dev)
            else:
                reads_overview_data['Insert Size Std Dev'] = 'Not Specified'

            reads_outward_orientation = str(
                reads_data.get('read_orientation_outward', 'Unknown'))
            if '0' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'No'
            elif '1' in reads_outward_orientation:
                reads_overview_data['Outward Read Orientation'] = 'Yes'
            else:
                reads_overview_data['Outward Read Orientation'] = 'Unknown'

            reads_stats_data = collections.OrderedDict()

            reads_stats_data['Number of Reads'] = '{:,}'.format(
                reads_data.get('read_count'))
            reads_stats_data['Total Number of Bases'] = '{:,}'.format(
                reads_data.get('total_bases'))
            reads_stats_data['Mean Read Length'] = str(
                reads_data.get('read_length_mean'))
            reads_stats_data['Read Length Std Dev'] = str(
                reads_data.get('read_length_stdev'))
            dup_reads_percent = '{:.2f}'.format(float(reads_data.get('number_of_duplicates') * 100) / \
                                                reads_data.get('read_count'))
            reads_stats_data['Number of Duplicate Reads(%)'] = '{} ({}%)' \
                .format(str(reads_data.get('number_of_duplicates')),
                        dup_reads_percent)
            reads_stats_data['Phred Type'] = str(reads_data.get('phred_type'))
            reads_stats_data['Quality Score Mean'] = '{0:.2f}'.format(
                reads_data.get('qual_mean'))
            reads_stats_data['Quality Score (Min/Max)'] = '{}/{}'.format(
                str(reads_data.get('qual_min')),
                str(reads_data.get('qual_max')))
            reads_stats_data['GC Percentage'] = str(
                round(reads_data.get('gc_content') * 100, 2)) + '%'
            reads_stats_data['Base Percentages'] = base_percentages

            overview_content = ''
            for key, val in reads_overview_data.iteritems():
                overview_content += '<tr><td><b>{}</b></td>'.format(key)
                overview_content += '<td>{}</td>'.format(val)
                overview_content += '</tr>'

            stats_content = ''
            for key, val in reads_stats_data.iteritems():
                stats_content += '<tr><td><b>{}</b></td>'.format(key)
                stats_content += '<td>{}</td>'.format(val)
                stats_content += '</tr>'

            objects_content = objects_content.replace('###OVERVIEW_CONTENT###',
                                                      overview_content)
            objects_content = objects_content.replace('###STATS_CONTENT###',
                                                      stats_content)

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template_sra/report_head.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '###TABLE_PANELS_CONTENT###', objects_content)
                result_file.write(report_template)

        shutil.copytree(
            os.path.join(os.path.dirname(__file__),
                         'report_template_sra/bootstrap-3.3.7'),
            os.path.join(self.scratch, 'bootstrap-3.3.7'))
        shutil.copy(
            os.path.join(os.path.dirname(__file__),
                         'report_template_sra/jquery-3.2.1.min.js'),
            os.path.join(self.scratch, 'jquery-3.2.1.min.js'))

        matched_files = []
        for root, dirnames, filenames in os.walk(self.scratch):
            for filename in fnmatch.filter(filenames, '*.gz'):
                matched_files.append(os.path.join(root, filename))

        for gz_file in matched_files:
            print('Removing ' + gz_file)
            os.remove(gz_file)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']
        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for imported reads'
        })
        return html_report
Example #22
    def KButil_Build_InSilico_Metagenomes_with_Grinder(self, ctx, params):
        """
        :param params: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Params"
           (KButil_Build_InSilico_Metagenomes_with_Grinder() ** **  Use
           Grinder to generate in silico shotgun metagenomes) -> structure:
           parameter "workspace_name" of type "workspace_name" (** The
           workspace object refs are of form: ** **    objects =
           ws.get_objects([{'ref':
           params['workspace_id']+'/'+params['obj_name']}]) ** ** "ref" means
           the entire name combining the workspace id and the object name **
           "id" is a numerical identifier of the workspace or object, and
           should just be used for workspace ** "name" is a string identifier
           of a workspace or object.  This is received from Narrative.),
           parameter "input_refs" of type "data_obj_ref", parameter
           "output_name" of type "data_obj_name", parameter "desc" of String,
           parameter "num_reads_per_lib" of Long, parameter
           "population_percs" of String, parameter "read_len_mean" of Long,
           parameter "read_len_stddev" of Double, parameter "pairs_flag" of
           Long, parameter "mate_orientation" of String, parameter
           "insert_len_mean" of Long, parameter "insert_len_stddev" of
           Double, parameter "mutation_dist" of String, parameter
           "mutation_ratio" of String, parameter "qual_good" of Long,
           parameter "qual_bad" of Long, parameter "len_bias_flag" of Long,
           parameter "random_seed" of Long
        :returns: instance of type
           "KButil_Build_InSilico_Metagenomes_with_Grinder_Output" ->
           structure: parameter "report_name" of type "data_obj_name",
           parameter "report_ref" of type "data_obj_ref"
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN KButil_Build_InSilico_Metagenomes_with_Grinder

        #### STEP 0: basic init
        ##
        console = []
        invalid_msgs = []
        report_text = ''
        self.log(console,
                 'Running KButil_Build_InSilico_Metagenomes_with_Grinder(): ')
        self.log(console, "\n" + pformat(params))

        # Auth
        token = ctx['token']
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        # API Clients
        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'
        wsClient = workspaceService(self.workspaceURL, token=token)
        readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                       token=ctx['token'])  # SDK local
        #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # for SDK local.  local doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.serviceWizardURL,
                               token=ctx['token'])  # for dynamic service
        auClient = AssemblyUtil(self.callbackURL,
                                token=ctx['token'],
                                service_ver=SERVICE_VER)
        dfu = DFUClient(self.callbackURL)

        # param checks
        required_params = [
            'workspace_name', 'input_refs', 'output_name', 'num_reads_per_lib',
            'population_percs', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'mate_orientation', 'insert_len_mean',
            'insert_len_stddev', 'mutation_dist', 'mutation_ratio',
            'qual_good', 'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in required_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # cast numerical params to str (they are mostly used in string context below)
        numerical_params = [
            'num_reads_per_lib', 'read_len_mean', 'read_len_stddev',
            'pairs_flag', 'insert_len_mean', 'insert_len_stddev', 'qual_good',
            'qual_bad', 'len_bias_flag', 'random_seed'
        ]
        for arg in numerical_params:
            if arg not in params or params[arg] == None or params[arg] == '':
                continue
            params[arg] = str(params[arg])

        # load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        provenance[0]['input_ws_objects'] = []
        for input_ref in params['input_refs']:
            provenance[0]['input_ws_objects'].append(input_ref)

        # set the output paths
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        html_output_dir = os.path.join(output_dir, 'html')
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)

        #### STEP 1: Parse population_percs and write to file
        ##
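        # The parser below expects a whitespace-delimited table, roughly like
        # (values hypothetical; '%' signs are optional and stripped):
        #   GENOME  Samp1  Samp2
        #   G1      70%    30%
        #   G2      30%    70%
        # Each non-header row is a genome id followed by one percentage per
        # sample; rows must have the same number of samples and each sample's
        # percentages should sum to roughly 100.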
        abundance_str = params['population_percs'].strip()
        abundance_file_path = os.path.join(output_dir, 'my_abundances.txt')
        abundance_config_num_libs = 0
        abundance_config_num_libs_set = False
        grinder_genome_ids = []
        header = []
        out_buf = []

        for row in abundance_str.split("\n"):
            cols = re.split(r'\s+', row)
            if cols[0].upper() == "GENOME":
                for col in cols:
                    if col == '':
                        continue
                    header.append(col)
                continue
            grinder_genome_ids.append(cols[0])
            self.log(console, "GRINDER GENOME ID: '" + cols[0] + "'")  # DEBUG
            out_row = []
            for col in cols:
                if col == '':
                    continue
                elif col == '%':
                    continue
                elif col.endswith('%'):
                    col = col.rstrip('%')
                out_row.append(col)
            out_buf.append("\t".join(out_row))
            num_samples = len(out_row) - 1  # first col is genome id
            if not abundance_config_num_libs_set:
                abundance_config_num_libs_set = True
                abundance_config_num_libs = num_samples
            elif num_samples != abundance_config_num_libs:
                invalid_msgs.append(
                    "inconsistent number of samples in population_percs input field"
                )
        # data validation
        if abundance_config_num_libs == 0:
            invalid_msgs.append(
                "unable to find sample percentages in population_percs input field"
            )
        sample_sums = []
        for row_i, abund_row_str in enumerate(out_buf):
            abund_row = abund_row_str.split()
            for sample_i, abund in enumerate(abund_row[1:]):
                if row_i == 0:
                    sample_sums.append(0)
                #self.log (console, "row_i: "+str(row_i)+" sample_i: "+str(sample_i))  # DEBUG
                sample_sums[sample_i] += float(abund)
        for sample_i, sample_sum in enumerate(sample_sums):
            if sample_sum < 99.5 or sample_sum > 100.5:
                self.log(
                    invalid_msgs, "Sample: " + str(sample_i + 1) + " " +
                    header[sample_i + 1] +
                    " proportions is not summing to 100.0. Summing to: " +
                    str(sample_sum))

        if len(invalid_msgs) == 0:
            with open(abundance_file_path, 'w') as abundance_fh:
                for out_line in out_buf:
                    abundance_fh.write(out_line + "\n")
            # DEBUG
            with open(abundance_file_path, 'r') as abundance_fh:
                for out_line in abundance_fh.readlines():
                    out_line = out_line.rstrip()
                    self.log(console, "ABUNDANCE_CONFIG: '" + out_line + "'")

        #### STEP 2: get genome scaffold sequences
        ##
        if len(invalid_msgs) == 0:
            genomes_src_db_file_path = os.path.join(output_dir, 'genomes.fna')
            read_buf_size = 65536
            write_buf_size = 65536
            accepted_input_types = ["KBaseGenomes.Genome"]
            genome_refs = params['input_refs']
            genome_obj_names = []
            genome_sci_names = []
            assembly_refs = []

            for i, input_ref in enumerate(genome_refs):
                # genome obj info
                try:
                    [
                        OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I,
                        SAVED_BY_I, WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I,
                        META_I
                    ] = range(11)  # object_info tuple
                    input_obj_info = wsClient.get_object_info_new(
                        {'objects': [{
                            'ref': input_ref
                        }]})[0]
                    input_obj_type = re.sub(
                        '-[0-9]+\.[0-9]+$', "",
                        input_obj_info[TYPE_I])  # remove trailing version
                    genome_obj_names.append(input_obj_info[NAME_I])

                except Exception as e:
                    raise ValueError('Unable to get object from workspace: (' +
                                     input_ref + ')' + str(e))
                if input_obj_type not in accepted_input_types:
                    raise ValueError("Input object of type '" +
                                     input_obj_type +
                                     "' not accepted.  Must be one of " +
                                     ", ".join(accepted_input_types))

                # genome obj data
                try:
                    genome_obj = wsClient.get_objects([{
                        'ref': input_ref
                    }])[0]['data']
                    genome_sci_names.append(genome_obj['scientific_name'])
                except:
                    raise ValueError("unable to fetch genome: " + input_ref)

                # Get assembly_refs
                if ('contigset_ref' not in genome_obj or genome_obj['contigset_ref'] == None) \
                   and ('assembly_ref' not in genome_obj or genome_obj['assembly_ref'] == None):
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " MISSING BOTH contigset_ref AND assembly_ref.  Cannot process.  Exiting."
                    self.log(console, msg)
                    self.log(invalid_msgs, msg)
                    continue
                elif 'assembly_ref' in genome_obj and genome_obj[
                        'assembly_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING assembly_ref: " + str(
                                genome_obj['assembly_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['assembly_ref'])
                elif 'contigset_ref' in genome_obj and genome_obj[
                        'contigset_ref'] != None:
                    msg = "Genome " + genome_obj_names[
                        i] + " (ref:" + input_ref + ") " + genome_sci_names[
                            i] + " USING contigset_ref: " + str(
                                genome_obj['contigset_ref'])
                    self.log(console, msg)
                    assembly_refs.append(genome_obj['contigset_ref'])

        # get fastas for scaffolds
        if len(invalid_msgs) == 0:
            contig_file_paths = []

            for genome_i, input_ref in enumerate(genome_refs):
                contig_file = auClient.get_assembly_as_fasta({
                    'ref':
                    assembly_refs[genome_i]
                }).get('path')
                sys.stdout.flush()
                contig_file_path = dfu.unpack_file({'file_path':
                                                    contig_file})['file_path']
                contig_file_paths.append(contig_file_path)

            # reformat FASTA IDs for Grinder
            with open(genomes_src_db_file_path, 'w',
                      write_buf_size) as genomes_src_db_fh:
                for genome_i, contig_file_path in enumerate(contig_file_paths):
                    #self.log(console,str(genome_i)+" CONTIG_FILE: "+contig_file_path)  # DEBUG
                    #contig_ids = []
                    with open(contig_file_path, 'r',
                              read_buf_size) as contig_fh:
                        genome_seq = ''
                        contig_seq = ''
                        contig_seqs = []
                        for contig_line in contig_fh.readlines():
                            contig_line = contig_line.rstrip()
                            if contig_line.startswith('>'):
                                #contig_id = contig_line.strip()[1:].split(' ')[0]
                                #contig_ids.append(contig_id)
                                #genomes_src_db_fh.write(">"+grinder_genome_ids[genome_i]+"\n")
                                if contig_seq != '':
                                    contig_seqs.append(contig_seq)
                                    contig_seq = ''
                                    continue
                            else:
                                #genomes_src_db_fh.write(contig_line)
                                contig_seq += contig_line
                        if contig_seq != '':
                            contig_seqs.append(contig_seq)
                            contig_seq = ''

                    # write joined contigs to file
                    genome_seq = "NNNNNNNNNN".join(
                        contig_seqs
                    )  # NOTE: Using "-exclude_chars" grinder opt on N to avoid contig joins
                    genome_seq = genome_seq.upper(
                    )  # grinder might require upper case?
                    genomes_src_db_fh.write(">" +
                                            grinder_genome_ids[genome_i] +
                                            "\n")
                    genomes_src_db_fh.write(genome_seq + "\n")
                    genome_seq = ''
                    contig_seqs = []

                    # DEBUG
                    #for contig_id in contig_ids:
                    #    self.log(console, "\tCONTIG_ID: "+contig_id)  # DEBUG
            # DEBUG
            toggle = 0
            with open(genomes_src_db_file_path, 'r',
                      write_buf_size) as genomes_src_db_fh:
                for contig_line in genomes_src_db_fh.readlines():
                    contig_line = contig_line.rstrip()
                    if contig_line.startswith('>'):
                        self.log(console, 'GENOMES_SRC_DB: ' + contig_line)
                        genome_id = contig_line[1:]
                        toggle = 0
                    elif toggle == 0:
                        #elif genome_id == 'G3':
                        self.log(
                            console,
                            'GENOMES_SRC_DB: ' + contig_line[0:50] + '...')
                        toggle += 1

        #### STEP 3: Run Grinder
        ##
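        # For orientation, the command assembled below ends up roughly like
        # (paths and values illustrative, taken from the params above):
        #   grinder -base_name <output_name> -output_dir <output_dir>
        #           -reference_file genomes.fna -abundance_file my_abundances.txt
        #           -total_reads <num_reads_per_lib> -num_libraries <n_libs>
        #           -read_dist <read_len_mean> normal <read_len_stddev>
        #           [-insert_dist <mean> normal <stddev> -mate_orientation <orient>]
        #           -length_bias <flag> -mutation_dist <dist> -mutation_ratio <ratio>
        #           -fastq_output 1 -qual_levels <good> <bad>
        #           -exclude_chars NX -unidirectional 0 [-random_seed <seed>]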
        if len(invalid_msgs) == 0:
            cmd = []
            cmd.append(self.GRINDER)
            # output
            cmd.append('-base_name')
            cmd.append(params['output_name'])
            cmd.append('-output_dir')
            cmd.append(output_dir)
            # contigs input
            cmd.append('-reference_file')
            cmd.append(genomes_src_db_file_path)
            # abundances
            cmd.append('-abundance_file')
            cmd.append(abundance_file_path)
            # library size
            cmd.append('-total_reads')
            cmd.append(str(params['num_reads_per_lib']))
            # num libraries (overridden by abundance file?)
            cmd.append('-num_libraries')
            cmd.append(str(abundance_config_num_libs))
            # read and insert lens
            cmd.append('-read_dist')
            cmd.append(str(params['read_len_mean']))
            cmd.append('normal')
            cmd.append(str(params['read_len_stddev']))
            if str(params['pairs_flag']) == '1':
                cmd.append('-insert_dist')
                cmd.append(str(params['insert_len_mean']))
                cmd.append('normal')
                cmd.append(str(params['insert_len_stddev']))
                # mate orientation
                cmd.append('-mate_orientation')
                cmd.append(params['mate_orientation'])
            # genome len bias
            cmd.append('-length_bias')
            cmd.append(str(params['len_bias_flag']))
            # mutation model
            cmd.append('-mutation_dist')
            cmd.append(str(params['mutation_dist']))
            cmd.append('-mutation_ratio')
            cmd.append(str(params['mutation_ratio']))
            # qual scores
            cmd.append('-fastq_output')
            cmd.append('1')
            cmd.append('-qual_levels')
            cmd.append(str(params['qual_good']))
            cmd.append(str(params['qual_bad']))
            # skip contig joins
            cmd.append('-exclude_chars')
            cmd.append('NX')
            # explicitly request bidirectional
            cmd.append('-unidirectional')
            cmd.append('0')
            # random seed
            if 'random_seed' in params and params[
                    'random_seed'] != None and params['random_seed'] != '':
                cmd.append('-random_seed')
                cmd.append(str(params['random_seed']))

            # RUN
            cmd_str = " ".join(cmd)
            self.log(console, "===========================================")
            self.log(console, "RUNNING: " + cmd_str)
            self.log(console, "===========================================")

            cmdProcess = subprocess.Popen(cmd_str,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.STDOUT,
                                          shell=True)
            outputlines = []
            while True:
                line = cmdProcess.stdout.readline()
                outputlines.append(line)
                if not line: break
                self.log(console, line.replace('\n', ''))

            cmdProcess.stdout.close()
            cmdProcess.wait()
            self.log(console,
                     'return code: ' + str(cmdProcess.returncode) + '\n')
            if cmdProcess.returncode != 0:
                raise ValueError('Error running kb_grinder, return code: ' +
                                 str(cmdProcess.returncode) + '\n')

            #report_text += "\n".join(outputlines)
            #report_text += "cmdstring: " + cmdstring + " stdout: " + stdout + " stderr " + stderr

            # capture output for report and paths to out files
            report_text_buf = []
            struct_file_paths = []
            struct_file_names = []
            fastq_file_paths = []
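            # Note: the parsing below depends on Grinder's stdout format; it
            # keys on lines containing 'Community structure' or 'FASTQ file'
            # and assumes the file path is the fourth whitespace-delimited
            # token of each such line.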
            for out_line in outputlines:
                out_line = out_line.rstrip()
                if 'Community structure' in out_line:
                    clean_line = out_line.lstrip()
                    struct_file_path = re.split(r'\s+', clean_line)[3]
                    struct_file_paths.append(struct_file_path)
                    struct_file_names.append(struct_file_path.split('/')[-1])
                    self.log(console, "STRUCT_FILE_NAME: '" +
                             struct_file_path.split('/')[-1])  # DEBUG
                elif 'FASTQ file' in out_line:
                    clean_line = out_line.lstrip()
                    fastq_file_paths.append(re.split(r'\s+', clean_line)[3])
                else:
                    report_text_buf.append(out_line)
            report_text += "\n".join(report_text_buf)

        #### STEP 4: Upload Read Libs and create reads set
        ##
        if len(invalid_msgs) == 0:
            lib_obj_refs = []
            lib_obj_names = []
            readsSet_items = []

            for sample_i, fastq_file_path in enumerate(fastq_file_paths):

                if not os.path.isfile (fastq_file_path) \
                   or os.path.getsize (fastq_file_path) == 0:

                    raise ValueError("empty read lib generated: " +
                                     fastq_file_path)
                else:

                    # lib obj name
                    if len(fastq_file_paths) == 1:  # single library: use bare output_name
                        output_obj_name = params['output_name']
                    else:
                        if str(params['pairs_flag']) == '1':
                            output_obj_name = params[
                                'output_name'] + '-sample' + str(
                                    sample_i + 1) + ".PairedEndLib"
                        else:
                            output_obj_name = params[
                                'output_name'] + '-sample' + str(
                                    sample_i + 1) + ".SingleEndLib"
                    lib_obj_names.append(output_obj_name)

                    # upload lib and get obj ref
                    self.log(
                        console,
                        'Uploading generated reads library: ' + output_obj_name)
                    sequencing_tech = 'artificial reads'
                    if str(params['pairs_flag']) == '1':
                        interleaved = 1
                    else:
                        interleaved = 0
                    lib_obj_ref = readsUtils_Client.upload_reads({
                        'wsname':
                        str(params['workspace_name']),
                        'name':
                        output_obj_name,
                        'fwd_file':
                        fastq_file_path,
                        'interleaved':
                        interleaved,
                        'sequencing_tech':
                        sequencing_tech
                    })['obj_ref']
                    lib_obj_refs.append(lib_obj_ref)
                    os.remove(fastq_file_path)  # free up disk

                    # add to readsSet
                    readsSet_items.append({
                        'ref': lib_obj_ref,
                        'label': output_obj_name
                    })
            # create readsset
            readsSet_obj_ref = None
            if len(lib_obj_refs) > 1:
                readsSet_obj = {
                    'description':
                    "Grinder Metagenome from " + " ".join(genome_obj_names),
                    'items':
                    readsSet_items
                }
                readsSet_obj_name = params['output_name']
                readsSet_obj_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name':
                    params['workspace_name'],
                    'output_object_name':
                    readsSet_obj_name,
                    'data':
                    readsSet_obj
                })['set_ref']

        #### STEP 5: Build report
        ##
        reportName = 'kb_grinder_report_' + str(uuid.uuid4())
        reportObj = {
            'objects_created': [],
            #'text_message': '',  # or is it 'message'?
            'message': '',  # or is it 'text_message'?
            'direct_html': '',
            #'direct_html_link_index': 0,
            'file_links': [],
            'html_links': [],
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # message
        if len(invalid_msgs) > 0:
            report_text = "\n".join(invalid_msgs)
        reportObj['message'] = report_text

        if len(invalid_msgs) == 0:
            # objs
            if readsSet_obj_ref != None:
                reportObj['objects_created'].append({
                    'ref':
                    readsSet_obj_ref,
                    'desc':
                    params['output_name'] + " ReadsSet"
                })
            for lib_obj_i, lib_obj_ref in enumerate(lib_obj_refs):
                reportObj['objects_created'].append({
                    'ref':
                    lib_obj_ref,
                    'desc':
                    lib_obj_names[lib_obj_i]
                })
            # downloadable data
            for data_i, data_path in enumerate(struct_file_paths):
                try:
                    upload_ret = dfu.file_to_shock({
                        'file_path': data_path,
                        #'pack': 'zip'})
                        'make_handle': 0
                    })
                except:
                    raise ValueError('error uploading ' + data_path +
                                     ' file to shock')
                reportObj['file_links'].append({
                    'shock_id':
                    upload_ret['shock_id'],
                    'name':
                    struct_file_names[data_i],
                    'label':
                    struct_file_names[data_i]
                })

            # html report
            """
            try:
                html_upload_ret = dfu.file_to_shock({'file_path': html_output_dir,
                                                     'make_handle': 0,
                                                     'pack': 'zip'})
            except:
                raise ValueError ('error uploading html report to shock')
            reportObj['direct_html_link_index'] = 0
            reportObj['html_links'] = [{'shock_id': html_upload_ret['shock_id'],
                                        'name': html_file,
                                        'label': params['output_name']+' HTML'
                                    }
                                   ]
            """

        # save report object
        #
        SERVICE_VER = 'release'
        reportClient = KBaseReport(self.callbackURL,
                                   token=ctx['token'],
                                   service_ver=SERVICE_VER)
        #report_info = report.create({'report':reportObj, 'workspace_name':params['workspace_name']})
        report_info = reportClient.create_extended_report(reportObj)

        returnVal = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #END KButil_Build_InSilico_Metagenomes_with_Grinder

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError(
                'Method KButil_Build_InSilico_Metagenomes_with_Grinder return value '
                + 'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
class ImportSRAUtil:

    SRA_TOOLKIT_PATH = '/kb/deployment/bin/fastq-dump'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _run_command(self, command):
        """
        _run_command: run command and print result
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output = pipe.communicate()[0]
        exitCode = pipe.returncode

        if (exitCode == 0):
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}'.format(exitCode, output)
            raise ValueError(error_msg)

    def _check_fastq_dump_result(self, tmp_dir, sra_name):
        """
        _check_fastq_dump_result: check whether the fastq-dump result is paired end (PE) or single end (SE)
        """
        return os.path.exists(tmp_dir + '/' + sra_name + '/1')

    def _sra_to_fastq(self, scratch_sra_file_path):
        """
        _sra_to_fastq: convert SRA file to FASTQ file(s)
        """

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(tmp_dir)

        command = self.SRA_TOOLKIT_PATH + ' --split-3 -T -O '
        command += tmp_dir + ' ' + scratch_sra_file_path

        self._run_command(command)

        sra_name = os.path.basename(scratch_sra_file_path).partition('.')[0]
        paired_end = self._check_fastq_dump_result(tmp_dir, sra_name)

        if paired_end:
            fwd_file = os.path.join(tmp_dir, sra_name, '1', 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'

            rev_file = os.path.join(tmp_dir, sra_name, '2', 'fastq')
            os.rename(rev_file, rev_file + '.fastq')
            rev_file = rev_file + '.fastq'
        else:
            fwd_file = os.path.join(tmp_dir, sra_name, 'fastq')
            os.rename(fwd_file, fwd_file + '.fastq')
            fwd_file = fwd_file + '.fastq'
            rev_file = None

        fastq_file_path = {'fwd_file': fwd_file, 'rev_file': rev_file}
        return fastq_file_path

    def _validate_upload_staging_file_availability(self,
                                                   staging_file_subdir_path):
        """
        _validate_upload_staging_file_availability: validates file availability in user's staging area

        """
        pass
        # TODO ftp_server needs to be fixed for subdir
        # list = ftp_service(self.callback_url).list_files()
        # if staging_file_subdir_path not in list:
        #     error_msg = 'Target file: {} is NOT available.\n'.format(
        #                                         staging_file_subdir_path.rpartition('/')[-1])
        #     error_msg += 'Available files:\n {}'.format("\n".join(list))
        #     raise ValueError(error_msg)

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)

    def import_sra_from_staging(self, params):
        '''
          import_sra_from_staging: import an SRA file from the staging area as a reads object (wraps ReadsUtils.upload_reads)

          required params:
          staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          sequencing_tech: sequencing technology
          name: output reads file name
          workspace_name: workspace name/ID of the object

          Optional Params:
          single_genome: whether the reads are from a single genome or a metagenome.
          insert_size_mean: mean (average) insert length
          insert_size_std_dev: standard deviation of insert lengths
          read_orientation_outward: whether reads in a pair point outward

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportSRAUtil.import_sra_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_sra_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_sra_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        log('Downloaded staging file to: {}'.format(scratch_sra_file_path))

        fastq_file_path = self._sra_to_fastq(scratch_sra_file_path)

        import_sra_reads_params = params
        import_sra_reads_params.update(fastq_file_path)

        workspace_name_or_id = params.get('workspace_name')
        if str(workspace_name_or_id).isdigit():
            import_sra_reads_params['wsid'] = int(workspace_name_or_id)
        else:
            import_sra_reads_params['wsname'] = str(workspace_name_or_id)

        log('--->\nrunning ReadsUtils.upload_reads\nparams:\n{}'.format(
            json.dumps(import_sra_reads_params, indent=1)))
        returnVal = self.ru.upload_reads(import_sra_reads_params)

        return returnVal

    def validate_import_sra_from_staging_params(self, params):
        """
        validate_import_sra_from_staging_params:
                    validates params passed to import_sra_from_staging method

        """

        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'sequencing_tech', 'name',
                'workspace_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        self._validate_upload_staging_file_availability(
            params.get('staging_file_subdir_path'))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report


        obj_ref: generated workspace object reference (return of import_sra_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)
        number_of_reads = object_data.get('data')[0].get('data').get(
            'read_count')

        upload_message += "Reads Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported Reads File: {}\n'.format(
            params.get('staging_file_subdir_path'))
        if isinstance(number_of_reads, (int, long)):
            upload_message += 'Number of Reads: {:,}\n'.format(number_of_reads)

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output