Esempio n. 1
0
    def fetch_reads_refs_from_sampleset(self, ref, info, validated_params):
        """
        Note: adapted from kbaseapps/kb_hisat2 - file_util.py

        From the given object ref, return a list of all reads objects that are a part of that
        object. E.g., if ref is a ReadsSet, return a list of all PairedEndLibrary or SingleEndLibrary
        refs that are a member of that ReadsSet. This is returned as a list of dictionaries as follows:
        {
            "ref": reads object reference,
            "condition": condition string associated with that reads object
        }
        The only one required is "ref", all other keys may or may not be present, based on the reads
        object or object type in initial ref variable. E.g. a RNASeqSampleSet might have condition info
        for each reads object, but a single PairedEndLibrary may not have that info.
        If ref is already a Reads library, just returns a list with ref as a single element.

        :param ref: workspace reference of the set object to expand
        :param info: object_info tuple for ref (used only for type detection)
        :param validated_params: dict; may carry 'output_alignment_suffix'
        :returns: list of dicts with keys 'ref', 'condition', 'info',
            'alignment_output_name'
        :raises ValueError: if the object type is not a supported set type
        """
        obj_type = self.get_type_from_obj_info(info)
        # Guard clause: only set-like inputs can be expanded here.
        if ("KBaseSets.ReadsSet" not in obj_type
                and "KBaseRNASeq.RNASeqSampleSet" not in obj_type):
            raise ValueError("Unable to fetch reads reference from object {} "
                             "which is a {}".format(ref, obj_type))

        print("Looking up reads references in ReadsSet object")
        set_api = SetAPI(self.srv_wiz_url)
        reads_set = set_api.get_reads_set_v1({'ref': ref,
                                              'include_item_info': 0,
                                              'include_set_item_ref_paths': 1
                                              })

        # Member refs (with condition labels) plus a parallel plain-ref list
        # for the bulk workspace info lookup below.
        refs = [{'ref': reads['ref_path'], 'condition': reads['label']}
                for reads in reads_set["data"]["items"]]
        refs_for_ws_info = [{'ref': entry['ref']} for entry in refs]

        # get object info so we can name things properly
        infos = self.ws.get_object_info3({'objects': refs_for_ws_info})['infos']

        # Optional user-supplied suffix; whitespace is stripped and an empty
        # result falls back to the default '_alignment'.
        name_ext = '_alignment'
        if 'output_alignment_suffix' in validated_params \
                and validated_params['output_alignment_suffix'] is not None:
            ext = validated_params['output_alignment_suffix'].replace(' ', '')
            if ext:
                name_ext = ext

        # De-duplicate output names: the 2nd, 3rd, ... occurrence of an object
        # name gets a '_2', '_3', ... suffix before the extension.
        unique_name_lookup = {}
        for entry, obj_info in zip(refs, infos):
            entry['info'] = obj_info
            name = obj_info[1]  # object name field of the object_info tuple
            if name not in unique_name_lookup:
                unique_name_lookup[name] = 1
            else:
                unique_name_lookup[name] += 1
                name = name + '_' + str(unique_name_lookup[name])
            entry['alignment_output_name'] = name + name_ext

        return refs
Esempio n. 2
0
    def exec_remove_adapters(self, ctx, params):
        """
        :param params: instance of type "RemoveAdaptersParams" -> structure:
           parameter "output_workspace" of String, parameter
           "output_object_name" of String, parameter "input_reads" of type
           "ws_ref" (@ref ws), parameter "five_prime" of type
           "FivePrimeOptions" (unfortunately, we have to name the fields
           uniquely between 3' and 5' options due to the current
           implementation of grouped parameters) -> structure: parameter
           "adapter_sequence_5P" of String, parameter "anchored_5P" of type
           "boolean" (@range (0, 1)), parameter "three_prime" of type
           "ThreePrimeOptions" -> structure: parameter "adapter_sequence_3P"
           of String, parameter "anchored_3P" of type "boolean" (@range (0,
           1)), parameter "error_tolerance" of Double, parameter
           "min_overlap_length" of Long, parameter "min_read_length" of Long,
           parameter "discard_untrimmed" of type "boolean" (@range (0, 1))
        :returns: instance of type "exec_RemoveAdaptersResult" -> structure:
           parameter "report" of String, parameter "output_reads_ref" of
           String
        """
        # ctx is the context object
        # return variables are: result
        #BEGIN exec_remove_adapters
        console = []
        self.log(console, 'Running exec_remove_adapters() with parameters: ')
        self.log(console, "\n" + pformat(params))
        self.log(console, "-----------------------------------------------\n")
        report = ''

        token = ctx['token']
        wsClient = workspaceService(self.config['workspace-url'], token=token)
        ws = Workspace(self.config['workspace-url'], token=token)
        #setAPI_Client = SetAPI (url=self.config['SDK_CALLBACK_URL'], token=token) # for SDK local, doesn't work for SetAPI
        setAPI_Client = SetAPI(url=self.config['service-wizard-url'],
                               token=token)  # for dynamic service

        # 0. param checks
        required_params = [
            'output_workspace', 'input_reads', 'output_object_name'
        ]
        for arg in required_params:
            if arg not in params or params[arg] is None or params[arg] == '':
                raise ValueError("Must define required param: '" + arg + "'")

        # 1. load provenance
        provenance = [{}]
        if 'provenance' in ctx:
            provenance = ctx['provenance']
        # add additional info to provenance here, in this case the input data object reference
        provenance[0]['input_ws_objects'] = [str(params['input_reads'])]

        # 2. Determine whether read library, ReadsSet or RNASeqSampleSet is input object
        #
        try:
            # object_info tuple field indices
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': params['input_reads']
                }]})[0]
            input_reads_obj_type = input_reads_obj_info[TYPE_I]
            input_reads_obj_type = re.sub(
                r'-[0-9]+\.[0-9]+$', "",
                input_reads_obj_type)  # remove trailing version
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                str(params['input_reads']) + ')' + str(e))

        acceptable_types = [
            "KBaseSets.ReadsSet", "KBaseRNASeq.RNASeqSampleSet",
            "KBaseFile.PairedEndLibrary", "KBaseFile.SingleEndLibrary",
            "KBaseAssembly.PairedEndLibrary", "KBaseAssembly.SingleEndLibrary"
        ]
        if input_reads_obj_type not in acceptable_types:
            raise ValueError("Input reads of type: '" + input_reads_obj_type +
                             "'.  Must be one of " +
                             ", ".join(acceptable_types))

        # 3. Retrieve the set details (expand sets into per-library lists)
        #
        readsSet_ref_list = []
        readsSet_names_list = []
        readsSet_types_list = []
        if "KBaseSets.ReadsSet" in input_reads_obj_type:
            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref':
                    params['input_reads'],
                    'include_item_info':
                    1
                })

            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(params['input_reads']) + ")\n" + str(e))
            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                TYPE_I = 2
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])
                this_type = readsLibrary_obj['info'][TYPE_I]
                this_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                   this_type)  # remove trailing version
                readsSet_types_list.append(this_type)

        elif "KBaseRNASeq.RNASeqSampleSet" in input_reads_obj_type:
            sample_set = ws.get_objects2(
                {"objects": [{
                    "ref": params['input_reads']
                }]})["data"][0]["data"]
            sample_refs = [{"ref": sid} for sid in sample_set["sample_ids"]]
            readsSet_ref_list = list(sample_set["sample_ids"])

            info = ws.get_object_info3({"objects": sample_refs})
            NAME_I = 1
            TYPE_I = 2
            for obj_info in info["infos"]:
                readsSet_names_list.append(obj_info[NAME_I])
                sample_type = obj_info[TYPE_I]
                sample_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                     sample_type)  # remove trailing version
                readsSet_types_list.append(sample_type)
        else:
            # Single reads library: treat it as a one-element "set".
            readsSet_ref_list = [params['input_reads']]
            readsSet_names_list = [params['output_object_name']]
            readsSet_types_list = [input_reads_obj_type]

        # 4. Iterate through readsLibrary members of set
        #
        report = ''
        cutadapt_readsSet_ref = None
        cutadapt_readsLib_refs = []

        # Option tables are loop-invariant, so build them once.
        # NOTE: fixed 'float error_tolerance' (a copy-paste from the type
        # spec) which prevented 'error_tolerance' from ever being forwarded.
        optional_params = [
            'error_tolerance', 'min_overlap_length',
            'min_read_length', 'discard_untrimmed'
        ]
        optional_g_params = {
            'five_prime': ['adapter_sequence_5P', 'anchored_5P'],
            'three_prime': ['adapter_sequence_3P', 'anchored_3P']
        }

        is_single_library = (input_reads_obj_type != "KBaseSets.ReadsSet"
                             and input_reads_obj_type != "KBaseRNASeq.RNASeqSampleSet")

        for reads_item_i, input_reads_library_ref in enumerate(
                readsSet_ref_list):
            exec_remove_adapters_OneLibrary_params = {
                'output_workspace': params['output_workspace'],
                'input_reads': input_reads_library_ref,
                'reads_type': readsSet_types_list[reads_item_i]
            }
            if is_single_library:
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = params['output_object_name']
            else:
                # per-member output names for set inputs
                exec_remove_adapters_OneLibrary_params[
                    'output_object_name'] = readsSet_names_list[
                        reads_item_i] + "_cutadapt"

            for arg in optional_params:
                if arg in params and params[arg] is not None:
                    exec_remove_adapters_OneLibrary_params[arg] = params[arg]

            for group in optional_g_params.keys():
                if group in params and params[group] is not None:
                    exec_remove_adapters_OneLibrary_params[group] = dict()
                    for arg in optional_g_params[group]:
                        if arg in params[group] and params[group][arg] is not None:
                            exec_remove_adapters_OneLibrary_params[group][
                                arg] = params[group][arg]

            msg = "\n\nRUNNING exec_remove_adapters_OneLibrary() ON LIBRARY: " + str(
                input_reads_library_ref) + " " + str(
                    readsSet_names_list[reads_item_i]) + "\n"
            msg += "----------------------------------------------------------------------------\n"
            report += msg
            self.log(console, msg)

            # RUN
            exec_remove_adapters_OneLibrary_retVal = self.exec_remove_adapters_OneLibrary(
                ctx, exec_remove_adapters_OneLibrary_params)[0]

            report += exec_remove_adapters_OneLibrary_retVal['report'] + "\n\n"
            cutadapt_readsLib_refs.append(
                exec_remove_adapters_OneLibrary_retVal['output_reads_ref'])

        # 5. Conclude
        # Just one Library
        if is_single_library:

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsLib_refs[0],
            }
        # ReadsSet or SampleSet
        else:
            # save cutadapt readsSet
            some_cutadapt_output_created = False
            items = []
            for i, lib_ref in enumerate(cutadapt_readsLib_refs):

                if lib_ref is None:
                    # can't have 'None' items in a ReadsSet, so skip them
                    continue
                some_cutadapt_output_created = True
                # Broad except is deliberate: for SampleSet inputs
                # input_readsSet_obj is never defined (NameError) and label
                # lookup falls back to the workspace object name.
                try:
                    label = input_readsSet_obj['data']['items'][i]['label']
                except Exception:
                    NAME_I = 1
                    label = ws.get_object_info3(
                        {'objects': [{
                            'ref': lib_ref
                        }]})['infos'][0][NAME_I]
                label = label + "_cutadapt"

                items.append({
                    'ref': lib_ref,
                    'label': label
                    #'data_attachment': ,
                    #'info':
                })
            if some_cutadapt_output_created:
                reads_desc_ext = " + Cutadapt"
                descText = ""
                reads_name_ext = ""
                # Same deliberate fallback as the label lookup above.
                try:
                    descText = input_readsSet_obj['data']['description']
                except Exception:
                    NAME_I = 1
                    descText = ws.get_object_info3(
                        {'objects': [{
                            'ref': params['input_reads']
                        }]})['infos'][0][NAME_I]
                descText = descText + reads_desc_ext

                output_readsSet_obj = {'description': descText, 'items': items}
                output_readsSet_name = str(
                    params['output_object_name']) + reads_name_ext
                cutadapt_readsSet_ref = setAPI_Client.save_reads_set_v1({
                    'workspace_name':
                    params['output_workspace'],
                    'output_object_name':
                    output_readsSet_name,
                    'data':
                    output_readsSet_obj
                })['set_ref']
            else:
                raise ValueError("No cutadapt output created")

            # create return output object
            result = {
                'report': report,
                'output_reads_ref': cutadapt_readsSet_ref
            }
        #END exec_remove_adapters

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method exec_remove_adapters return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
Esempio n. 3
0
    def download_short_unpaired(self, console, token, wsname,
                                short_unpaired_libraries):
        """
        Download all short unpaired (single-end) reads libraries and combine
        them into a single FASTQ file in scratch.

        ReadsSet refs are expanded into their member SingleEndLibrary refs;
        plain library refs are used as-is.

        :param console: log accumulator passed to self.log
        :param token: auth token for workspace/service clients
        :param wsname: workspace name used to qualify bare object names
        :param short_unpaired_libraries: list of refs or object names
        :returns: path of the combined FASTQ file
        :raises ValueError: on any download/lookup/combine failure
        """
        # local stdlib imports: used only by the in-process combine step
        import gzip
        import shutil
        try:
            self.log(console, "Getting short unpaired reads.\n")
            ruClient = ReadsUtils(url=self.callbackURL, token=token)

            # first, unpack any ReadsSets into the actual SingleEndLibrary referencs
            reads_refs = []
            # object info
            try:
                wsClient = Workspace(self.workspaceURL, token=token)
            except Exception as e:
                raise ValueError("unable to instantiate wsClient. " + str(e))

            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple
            for lib in short_unpaired_libraries:
                try:
                    # bare object names are qualified with the workspace name
                    obj_id = {
                        'ref': lib if '/' in lib else (wsname + '/' + lib)
                    }
                    lib_obj_info = wsClient.get_object_info_new(
                        {'objects': [obj_id]})[0]
                    lib_obj_type = lib_obj_info[TYPE_I]
                    # remove trailing version
                    lib_obj_type = re.sub(r'-[0-9]+\.[0-9]+$', "", lib_obj_type)
                    lib_ref = str(lib_obj_info[WSID_I])+'/' + \
                        str(lib_obj_info[OBJID_I])+'/'+str(lib_obj_info[VERSION_I])
                    if lib_obj_type == 'KBaseSets.ReadsSet':
                        # unpack it
                        try:
                            setAPIClient = SetAPI(url=self.serviceWizardURL,
                                                  token=token)
                            self.log(console, 'getting reads set ' + lib_ref)
                            readsSet = setAPIClient.get_reads_set_v1({
                                'ref':
                                lib_ref,
                                'include_item_info':
                                1
                            })
                        except Exception as e:
                            raise ValueError(
                                'SetAPI FAILURE: Unable to get read library set object: ('
                                + lib_ref + ')\n' + str(e))
                        for readsLibrary in readsSet['data']['items']:
                            reads_refs.append(readsLibrary['ref'])
                    else:
                        # use other reads objects "as is"
                        reads_refs.append(lib_ref)
                except Exception as e:
                    raise ValueError('Unable to get read library object: (' +
                                     str(lib) + ')' + str(e))

            result = ruClient.download_reads({
                'read_libraries': reads_refs,
                'interleaved': 'false'
            })
            # combine outputs
            short_unpaired_path = os.path.join(
                self.scratch, "short_unpaired_" + str(uuid.uuid4()) + ".fastq")

            self.log(console, "Combining short unpaired reads.\n")

            for reads_ref in reads_refs:
                files = result['files'][reads_ref]['files']

                if 'fwd' not in files:
                    raise ValueError('File ' + reads_ref +
                                     ' missing forward reads file')
                path = files['fwd']
                self.log(console, "appending " + path)
                # Decompress/copy in-process rather than shelling out to
                # 'gzip -dc'/'cat' with an unquoted path under shell=True,
                # which was whitespace-unsafe and shell-injection-prone.
                opener = gzip.open if path.endswith('.gz') else open
                with opener(path, 'rb') as src, \
                        open(short_unpaired_path, 'ab') as dst:
                    shutil.copyfileobj(src, dst)
                os.remove(path)  # files can be big; free space immediately

        except Exception as e:
            raise ValueError('Unable to download short unpaired reads\n' +
                             str(e))
        return short_unpaired_path
Esempio n. 4
0
    def exec_megahit(self, ctx, params):
        """
        :param params: instance of type "ExecMegaHitParams" (exec_megahit()
           Actual execution of MEGAHIT Accepts ReadsSet or a ReadsLibrary as
           Input Creates Assembly object(s) as output. Will eventually also
           create AssemblySet object if input is a ReadsSet and not running a
           combined assembly Other vars same as run_megahit()) -> structure:
           parameter "workspace_name" of String, parameter "input_reads_ref"
           of String, parameter "output_contigset_name" of String, parameter
           "combined_assembly_flag" of Long, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_len" of Long
        :returns: instance of type "ExecMegaHitOutput" -> structure:
           parameter "report_text" of String, parameter
           "output_contigset_ref" of list of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN exec_megahit
        console = []
        self.log(console, 'Running exec_megahit() with params=')
        self.log(console, "\n" + pformat(params))

        #SERVICE_VER = 'dev'  # DEBUG
        SERVICE_VER = 'release'

        ### STEP 0: init
        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        headers = {'Authorization': 'OAuth ' + token}
        env = os.environ.copy()
        env['KB_AUTH_TOKEN'] = token

        ### STEP 1: basic parameter checks + parsing
        required_params = [
            'workspace_name', 'input_reads_ref', 'output_contigset_name'
        ]
        for required_param in required_params:
            if required_param not in params or params[required_param] == None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 2: determine if input is a ReadsLibrary or ReadsSet
        input_reads_ref = params['input_reads_ref']
        input_reads_name = None
        try:
            [
                OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
                WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
            ] = range(11)  # object_info tuple

            input_reads_obj_info = wsClient.get_object_info_new(
                {'objects': [{
                    'ref': input_reads_ref
                }]})[0]
            input_reads_obj_type = re.sub(
                '-[0-9]+\.[0-9]+$', "",
                input_reads_obj_info[TYPE_I])  # remove trailing version
            input_reads_name = input_reads_obj_info[NAME_I]

        except Exception as e:
            raise ValueError('Unable to get reads object from workspace: (' +
                             input_reads_ref + ')' + str(e))

        accepted_input_types = [
            "KBaseSets.ReadsSet", "KBaseFile.PairedEndLibrary"
        ]
        if input_reads_obj_type not in accepted_input_types:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        if input_reads_obj_type == "KBaseSets.ReadsSet":
            required_param = 'combined_assembly_flag'
            if required_param not in params or params[required_param] == None:
                raise ValueError("Must define required param: '" +
                                 required_param + "'")

        ### STEP 3: get the list of library references
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            readsSet_ref_list = [input_reads_ref]
            readsSet_names_list = [input_reads_name]

        elif input_reads_obj_type == "KBaseSets.ReadsSet":
            readsSet_ref_list = []
            readsSet_names_list = []

            try:
                setAPI_Client = SetAPI(
                    url=self.serviceWizardURL,
                    token=ctx['token'])  # for dynamic service
                #setAPI_Client = SetAPI (url=self.callbackURL, token=ctx['token'])  # SDK local method
            except Exception as e:
                raise ValueError(
                    "SetAPI FAILURE: Unable to get SetAPI Client from serviceWizard: '"
                    + self.serviceWizardURL + "' token: '" + ctx['token'] +
                    "'" + str(e))
                #raise ValueError("SetAPI FAILURE: Unable to get SetAPI Client as local method callbackURL: '"+self.callbackURL+"' token: '"+ctx['token']+"'" + str(e))

            try:
                input_readsSet_obj = setAPI_Client.get_reads_set_v1({
                    'ref':
                    input_reads_ref,
                    'include_item_info':
                    1
                })
            except Exception as e:
                raise ValueError(
                    'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                    + str(input_reads_ref) + ")\n" + str(e))

            for readsLibrary_obj in input_readsSet_obj['data']['items']:
                readsSet_ref_list.append(readsLibrary_obj['ref'])
                NAME_I = 1
                readsSet_names_list.append(readsLibrary_obj['info'][NAME_I])

        else:
            raise ValueError("Input reads of type '" + input_reads_obj_type +
                             "' not accepted.  Must be one of " +
                             ", ".join(accepted_input_types))

        ### STEP 4: If doing a combined assembly on a ReadsSet, download reads one at a time and combine
        if input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            self.log(
                console,
                "MegaHit_Sets:run_megahit(): CREATING COMBINED INPUT FASTQ FILES"
            )

            # make dir
            timestamp = int(
                (datetime.utcnow() -
                 datetime.utcfromtimestamp(0)).total_seconds() * 1000)
            input_dir = os.path.join(self.scratch, 'input.' + str(timestamp))
            if self.mac_mode:  # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
                input_dir = os.path.join(self.host_scratch,
                                         'input.' + str(timestamp))
            if not os.path.exists(input_dir):
                os.makedirs(input_dir)

            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # start combined file
            read_buf_size = 65536
            write_buf_size = 65536
            combined_input_fwd_path = os.path.join(input_dir,
                                                   'input_reads_fwd.fastq')
            combined_input_rev_path = os.path.join(input_dir,
                                                   'input_reads_rev.fastq')
            combined_input_fwd_handle = open(combined_input_fwd_path, 'w',
                                             write_buf_size)
            combined_input_rev_handle = open(combined_input_rev_path, 'w',
                                             write_buf_size)

            # add libraries, one at a time
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved':
                        'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']

                # append fwd
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): APPENDING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                this_input_path = this_input_fwd_path
                cat_file_handle = combined_input_fwd_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(
                    this_input_path
                )  # create space since we no longer need the piece file

                # append rev
                this_input_path = this_input_rev_path
                cat_file_handle = combined_input_rev_handle
                with open(this_input_path, 'r',
                          read_buf_size) as this_input_handle:
                    while True:
                        read_data = this_input_handle.read(read_buf_size)
                        if read_data:
                            cat_file_handle.write(read_data)
                        else:
                            break
                os.remove(
                    this_input_path
                )  # create space since we no longer need the piece file

            combined_input_fwd_handle.close()
            combined_input_rev_handle.close()

        ### STEP 5: finally run MegaHit_Sets
        exec_megahit_single_library_params = params
        output_assemblyset_contigset_paths = []
        output_contigset_path = None

        # PairedEndLibrary
        if input_reads_obj_type == "KBaseFile.PairedEndLibrary":
            self.log(
                console,
                "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsLibrary: "
                + str(input_reads_ref))
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
                readsLibrary = readsUtils_Client.download_reads({
                    'read_libraries': [input_reads_ref],
                    'interleaved':
                    'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get reads object from workspace: (' +
                    input_reads_ref + ")\n" + str(e))

            input_fwd_path = readsLibrary['files'][input_reads_ref]['files'][
                'fwd']
            input_rev_path = readsLibrary['files'][input_reads_ref]['files'][
                'rev']
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # the key line
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet combined (already downloaded and combined fastqs)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] != 0:

            input_fwd_path = combined_input_fwd_path
            input_rev_path = combined_input_rev_path
            exec_megahit_single_library_params[
                'input_fwd_path'] = input_fwd_path
            exec_megahit_single_library_params[
                'input_rev_path'] = input_rev_path

            # the key line
            output_contigset_path = self.exec_megahit_single_library(
                exec_megahit_single_library_params)
            output_assemblyset_contigset_paths.append(output_contigset_path)

            os.remove(input_fwd_path)  # files can be really big
            os.remove(input_rev_path)

        # ReadsSet uncombined (still have to download)
        elif input_reads_obj_type == "KBaseSets.ReadsSet" and params[
                'combined_assembly_flag'] == 0:
            # connect to ReadsUtils Client
            try:
                readsUtils_Client = ReadsUtils(url=self.callbackURL,
                                               token=ctx['token'])  # SDK local
            except:
                raise ValueError("Unable to get readsUtils_Client\n" + str(e))

            # get libraries, one at a time, and run MegaHit_Sets
            output_assemblyset_contigset_paths = []
            for this_input_reads_ref in readsSet_ref_list:
                self.log(
                    console,
                    "MegaHit_Sets:run_megahit(): DOWNLOADING FASTQ FILES FOR ReadsSet member: "
                    + str(this_input_reads_ref))
                try:
                    readsLibrary = readsUtils_Client.download_reads({
                        'read_libraries': [this_input_reads_ref],
                        'interleaved':
                        'false'
                    })
                except Exception as e:
                    raise ValueError(
                        'Unable to get reads object from workspace: (' +
                        this_input_reads_ref + ")\n" + str(e))

                this_input_fwd_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['fwd']
                this_input_rev_path = readsLibrary['files'][
                    this_input_reads_ref]['files']['rev']
                exec_megahit_single_library_params[
                    'input_fwd_path'] = this_input_fwd_path
                exec_megahit_single_library_params[
                    'input_rev_path'] = this_input_rev_path

                # the key line
                this_output_contigset_path = self.exec_megahit_single_library(
                    exec_megahit_single_library_params)
                output_assemblyset_contigset_paths.append(
                    this_output_contigset_path)

                os.remove(this_input_fwd_path)  # files can be really big
                os.remove(this_input_rev_path)

        # just in case we've confused ourselves
        else:
            raise ValueError("error in logic")

        ### STEP 6: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL,
                                    token=ctx['token'],
                                    service_ver=SERVICE_VER)
        output_contigset_refs = []
        output_contigset_names = []
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):
            if len(output_assemblyset_contigset_paths) == 1:
                assembly_name = params['output_contigset_name']
            else:
                assembly_name = readsSet_names_list[i] + '-' + params[
                    'output_contigset_name']

            this_output_data_ref = assemblyUtil.save_assembly_from_fasta({
                'file': {
                    'path': this_output_contigset_path
                },
                'workspace_name':
                params['workspace_name'],
                'assembly_name':
                assembly_name
            })

            output_contigset_refs.append(this_output_data_ref)
            output_contigset_names.append(assembly_name)

        ### STEP 7: generate the report text

        # compute a simple contig length distribution for the report
        report = ''
        for i, this_output_contigset_path in enumerate(
                output_assemblyset_contigset_paths):

            report += "MegaHit_Sets run for Read Library: " + readsSet_names_list[
                i] + "\n"
            report += "-------------------------------------------------------------\n"
            report += "\n"
            lengths = []
            for seq_record in SeqIO.parse(this_output_contigset_path, 'fasta'):
                lengths.append(len(seq_record.seq))

                report += 'ContigSet saved to: ' + params[
                    'workspace_name'] + '/' + output_contigset_names[i] + '\n'
                report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
                report += 'Avg Length: ' + str(
                    sum(lengths) / float(len(lengths))) + ' bp.\n'

                bins = 10
                counts, edges = np.histogram(lengths, bins)
                report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
                for c in range(bins):
                    report += '   ' + str(counts[c]) + '\t--\t' + str(
                        edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        ### STEP 8: contruct the output to send back
        output = {
            'report_text': report,
            'output_contigset_refs': output_contigset_refs
        }

        #END exec_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method exec_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
# Esempio n. 5
# 0
class DataStagingUtils(object):
    def __init__(self, config, ctx):
        """Stash config values, ensure scratch space exists, and build the
        KBase service clients used for staging reads.

        :param config: dict with 'scratch', 'workspace-url', 'srv-wiz-url',
            and 'SDK_CALLBACK_URL' entries
        :param ctx: SDK call context; must carry a 'token'
        :raises ValueError: if either service client cannot be instantiated
        """
        self.ctx = ctx
        self.scratch = os.path.abspath(config['scratch'])
        self.ws_url = config['workspace-url']
        self.serviceWizardURL = config['srv-wiz-url']
        self.callbackURL = config['SDK_CALLBACK_URL']
        if not os.path.exists(self.scratch):
            os.makedirs(self.scratch)

        # markers used throughout to tag single-end vs paired-end libraries
        self.SE_flag = 'SE'
        self.PE_flag = 'PE'

        SERVICE_VER = 'release'

        # ReadsUtils runs as an SDK local service against the callback server
        try:
            self.readsUtils_Client = ReadsUtils(self.callbackURL,
                                                token=self.ctx['token'],
                                                service_ver=SERVICE_VER)
        except Exception as e:
            raise ValueError(
                'Unable to instantiate readsUtils_Client with callbackURL: ' +
                self.callbackURL + ' ERROR: ' + str(e))

        # SetAPI must go through the service wizard (dynamic service);
        # the local callback server does not work for SetAPI
        try:
            self.setAPI_Client = SetAPI(url=self.serviceWizardURL,
                                        token=self.ctx['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate setAPI_Client with serviceWizardURL: ' +
                self.serviceWizardURL + ' ERROR: ' + str(e))

    def expand_input(self, input_refs):
        '''
        Expand input based on an input data reference for Kaiju

        input_refs can be a list of references to a PairedEndLibrary, a SingleEndLibrary, or a ReadsSet

        Returns a non-redundant list of dicts, one per reads library:
            {'ref': <obj ref>, 'name': <obj name>, 'type': self.SE_flag or self.PE_flag}
        ReadsSet refs are expanded to their member libraries; a ref that appears
        more than once (directly or via a set) is reported only once.

        :raises ValueError: for object types that are not reads libraries or
            ReadsSets, or if the SetAPI lookup fails
        '''
        # expand any sets and build a non-redundant list of reads input objs
        ws = Workspace(self.ws_url)
        expanded_input = []
        input_ref_seen = dict()
        SE_types = [
            'KBaseFile.SingleEndLibrary', 'KBaseAssembly.SingleEndLibrary'
        ]
        PE_types = [
            'KBaseFile.PairedEndLibrary', 'KBaseAssembly.PairedEndLibrary'
        ]

        # symbolic indices into the workspace object_info tuple
        [
            OBJID_I, NAME_I, TYPE_I, SAVE_DATE_I, VERSION_I, SAVED_BY_I,
            WSID_I, WORKSPACE_I, CHSUM_I, SIZE_I, META_I
        ] = range(11)  # object_info tuple

        for input_ref in input_refs:
            input_info = ws.get_object_info3({'objects': [{
                'ref': input_ref
            }]})['infos'][0]
            obj_name = input_info[NAME_I]
            # drop the trailing "-<major>.<minor>" version from the type string
            type_name = input_info[TYPE_I].split('-')[0]

            # ReadsSet: expand to member libraries via the SetAPI
            if type_name in ['KBaseSets.ReadsSet']:
                try:
                    input_readsSet_obj = self.setAPI_Client.get_reads_set_v1({
                        'ref': input_ref,
                        'include_item_info': 1
                    })
                except Exception as e:
                    raise ValueError(
                        'SetAPI FAILURE: Unable to get read library set object from workspace: ('
                        + str(input_ref) + ")\n" + str(e))

                for readsLibrary_obj in input_readsSet_obj['data']['items']:
                    this_reads_ref = readsLibrary_obj['ref']
                    if this_reads_ref in input_ref_seen:
                        continue
                    input_ref_seen[this_reads_ref] = True

                    this_reads_name = readsLibrary_obj['info'][NAME_I]
                    reads_item_type = readsLibrary_obj['info'][TYPE_I]
                    # remove trailing version; raw string fixes the invalid
                    # escape sequence warning of the old '-[0-9]+\.[0-9]+$'
                    reads_item_type = re.sub(r'-[0-9]+\.[0-9]+$', "",
                                             reads_item_type)
                    if reads_item_type in PE_types:
                        this_reads_type = self.PE_flag
                    elif reads_item_type in SE_types:
                        this_reads_type = self.SE_flag
                    else:
                        raise ValueError("Can't handle read item type '" +
                                         reads_item_type + "' obj_name: '" +
                                         this_reads_name + " in Set: '" +
                                         str(input_ref) + "'")
                    expanded_input.append({
                        'ref': this_reads_ref,
                        'name': this_reads_name,
                        'type': this_reads_type
                    })
            # SingleEnd Library
            elif type_name in SE_types:
                if input_ref in input_ref_seen:
                    continue
                input_ref_seen[input_ref] = True
                expanded_input.append({
                    'ref': input_ref,
                    'name': obj_name,
                    'type': self.SE_flag
                })
            # PairedEnd Library
            elif type_name in PE_types:
                if input_ref in input_ref_seen:
                    continue
                input_ref_seen[input_ref] = True
                expanded_input.append({
                    'ref': input_ref,
                    'name': obj_name,
                    'type': self.PE_flag
                })
            else:
                raise ValueError("Illegal type in input_refs: " +
                                 str(obj_name) + " (" + str(input_ref) +
                                 ") is of type: '" + str(type_name) + "'")

        return expanded_input

    def stage_input(self,
                    input_item=None,
                    subsample_percent=10,
                    subsample_replicates=1,
                    subsample_seed=1,
                    fasta_file_extension='fastq'):
        '''
        Stage one reads library into scratch for Kaiju.

        input_item is one dict from expand_input():
            {'ref': <ref>, 'name': <name>, 'type': SE_flag or PE_flag}

        The library is downloaded into a fresh directory under scratch; staged
        file names get '.fwd.'/'.rev.' plus fasta_file_extension tacked on.
        Unless subsample_percent is 100, the download is then randomly
        subsampled into subsample_replicates replicates and the full-size
        files are deleted.

        Returns {'replicate_input': [<items with 'fwd_file'/'rev_file' paths>]}

        :raises ValueError: if the download fails, a staged file is missing or
            empty, or input_item carries an unknown type
        '''
        # make a unique scratch directory for this staging run
        suffix = str(int(time.time() * 1000))
        input_dir = os.path.join(self.scratch, 'input_reads_' + suffix)
        if not os.path.exists(input_dir):
            os.makedirs(input_dir)

        min_fasta_len = 1  # staged files must hold at least one sequence
        lib_type = input_item['type']

        #
        # Download reads
        #

        # Paired End Lib: download de-interleaved so fwd and rev are separate
        if lib_type == self.PE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads({
                    'read_libraries': [input_item['ref']],
                    'interleaved':
                    'false'
                })
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            downloaded = readsLibrary['files'][input_item['ref']]['files']
            fwd_filename = os.path.join(
                input_dir, input_item['name'] + '.fwd.' + fasta_file_extension)
            rev_filename = os.path.join(
                input_dir, input_item['name'] + '.rev.' + fasta_file_extension)
            if downloaded['fwd'] != fwd_filename:
                shutil.move(downloaded['fwd'], fwd_filename)
            if downloaded['rev'] != rev_filename:
                shutil.move(downloaded['rev'], rev_filename)
            input_item['fwd_file'] = fwd_filename
            input_item['rev_file'] = rev_filename

            for staged in (fwd_filename, rev_filename):
                if not os.path.isfile(staged):
                    raise ValueError('Error generating reads file ' + staged)
            # make sure neither staged file is empty
            for staged in (fwd_filename, rev_filename):
                if not self._fasta_seq_len_at_least(staged, min_fasta_len):
                    raise ValueError('Reads Library is empty in filename: ' +
                                     str(staged))

        # Single End Lib
        elif lib_type == self.SE_flag:
            try:
                readsLibrary = self.readsUtils_Client.download_reads(
                    {'read_libraries': [input_item['ref']]})
            except Exception as e:
                raise ValueError(
                    'Unable to get read library object from workspace: (' +
                    str(input_item['ref']) + ")\n" + str(e))

            downloaded_fwd = readsLibrary['files'][
                input_item['ref']]['files']['fwd']
            fwd_filename = os.path.join(
                input_dir, input_item['name'] + '.fwd.' + fasta_file_extension)
            if downloaded_fwd != fwd_filename:
                shutil.move(downloaded_fwd, fwd_filename)
            input_item['fwd_file'] = fwd_filename

            if not os.path.isfile(fwd_filename):
                raise ValueError('Error generating reads file ' + fwd_filename)
            # make sure the staged file isn't empty
            if not self._fasta_seq_len_at_least(fwd_filename, min_fasta_len):
                raise ValueError('Reads Library is empty in filename: ' +
                                 str(fwd_filename))

        else:
            raise ValueError("No type set for input library " +
                             str(input_item['name']) + " (" +
                             str(input_item['ref']) + ")")

        #
        # Subsample
        #

        if subsample_percent == 100:
            # no subsampling requested: the staged item is the single replicate
            replicate_input = [input_item]
        else:
            replicate_input = self._randomly_subsample_reads(
                input_item,
                subsample_percent=subsample_percent,
                subsample_replicates=subsample_replicates,
                subsample_seed=subsample_seed)
            # the full-size downloads are no longer needed; free up disk
            os.remove(input_item['fwd_file'])
            if input_item['type'] == self.PE_flag:
                os.remove(input_item['rev_file'])

        # return input file info
        return {'replicate_input': replicate_input}

    def _randomly_subsample_reads(self,
                                  input_item=None,
                                  subsample_percent=100,
                                  subsample_replicates=1,
                                  subsample_seed=1):

        replicate_files = []
        split_num = subsample_replicates

        # for now can only do percentage instead of raw cnt of reads per subsample
        use_reads_num = False
        use_reads_perc = True
        reads_num = 0  # not used.  subsample_percent used instead

        # init randomizer
        random.seed(subsample_seed)

        # Paired End
        #
        if input_item['type'] == self.PE_flag:
            print("SUBSAMPLING PE library " + input_item['name'])  # DEBUG

            # file paths
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            input_rev_path = re.sub("\.fastq$", "", input_item['rev_file'])
            input_rev_path = re.sub("\.FASTQ$", "", input_rev_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"
            output_rev_paired_file_path_base = input_rev_path + "_rev_paired"

            # set up for file io
            total_paired_reads = 0
            total_unpaired_fwd_reads = 0
            total_unpaired_rev_reads = 0
            total_paired_reads_by_set = []
            fwd_ids = dict()
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 1000000

            # read fwd file to get fwd ids
            #            rec_cnt = 0  # DEBUG
            print("GETTING IDS")  # DEBUG
            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        fwd_ids[read_id] = True

                        # DEBUG
#                        if rec_cnt % 100 == 0:
#                            print ("read_id: '"+str(read_id)+"'")
#                        rec_cnt += 1

# read reverse to determine paired
            print("DETERMINING PAIRED IDS")  # DEBUG
            with open(input_item['rev_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        if fwd_ids[read_id]:
                            paired_ids[read_id] = True
                            paired_ids_list.append(read_id)

                        # DEBUG
#                        if rec_cnt % 100 == 0:
#                            print ("read_id: '"+str(read_id)+"'")
#                        rec_cnt += 1
            total_paired_reads = len(paired_ids_list)
            print("TOTAL PAIRED READS CNT: " +
                  str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # split fwd paired
            print("WRITING FWD SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False

            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                total_paired_reads_by_set[lib_i] += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        try:
                            found = paired_lib_i[read_id]
                            capture_type_paired = True
                        except:
                            total_unpaired_fwd_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            print("\t" + str(paired_cnt) + " FWD recs processed")

            # split rev paired
            print("WRITING REV SPLIT PAIRED")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_rev_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            capture_type_paired = False

            with open(input_item['rev_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        if last_read_id != None:
                            if capture_type_paired:
                                lib_i = paired_lib_i[last_read_id]
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                                if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                    print("\t" + str(paired_cnt) +
                                          " recs processed")
                            else:
                                #unpaired_fwd_buf.extend(rec_buf)
                                pass
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                        try:
                            found = paired_lib_i[read_id]
                            capture_type_paired = True
                        except:
                            total_unpaired_rev_reads += 1
                            capture_type_paired = False
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if capture_type_paired:
                        lib_i = paired_lib_i[last_read_id]
                        paired_output_reads_file_handles[lib_i].writelines(
                            rec_buf)
                        paired_cnt += 1
                        if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                            print("\t" + str(paired_cnt) + " recs processed")
                    else:
                        #unpaired_fwd_buf.extend(rec_buf)
                        pass
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            print("\t" + str(paired_cnt) + " REV recs processed")

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL PAIRED READS: " + str(total_paired_reads) + "\n"
            report += "TOTAL UNPAIRED FWD READS (discarded): " + str(
                total_unpaired_fwd_reads) + "\n"
            report += "TOTAL UNPAIRED REV READS (discarded): " + str(
                total_unpaired_rev_reads) + "\n"
            report += "\n"
            for lib_i in range(split_num):
                report += "PAIRED READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            #        for replicate_i,replicate_item in enumerate(replicate_files):
            #            replicate_input.append({'fwd_file': replicate_item['fwd_file'],
            #                                    'type': input_item['type'],
            #                                    'name': input_item['name']+"-"+str(replicate_i)
            #                                })
            #            if input_item['type'] == self.PE_flag:
            #                replicate_input[replicate_i]['rev_file'] = replicate_item['rev_file']

            print("MAKING REPLICATE OBJECT")  # DEBUG
            paired_obj_refs = []
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                output_rev_paired_file_path = output_rev_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                     or os.path.getsize (output_fwd_paired_file_path) == 0 \
                   or not os.path.isfile (output_rev_paired_file_path) \
                     or os.path.getsize (output_rev_paired_file_path) == 0:

                    raise ValueError("failed to create paired output")
                else:
                    zero_pad = '0' * (len(str(split_num)) -
                                      len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file':
                        output_fwd_paired_file_path,
                        'rev_file':
                        output_rev_paired_file_path,
                        'ref':
                        input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type':
                        input_item['type'],
                        'name':
                        input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        # SingleEndLibrary
        #
        elif input_item['type'] == self.SE_flag:
            print("SUBSAMPLING SE library " + input_item['name'])

            # file paths
            input_fwd_path = re.sub("\.fastq$", "", input_item['fwd_file'])
            input_fwd_path = re.sub("\.FASTQ$", "", input_fwd_path)
            output_fwd_paired_file_path_base = input_fwd_path + "_fwd_paired"

            # get "paired" ids
            print("DETERMINING IDS")  # DEBUG
            paired_ids = dict()
            paired_ids_list = []
            paired_lib_i = dict()
            paired_buf_size = 100000
            recs_beep_n = 100000

            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        if read_id in paired_ids:
                            raise ValueError("repeat read_id: " + read_id)
                        paired_ids[read_id] = True
                        paired_ids_list.append(read_id)
                        # DEBUG
#                        if rec_cnt % 100 == 0:
#                            print ("read_id: '"+str(read_id)+"'")
#                        rec_cnt += 1
            total_paired_reads = len(paired_ids_list)
            print("TOTAL READS CNT: " + str(total_paired_reads))  # DEBUG

            # Determine sublibrary sizes
            if use_reads_num:
                reads_per_lib = reads_num
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_num <= total_paired_reads_cnt / split_num.  You have reads_num:"
                        + str(reads_num) + " > total_paired_reads_cnt:" +
                        str(total_paired_reads) + " / split_num:" +
                        str(split_num) + ".  Instead try reads_num <= " +
                        str(total_paired_reads // split_num))
            elif use_reads_perc:
                reads_per_lib = int(
                    (subsample_percent / 100.0) * total_paired_reads)
                if reads_per_lib > total_paired_reads // split_num:
                    raise ValueError(
                        "must specify reads_perc <= 1 / split_num.  You have reads_perc:"
                        + str(subsample_percent) + " > 1 / split_num:" +
                        str(split_num) + ".  Instead try reads_perc <= " +
                        str(int(100 * 1 / split_num)))
            else:
                raise ValueError(
                    "error in logic reads_num vs. reads_perc logic")

            # Determine random membership in each sublibrary
            print("GETTING RANDOM SUBSAMPLES")  # DEBUG
            for i, read_id in enumerate(
                    random.sample(paired_ids_list, reads_per_lib * split_num)):
                lib_i = i % split_num
                paired_lib_i[read_id] = lib_i

            # set up for file io
            total_paired_reads = 0
            total_paired_reads_by_set = []
            paired_buf_size = 1000000

            # split reads
            print("WRITING SPLIT SINGLE END READS")  # DEBUG
            paired_output_reads_file_handles = []
            for lib_i in range(split_num):
                paired_output_reads_file_handles.append(
                    open(
                        output_fwd_paired_file_path_base + "-" + str(lib_i) +
                        ".fastq", 'w', paired_buf_size))
                total_paired_reads_by_set.append(0)

            rec_buf = []
            last_read_id = None
            paired_cnt = 0
            recs_beep_n = 1000000
            with open(input_item['fwd_file'], 'r',
                      0) as input_reads_file_handle:
                rec_line_i = -1
                for line in input_reads_file_handle:
                    rec_line_i += 1
                    if rec_line_i == 3:
                        rec_line_i = -1
                    elif rec_line_i == 0:
                        if not line.startswith('@'):
                            raise ValueError("badly formatted rec line: '" +
                                             line + "'")
                        total_paired_reads += 1
                        if last_read_id != None:
                            try:
                                lib_i = paired_lib_i[last_read_id]
                                total_paired_reads_by_set[lib_i] += 1
                                paired_output_reads_file_handles[
                                    lib_i].writelines(rec_buf)
                                paired_cnt += 1
                            except:
                                pass
                            if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                                print("\t" + str(paired_cnt) +
                                      " recs processed")
                            rec_buf = []
                        read_id = line.rstrip('\n')
                        read_id = re.sub("[ \t]+.*$", "", read_id)
                        read_id = re.sub("[\/\.\_\-\:\;][012lrLRfrFR53]\'*$",
                                         "", read_id)
                        last_read_id = read_id
                    rec_buf.append(line)
                # last rec
                if len(rec_buf) > 0:
                    if last_read_id != None:
                        try:
                            lib_i = paired_lib_i[last_read_id]
                            total_paired_reads_by_set[lib_i] += 1
                            paired_output_reads_file_handles[lib_i].writelines(
                                rec_buf)
                            paired_cnt += 1
                        except:
                            pass
                    if paired_cnt != 0 and paired_cnt % recs_beep_n == 0:
                        print("\t" + str(paired_cnt) + " recs processed")
                    rec_buf = []

            for output_handle in paired_output_reads_file_handles:
                output_handle.close()

            # summary
            report = 'SUMMARY FOR SUBSAMPLE OF READ LIBRARY: ' + input_item[
                'name'] + "\n"
            report += "TOTAL READS: " + str(total_paired_reads) + "\n"
            for lib_i in range(split_num):
                report += "SINGLE END READS IN SET " + str(lib_i) + ": " + str(
                    total_paired_reads_by_set[lib_i]) + "\n"
            print(report)

            # make replicate objects to return
            print("MAKING REPLICATE OBJECTS")  # DEBUG
            paired_obj_refs = []
            for lib_i in range(split_num):
                output_fwd_paired_file_path = output_fwd_paired_file_path_base + "-" + str(
                    lib_i) + ".fastq"
                if not os.path.isfile (output_fwd_paired_file_path) \
                     or os.path.getsize (output_fwd_paired_file_path) == 0:

                    raise ValueError("failed to create paired output")
                else:
                    zero_pad = '0' * (len(str(split_num)) -
                                      len(str(lib_i + 1)))
                    replicate_files.append({
                        'fwd_file':
                        output_fwd_paired_file_path,
                        'ref':
                        input_item[
                            'ref'],  # note: this is for the src, not the subsample which is not saved
                        'type':
                        input_item['type'],
                        'name':
                        input_item['name'] + '-' + zero_pad + str(lib_i + 1)
                    })

        else:
            raise ValueError("unknown ReadLibrary type:" +
                             str(input_item['type']) + " for readslibrary: " +
                             input_item['name'])

        return replicate_files

    def _fasta_seq_len_at_least(self, fasta_path, min_fasta_len=1):
        '''
        counts the number of non-header, non-whitespace characters in a FASTA file
        '''
        seq_len = 0
        with open(fasta_path, 'r', 0) as fasta_handle:
            for line in fasta_handle:
                line = line.strip()
                if line.startswith('>'):
                    continue
                line = line.replace(' ', '')
                seq_len += len(line)
                if seq_len >= min_fasta_len:
                    return True
        return False