Example #1
 def save_assembly(self, wsname, output_contigs, token, name, console):
     self.log(console, 'Uploading FASTA file to Assembly')
     assemblyUtil = AssemblyUtil(self.callbackURL, token=token,
                                 service_ver='dev')
     assemblyUtil.save_assembly_from_fasta({'file': {'path': output_contigs},
                                            'workspace_name': wsname,
                                            'assembly_name': name
                                            })
Example #2
    def test_filter_contigs_by_length_01(self):
        method = 'filter_contigs_by_length_01'

        print("\n\nRUNNING: test_filter_contigs_by_length_01()")
        print("===========================================\n\n")

        # upload test data
        try:
            auClient = AssemblyUtil(self.callback_url,
                                    token=self.getContext()['token'])
        except Exception as e:
            raise ValueError(
                'Unable to instantiate auClient with callbackURL: ' +
                self.callback_url + ' ERROR: ' + str(e))
        ass_file_1 = 'assembly_1.fa.gz'
        ass_file_2 = 'assembly_2.fa.gz'
        ass_path_1 = os.path.join(self.scratch, ass_file_1)
        ass_path_2 = os.path.join(self.scratch, ass_file_2)
        shutil.copy(os.path.join("data", ass_file_1), ass_path_1)
        shutil.copy(os.path.join("data", ass_file_2), ass_path_2)
        ass_ref_1 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_1},
            'workspace_name': self.getWsName(),
            'assembly_name': 'assembly_1'
        })
        ass_ref_2 = auClient.save_assembly_from_fasta({
            'file': {'path': ass_path_2},
            'workspace_name': self.getWsName(),
            'assembly_name': 'assembly_2'
        })

        # run method
        input_refs = [ass_ref_1, ass_ref_2]
        base_output_name = method + '_output'
        params = {
            'workspace_name': self.getWsName(),
            'input_assembly_refs': input_refs,
            'min_contig_length': 1000,
            'output_name': 'test_filtered'
        }
        result = self.getImpl().run_filter_contigs_by_length(
            self.getContext(), params)
        print('RESULT:')
        pprint(result)
        pass
Example #3
    def upload_assembly(self, file_path, workspace_name, assembly_name):
        """
        From a list of file paths, uploads them to KBase, generates Assembly objects,
        then returns the generated UPAs.
        """
        if not file_path:
            raise ValueError("file_path must be defined")
        if not os.path.exists(file_path):
            raise ValueError(
                "The given assembly file '{}' does not exist".format(
                    file_path))
        if not workspace_name:
            raise ValueError("workspace_name must be defined")
        if not assembly_name:
            raise ValueError("assembly_name must be defined")

        au = AssemblyUtil(self.callback_url)
        assembly_upa = au.save_assembly_from_fasta({
            'file': {'path': file_path},
            'workspace_name': workspace_name,
            'assembly_name': assembly_name
        })
        return assembly_upa
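
For reference, a minimal standalone sketch of the same save call; the import path may vary by SDK version, and the file path, workspace, and assembly names are placeholders rather than values from the original module:

import os
from installed_clients.AssemblyUtilClient import AssemblyUtil  # import path varies by SDK version

# Hypothetical standalone equivalent of upload_assembly(); paths and names are placeholders.
au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
assembly_upa = au.save_assembly_from_fasta({
    'file': {'path': '/kb/module/work/tmp/my_assembly.fasta'},  # placeholder scratch path
    'workspace_name': 'my_workspace',                           # placeholder workspace
    'assembly_name': 'my_assembly'
})
print('Saved Assembly UPA: ' + assembly_upa)
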
Example #4
 def get_fasta_file(self, filename, obj_name):
     assemblyUtil = AssemblyUtil(self.callback_url)
     assembly_ref = assemblyUtil.save_assembly_from_fasta({'file': {'path': filename},
                                                           'workspace_name': self.getWsName(),
                                                           'assembly_name': obj_name
                                                           })
     return assembly_ref
Example #5
    def getAssemblyInfo(self, ass_name):
        if hasattr(self.__class__, 'assemblyInfo'):
            if self.__class__.assemblyInfo.get(ass_name):
                return self.__class__.assemblyInfo[ass_name]

        # copy the local test file to the shared scratch space so that the AssemblyUtil
        # container can see it.
        test_fasta_file_local = os.path.join('data', 'assemblies', ass_name)
        test_fasta_file_scratch = os.path.join(
            self.scratch, os.path.basename(test_fasta_file_local))
        shutil.copy(test_fasta_file_local, test_fasta_file_scratch)

        # call the AssemblyUtil library to upload the test data to KBase
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        ass_ref = au.save_assembly_from_fasta({
            'file': {'path': test_fasta_file_scratch},
            'workspace_name': self.getWsName(),
            'assembly_name': ass_name
        })

        # get the object metadata for the new test dataset
        new_obj_info = self.ws.get_object_info_new(
            {'objects': [{'ref': ass_ref}]})
        if not hasattr(self.__class__, 'assemblyInfo'):
            self.__class__.assemblyInfo = dict()
        self.__class__.assemblyInfo[ass_name] = new_obj_info[0]
        return new_obj_info[0]
Example #6
    def load_genome_direct(self, filename, assembly_filename, obj_name):
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': self.getWsName(),
            'assembly_name': obj_name + '.assembly',
            'file': {'path': assembly_filename}
        })
        pprint('created test assembly: ' + assembly_ref)

        with open(filename, 'r') as file:
            data_str = file.read()
        data = json.loads(data_str)
        data['assembly_ref'] = assembly_ref
        save_info = {
            'workspace': self.getWsName(),
            'data': data,
            'name': obj_name + '.genome'
        }
        info = self.gaa.save_one_genome_v1(save_info)['info']
        ref = "{}/{}/{}".format(info[6], info[0], info[4])
        print(('created test genome: ' + ref + ' from file ' + filename))
        return ref
Example #7
 def load_genome_direct(cls, filename, assembly_filename, obj_name):
     au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
     assembly_path = os.path.join(cls.cfg['scratch'],
                                  os.path.basename(assembly_filename))
     shutil.copy(assembly_filename, assembly_path)
     assembly_ref = au.save_assembly_from_fasta({
         'workspace_name': cls.wsName,
         'assembly_name': obj_name + '.assembly',
         'file': {
             'path': assembly_path
         }
     })
     data = json.load(open(filename))
     data['assembly_ref'] = assembly_ref
      save_info = {
          'workspace': cls.wsName,
          'objects': [{
              'data': data,
              'name': obj_name + '.genome',
              'type': 'KBaseGenomes.Genome',
          }],
      }
     info = cls.wsClient.save_objects(save_info)[0]
     ref = f"{info[6]}/{info[0]}/{info[4]}"
     print('created test genome: ' + ref + ' from file ' + filename)
     return ref, assembly_ref
Example #8
 def load_fasta_file(self, filename, obj_name, contents):
     f = open(filename, 'w')
     f.write(contents)
     f.close()
     assemblyUtil = AssemblyUtil(self.callback_url)
     assembly_ref = assemblyUtil.save_assembly_from_fasta({'file': {'path': filename},
                                                           'workspace_name': self.getWsName(),
                                                           'assembly_name': obj_name
                                                           })
     return assembly_ref
Example #9
    def jayrbolton_contig_filter(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN jayrbolton_contig_filter
        if not params.get('assembly_input_ref'):
            raise TypeError("`assembly_input_ref` is required")
        if not params.get('min_length') or not isinstance(
                params['min_length'], int):
            raise TypeError("`min_length` is required and needs to be an int")
        min_length = params['min_length']
        # Initialize the assembly util client
        assembly_util = AssemblyUtil(self.callback_url)
        # download the fasta file to local disk
        fasta_file = assembly_util.get_assembly_as_fasta(
            {'ref': params['assembly_input_ref']})
        filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
        report_client = KBaseReport(self.callback_url)
        result = contig_filter(fasta_file['path'], filtered_path, min_length)
        assembly_obj = assembly_util.save_assembly_from_fasta({
            'workspace_name': params['workspace_name'],
            'file': {'path': filtered_path, 'assembly_name': 'filtered_contigs'},
            'assembly_name': 'filtered_assembly'
        })
        report = report_client.create_extended_report({
            'workspace_name': params['workspace_name'],
            'objects_created': [{'ref': assembly_obj, 'description': 'filtered_assembly'}],
            'message': (f"Filtered out {result['n_total'] - result['n_remaining']} "
                        f"records out of {result['n_total']} records.")
        })
        output = {'report_ref': report['ref'], 'report_name': report['name']}
        #END jayrbolton_contig_filter

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method jayrbolton_contig_filter return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
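
A sketch of how this method might be invoked from a generated SDK test, mirroring the getImpl()/getContext() pattern shown in Example #2; the workspace name and assembly UPA are placeholders:

        # Hypothetical test-side call; '12345/6/7' is a placeholder UPA of an existing Assembly.
        params = {
            'workspace_name': 'my_workspace',
            'assembly_input_ref': '12345/6/7',
            'min_length': 1000  # contigs shorter than this are dropped
        }
        output = self.getImpl().jayrbolton_contig_filter(self.getContext(), params)[0]
        print(output['report_name'], output['report_ref'])
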
Example #10
 def prepare_data(cls):
     assembly_file_path = os.path.join(cls.scratch,
                                       'e_coli_assembly.fasta')
     shutil.copy('data/e_coli/e_coli_assembly.fasta', assembly_file_path)
     au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
     assembly_ref = au.save_assembly_from_fasta({
         'workspace_name': cls.wsName,
         'assembly_name': 'e_coli.assembly',
         'file': {'path': assembly_file_path}
     })
     cls.test_genome_data = json.load(open('data/e_coli/e_coli.json'))
     cls.test_genome_data['assembly_ref'] = assembly_ref
Example #11
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'provenance': [
                            {'service': 'GenomeFileUtil',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('GenomeFileUtil'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.ws = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = GenomeFileUtil(cls.cfg)
        gi_config = SDKConfig(cls.cfg)
        cls.genome_interface = GenomeInterface(gi_config)
        # create one WS for all tests
        suffix = int(time.time() * 1000)
        wsName = "test_GenomeAnnotationAPI_" + str(suffix)
        cls.ws.create_workspace({'workspace': wsName})
        cls.wsName = wsName

        # save new genome
        assembly_file_path = os.path.join(cls.cfg['scratch'],
                                          'Rhodo_SPAdes_assembly.fa')
        shutil.copy('data/Rhodo_SPAdes_assembly.fa', assembly_file_path)
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        cls.assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': cls.wsName,
            'assembly_name': 'ecoli.assembly',
            'file': {'path': assembly_file_path}
        })

        rhodobacter_contigs = json.load(open('data/rhodobacter_contigs.json'))
        save_info = {
            'workspace': cls.wsName,
            'objects': [{
                'type': 'KBaseGenomes.ContigSet',
                'data': rhodobacter_contigs,
                'name': 'rhodobacter_contigs'
            }]
        }
        cls.contigset_ref = cls.ws.save_objects(save_info)
Example #12
 def loadAssembly(self):
     if hasattr(self.__class__, 'assembly_ref'):
         return self.__class__.assembly_ref
     fasta_path = os.path.join(self.scratch, 'test.fna')
     shutil.copy(os.path.join('data', 'test.fna'), fasta_path)
     au = AssemblyUtil(self.callback_url)
      assembly_ref = au.save_assembly_from_fasta({
          'file': {'path': fasta_path},
          'workspace_name': self.getWsName(),
          'assembly_name': 'test_assembly'
      })
     self.__class__.assembly_ref = assembly_ref
     return assembly_ref
Example #13
    def prepareTestData(cls):
        """This function creates an assembly object for testing"""
        fasta_content = '>seq1 something soemthing asdf\n' \
                        'agcttttcat\n' \
                        '>seq2\n' \
                        'agctt\n' \
                        '>seq3\n' \
                        'agcttttcatgg'

        filename = os.path.join(cls.scratch, 'test1.fasta')
        with open(filename, 'w') as f:
            f.write(fasta_content)
        assemblyUtil = AssemblyUtil(cls.callback_url)
        cls.assembly_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filename},
            'workspace_name': cls.wsName,
            'assembly_name': 'TestAssembly'
        })
Example #14
def load_fasta_file(callback_url, ws_name, filename, obj_name, contents):
    """
    Loads the given FASTA file into a workspace as an Assembly object.
    """
    f = open(filename, 'w')
    f.write(contents)
    f.close()
    assembly_util = AssemblyUtil(callback_url)
    assembly_ref = assembly_util.save_assembly_from_fasta({
        'file': {'path': filename},
        'workspace_name': ws_name,
        'assembly_name': obj_name
    })
    return assembly_ref
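
A short usage sketch for the helper above; the FASTA contents, workspace name, and file path are placeholders:

import os

# Hypothetical call from a test; the FASTA contents below are made up.
contents = '>contig_1\nAGCTTTTCATTCTGACTGCA\n>contig_2\nAGCTTTTCAT\n'
assembly_ref = load_fasta_file(
    os.environ['SDK_CALLBACK_URL'],        # callback URL provided by the SDK runtime
    'my_test_workspace',                   # placeholder workspace name
    '/kb/module/work/tmp/test.fasta',      # placeholder scratch path to write
    'test_assembly',
    contents)
print('Saved assembly: ' + assembly_ref)
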
Example #15
 def get_genome_ref(self, ws_name, tf='ecoliMG1655.fa'):
     if hasattr(self.__class__, 'genomeInfo'):
         return self.__class__.genomeInfo
     au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
     target = os.path.join(self.scratch, tf)
     self.genome_path = target
     curr_dir = os.path.dirname(os.path.abspath(__file__))
     shutil.copy(os.path.join(curr_dir, 'data', tf), target)
      self.__class__.genomeInfo = au.save_assembly_from_fasta({
          'file': {'path': target},
          'workspace_name': ws_name,
          'assembly_name': tf.split('.fa')[0]
      })
     return self.__class__.genomeInfo
Example #16
    def getBogusAssembly(self):
        # Create a fake assembly with lots of contigs

        assembly_file_name = "bogus.fna"  # "AP009048.fna"
        assembly_temp_file = os.path.join("/kb/module/work/tmp", assembly_file_name)
        with open(assembly_temp_file, "w") as f:
            for i in range(1, 30002):
                f.write("> contig_%d\n" % i)
                f.write("AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC\n")

        assembly_name = "Assembly.2"
        au = AssemblyUtil(os.environ["SDK_CALLBACK_URL"], token=self.getContext()["token"])
        assembly_ref = au.save_assembly_from_fasta({"file": {"path": assembly_temp_file},
                                                    "workspace_name": self.getWsName(),
                                                    "assembly_name": assembly_name})
        self.assembly_ref = assembly_ref
        print("Uploaded bogus assembly " + str(assembly_ref))
        return assembly_ref
Example #17
 def loadAssembly(self):
     if hasattr(self.__class__, 'assembly_ref'):
         return self.__class__.assembly_ref
     # return '23735/1/1'
     fasta_path = os.path.join(self.scratch, 'test_ref.fa')
     shutil.copy(os.path.join('data', 'bt_test_data', 'test_ref.fa'),
                 fasta_path)
     au = AssemblyUtil(self.callback_url)
      assembly_ref = au.save_assembly_from_fasta({
          'file': {'path': fasta_path},
          'workspace_name': self.getWsName(),
          'assembly_name': 'test_assembly'
      })
     self.__class__.assembly_ref = assembly_ref
     print('Loaded Assembly: ' + assembly_ref)
     return assembly_ref
Example #18
    def loadAssembly(self, fa_file_path):
        # if hasattr(self.__class__, 'assembly_ref'):
        #   return self.__class__.assembly_ref

        assembly_nm = os.path.basename(fa_file_path)
        fasta_path = os.path.join(self.scratch, assembly_nm)
        shutil.copy(fa_file_path, fasta_path)
        au = AssemblyUtil(self.callback_url)
        assembly_ref = au.save_assembly_from_fasta({
            'file': {'path': fasta_path},
            'workspace_name': self.getWsName(),
            'assembly_name': assembly_nm.split('.')[0]
        })
        # self.__class__.assembly_ref = assembly_ref
        print(f'Loaded Assembly: {assembly_nm} with ref of {assembly_ref}.')
        return assembly_ref
Example #19
    def loadAssembly(self, assembly_file, params):
        if hasattr(self.__class__, 'assembly_ref'):
            return self.__class__.assembly_ref
        # return '23735/1/1'
        fasta_path = os.path.join(self.scratch,
                                  'Ptrichocarpa_v3.1.assembly.fna')

        shutil.copy(assembly_file, fasta_path)
        #shutil.copy(os.path.join('/kb/module/test/data', 'Ptrichocarpa_v3.1.assembly.fna'), fasta_path)
        au = AssemblyUtil(self.callback_url)
        assembly_ref = au.save_assembly_from_fasta({
            'file': {'path': fasta_path},
            'workspace_name': params["workspace_name"],
            'assembly_name': 'test_assembly'
        })
        self.__class__.assembly_ref = assembly_ref
        print('Loaded Assembly: ' + assembly_ref)
        return assembly_ref
Example #20
    def run_megahit(self, ctx, params):
        """
        :param params: instance of type "MegaHitParams" (Run MEGAHIT. Most
           parameters here are just passed forward to MEGAHIT workspace_name
           - the name of the workspace for input/output read_library_ref -
           the name of the PE read library (SE library support in the future)
           output_contig_set_name - the name of the output contigset
           megahit_parameter_preset - override a group of parameters;
           possible values: meta            '--min-count 2 --k-list
           21,41,61,81,99' (generic metagenomes, default) meta-sensitive 
           '--min-count 2 --k-list 21,31,41,51,61,71,81,91,99' (more
           sensitive but slower) meta-large      '--min-count 2 --k-list
           27,37,47,57,67,77,87' (large & complex metagenomes, like soil)
           bulk            '--min-count 3 --k-list 31,51,71,91,99 --no-mercy'
           (experimental, standard bulk sequencing with >= 30x depth)
           single-cell     '--min-count 3 --k-list 21,33,55,77,99,121
           --merge_level 20,0.96' (experimental, single cell data) min_count
           - minimum multiplicity for filtering (k_min+1)-mers, default 2
           k_min - minimum kmer size (<= 255), must be odd number, defaults
           to 21 k_max - maximum kmer size (<= 255), must be odd number,
           defaults to 141 k_step - increment of kmer size of each iteration
           (<= 28), must be even number, defaults to 10 k_list - list of kmer
           sizes (all must be odd, in the range 15-255, increment <= 28);
           override using `--k-min', `--k-max' and `--k-step'
           min_contig_length - minimum length of contigs to output, default
           is 2000 max_mem_percent - maximum memory to make available to
           MEGAHIT, as a percentage of available system memory (optional,
           default = 0.9 or 90%) @optional megahit_parameter_preset @optional
           min_count @optional k_min @optional k_max @optional k_step
           @optional k_list @optional min_contig_length @optional
           max_mem_percent) -> structure: parameter "workspace_name" of
           String, parameter "read_library_ref" of String, parameter
           "output_contigset_name" of String, parameter
           "megahit_parameter_preset" of String, parameter "min_count" of
           Long, parameter "k_min" of Long, parameter "k_max" of Long,
           parameter "k_step" of Long, parameter "k_list" of list of Long,
           parameter "min_contig_length" of Long, parameter "max_mem_percent"
           of Double
        :returns: instance of type "MegaHitOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_megahit
        print('Running run_megahit with params=')
        pprint(params)

        # STEP 1: basic parameter checks + parsing
        if 'workspace_name' not in params:
            raise ValueError('workspace_name parameter is required')
        if 'read_library_ref' not in params:
            raise ValueError('read_library_ref parameter is required')
        if 'output_contigset_name' not in params:
            raise ValueError('output_contigset_name parameter is required')

        # STEP 2: get the read library as deinterleaved fastq files
        input_ref = params['read_library_ref']
        reads_params = {'read_libraries': [input_ref],
                        'interleaved': 'false',
                        'gzipped': None
                        }
        ru = ReadsUtils(self.callbackURL)
        reads = ru.download_reads(reads_params)['files']

        print('Input reads files:')
        fwd = reads[input_ref]['files']['fwd']
        rev = reads[input_ref]['files']['rev']
        pprint('forward: ' + fwd)
        pprint('reverse: ' + rev)

        # STEP 3: run megahit
        # construct the command
        megahit_cmd = [self.MEGAHIT]

        # we only support PE reads, so add that
        megahit_cmd.append('-1')
        megahit_cmd.append(fwd)
        megahit_cmd.append('-2')
        megahit_cmd.append(rev)

        # if a preset is defined, use that:
        if 'megahit_parameter_preset' in params:
            if params['megahit_parameter_preset']:
                megahit_cmd.append('--presets')
                megahit_cmd.append(params['megahit_parameter_preset'])

        if 'min_count' in params:
            if params['min_count']:
                megahit_cmd.append('--min-count')
                megahit_cmd.append(str(params['min_count']))
        if 'k_min' in params:
            if params['k_min']:
                megahit_cmd.append('--k-min')
                megahit_cmd.append(str(params['k_min']))
        if 'k_max' in params:
            if params['k_max']:
                megahit_cmd.append('--k-max')
                megahit_cmd.append(str(params['k_max']))
        if 'k_step' in params:
            if params['k_step']:
                megahit_cmd.append('--k-step')
                megahit_cmd.append(str(params['k_step']))
        if 'k_list' in params:
            if params['k_list']:
                k_list = []
                for k_val in params['k_list']:
                    k_list.append(str(k_val))
                megahit_cmd.append('--k-list')
                megahit_cmd.append(','.join(k_list))

        min_contig_length = self.DEFAULT_MIN_CONTIG_LENGTH
        if 'min_contig_length' in params:
            if params['min_contig_length']:
                if str(params['min_contig_length']).isdigit():
                    min_contig_length = params['min_contig_length']
                else:
                    raise ValueError('min_contig_length parameter must be a non-negative integer')

        megahit_cmd.append('--min-contig-len')
        megahit_cmd.append(str(min_contig_length))

        # Set the number of CPUs to the number of cores minus 1
        megahit_cmd.append('--num-cpu-threads')
        megahit_cmd.append(str(max([(multiprocessing.cpu_count() - 1), 1])))

        # set mem usage
        # Note: this just sets the default value - 90% of available system memory allocated
        # to the container. Exposing it here as a place to later expose as a parameter.
        max_mem_percent = params.get('max_mem_percent', 0.9)
        megahit_cmd.append('-m')
        megahit_cmd.append(str(max_mem_percent))

        # set the output location
        timestamp = int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds() * 1000)
        output_dir = os.path.join(self.scratch, 'output.' + str(timestamp))
        megahit_cmd.append('-o')
        megahit_cmd.append(output_dir)

        # run megahit
        print('running megahit:')
        print('    ' + ' '.join(megahit_cmd))
        p = subprocess.Popen(megahit_cmd, cwd=self.scratch, shell=False)
        retcode = p.wait()

        print('Return code: ' + str(retcode))
        if p.returncode != 0:
            error_str = report_megahit_error(output_dir, retcode)
            raise RuntimeError(error_str)

        output_contigs = os.path.join(output_dir, 'final.contigs.fa')

        # on macs, we cannot run megahit in the shared host scratch space, so we need to move the file there
        if self.mac_mode:
            shutil.move(output_contigs, os.path.join(self.host_scratch, 'final.contigs.fa'))
            output_contigs = os.path.join(self.host_scratch, 'final.contigs.fa')

        # STEP 4: save the resulting assembly
        assemblyUtil = AssemblyUtil(self.callbackURL)
        output_data_ref = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': output_contigs},
            'workspace_name': params['workspace_name'],
            'assembly_name': params['output_contigset_name']
        })


        # STEP 5: generate and save the report

        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/' + params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' + str(edges[c + 1]) + ' bp\n'

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except ServerError as qe:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from running QUAST')
            print(str(qe))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref, 'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except ServerError as re:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print(str(re))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'], 'report_ref': report_info['ref']}

        #END run_megahit

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_megahit return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
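
A sketch of a typical params dict for run_megahit, using only fields described in the docstring above; the workspace name and read library reference are placeholders:

# Hypothetical input; only the first three keys are required, the rest are optional overrides.
params = {
    'workspace_name': 'my_workspace',              # placeholder workspace
    'read_library_ref': '12345/8/1',               # placeholder UPA of a paired-end read library
    'output_contigset_name': 'megahit.contigs',
    'megahit_parameter_preset': 'meta-sensitive',  # optional; overrides min-count/k-list
    'min_contig_length': 1000                      # optional; default is 2000
}
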
Example #21
    def run_cnelsonAppDemo(self, ctx, params):
        """
        This example function accepts any number of parameters and returns results in a KBaseReport
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_cnelsonAppDemo

        # Print statements to stdout/stderr are captured and available as the App log
        logging.info('Starting run_cnelsonAppDemo function. Params=' +
                     pformat(params))

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        logging.info('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'min_length' not in params:
            raise ValueError(
                'Parameter min_length is not set in input arguments')
        min_length_orig = params['min_length']
        min_length = None
        try:
            min_length = int(min_length_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from min_length parameter (' +
                str(min_length_orig) + ')')
        if min_length < 0:
            raise ValueError('min_length parameter cannot be negative (' +
                             str(min_length) + ')')

        # Step 2 - Download the input data as a FASTA file.
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        logging.info('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(
            {'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.
        good_contigs = []
        n_total = 0
        n_remaining = 0
        for record in SeqIO.parse(fasta_file['path'], 'fasta'):
            n_total += 1
            if len(record.seq) >= min_length:
                good_contigs.append(record)
                n_remaining += 1

        logging.info('Filtered Assembly to ' + str(n_remaining) +
                     ' contigs out of ' + str(n_total))
        filtered_fasta_file = os.path.join(self.shared_folder,
                                           'filtered.fasta')
        SeqIO.write(good_contigs, filtered_fasta_file, 'fasta')

        # Step 4 - Save the new Assembly back to the system
        logging.info('Uploading filtered Assembly data.')
        new_assembly = assemblyUtil.save_assembly_from_fasta({
            'file': {'path': filtered_fasta_file},
            'workspace_name': workspace_name,
            'assembly_name': fasta_file['assembly_name']
        })

        # Step 5 - Build a Report and return
        reportObj = {
            'objects_created': [{'ref': new_assembly, 'description': 'Filtered contigs'}],
            'text_message': ('Filtered Assembly to ' + str(n_remaining) +
                             ' contigs out of ' + str(n_total))
        }
        report = KBaseReport(self.callback_url)
        report_info = report.create({
            'report': reportObj,
            'workspace_name': params['workspace_name']
        })

        # Step 6 - Construct the output to send back
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref'],
            'assembly_output': new_assembly,
            'n_initial_contigs': n_total,
            'n_contigs_removed': n_total - n_remaining,
            'n_contigs_remaining': n_remaining
        }
        logging.info('returning: ' + pformat(output))

        #END run_cnelsonAppDemo

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_cnelsonAppDemo return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
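
A hedged example of the inputs and returned fields for run_cnelsonAppDemo, mirroring the test pattern from Example #2; the UPA and workspace name are placeholders:

        # Hypothetical test-side call; '12345/2/1' is a placeholder UPA of an Assembly object.
        params = {
            'workspace_name': 'my_workspace',
            'assembly_input_ref': '12345/2/1',
            'min_length': 500
        }
        output = self.getImpl().run_cnelsonAppDemo(self.getContext(), params)[0]
        print(output['n_initial_contigs'], output['n_contigs_removed'],
              output['n_contigs_remaining'])
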
Example #22
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)
        self.max_contigs_for_report = 200

    def import_fasta_as_assembly_from_staging(self, params):
        """
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        """
        logging.info(
            '--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            f'params:\n{json.dumps(params, indent=1)}')

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)
        """
        Update the workspace object related meta-data for staged file
        """
        # self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError(f'"{p}" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        _generate_html_report: generate html summary report
        """
        logging.info('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        tmp_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        handler_utils._mkdir_p(tmp_dir)
        result_file_path = os.path.join(tmp_dir, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ['<br/><table>\n']
        for key, val in assembly_overview_data.items():
            overview_content.append(f'<tr><td><b>{key}</b></td>')
            overview_content.append(f'<td>{val}</td></tr>\n')
        overview_content.append('</table>')

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__), 'report_template',
                                 'report_template_assembly.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', ''.join(overview_content))
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': tmp_dir,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id': report_shock_id,
            'name': os.path.basename(result_file_path),
            'label': os.path.basename(result_file_path),
            'description': 'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """
        object_data = self.dfu.get_objects({'object_refs': [obj_ref]})

        report_params = {
            'workspace_name': params.get('workspace_name'),
            'objects_created': [{'ref': obj_ref, 'description': 'Imported Assembly'}],
            'report_object_name': f'kb_upload_assembly_report_{uuid.uuid4()}'
        }

        num_contigs = object_data['data'][0]['data']['num_contigs']
        if num_contigs > self.max_contigs_for_report:
            report_params['message'] = (
                "The uploaded assembly has too many contigs to display "
                "here. Click on the object for a dedicated viewer")
        else:
            output_html_files = self.generate_html_report(
                obj_ref, object_data, params)
            report_params.update({
                'html_links': output_html_files,
                'direct_html_link_index': 0,
                'html_window_height': 375,
            })

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
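
A sketch of the params accepted by import_fasta_as_assembly_from_staging, following the docstring above; the staging path, names, and the config dict are placeholders:

# Hypothetical usage; 'config' is the dict the calling SDK module already holds
# (SDK_CALLBACK_URL, scratch, KB_AUTH_TOKEN).
params = {
    'staging_file_subdir_path': 'subdir_1/my_assembly.fasta',  # relative to the user's staging area
    'assembly_name': 'my_assembly',
    'workspace_name': 'my_workspace'
}
importer = ImportAssemblyUtil(config)
result = importer.import_fasta_as_assembly_from_staging(params)
report = importer.generate_report(result['obj_ref'], params)
print(result['obj_ref'], report['report_name'])
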
Example #23
class GenbankToGenome:
    def __init__(self, config):
        self.cfg = config
        self.gi = GenomeInterface(config)
        self.dfu = DataFileUtil(config.callbackURL)
        self.aUtil = AssemblyUtil(config.callbackURL)
        self.ws = Workspace(config.workspaceURL)
        self._messages = []
        self.time_string = str(
            datetime.datetime.fromtimestamp(
                time.time()).strftime('%Y_%m_%d_%H_%M_%S'))
        yml_text = open('/kb/module/kbase.yml').read()
        self.version = re.search("module-version:\n\W+(.+)\n",
                                 yml_text).group(1)
        self.generate_parents = False
        self.generate_ids = False
        self.genes = OrderedDict()
        self.mrnas = OrderedDict()
        self.cdss = OrderedDict()
        self.noncoding = []
        self.ontologies_present = defaultdict(dict)
        self.ontology_events = list()
        self.skiped_features = Counter()
        self.feature_counts = Counter()
        self.orphan_types = Counter()
        self.contig_seq = {}
        self.circ_contigs = set()
        self.features_spaning_zero = set()
        self.genome_warnings = []
        self.genome_suspect = False
        self.defects = Counter()
        self.spoofed_genes = 0
        self.excluded_features = ('source', 'exon', 'fasta_record')
        self.ont_mappings = load_ontology_mappings('/kb/module/data')
        self.code_table = 11
        self.re_api_url = config.re_api_url
        # dict with feature 'id's that have been used more than once.
        self.used_twice_identifiers = {}
        self.default_params = {
            'source': 'Genbank',
            'taxon_wsname': self.cfg.raw['taxon-workspace-name'],
            'taxon_lookup_obj_name': self.cfg.raw['taxon-lookup-object-name'],
            'ontology_wsname': self.cfg.raw['ontology-workspace-name'],
            'ontology_GO_obj_name': self.cfg.raw['ontology-gene-ontology-obj-name'],
            'ontology_PO_obj_name': self.cfg.raw['ontology-plant-ontology-obj-name'],
            'release': None,
            'genetic_code': 11,
            'generate_ids_if_needed': 0,
            'metadata': {}
        }

    @property
    def messages(self):
        return "\n".join(self._messages)

    def refactored_import(self, ctx, params):
        # 1) validate parameters and extract defaults
        self.validate_params(params)

        # 2) construct the input directory staging area
        input_directory = self.stage_input(params)

        # 3) update default params
        self.default_params.update(params)
        params = self.default_params
        self.generate_parents = params.get('generate_missing_genes')
        self.generate_ids = params.get('generate_ids_if_needed')
        if params.get('genetic_code'):
            self.code_table = params['genetic_code']

        # 4) Do the upload
        files = self._find_input_files(input_directory)
        consolidated_file = self._join_files_skip_empty_lines(files)
        genome = self.parse_genbank(consolidated_file, params)
        if params.get('genetic_code'):
            genome["genetic_code"] = params['genetic_code']

        result = self.gi.save_one_genome({
            'workspace': params['workspace_name'],
            'name': params['genome_name'],
            'data': genome,
            "meta": params['metadata'],
        })
        ref = f"{result['info'][6]}/{result['info'][0]}/{result['info'][4]}"
        logging.info(f"Genome saved to {ref}")

        # 5) clear the temp directory
        shutil.rmtree(input_directory)

        # 6) return the result
        info = result['info']
        details = {'genome_ref': ref, 'genome_info': info}

        return details

    @staticmethod
    def validate_params(params):
        if 'workspace_name' not in params:
            raise ValueError('required "workspace_name" field was not defined')
        if 'genome_name' not in params:
            raise ValueError('required "genome_name" field was not defined')
        if 'file' not in params:
            raise ValueError('required "file" field was not defined')

        # one and only one of 'path', 'shock_id', or 'ftp_url' is required
        file = params['file']
        if not isinstance(file, dict):
            raise ValueError('required "file" field must be a map/dict')
        sources = ('path', 'shock_id', 'ftp_url')
        n_valid_fields = sum(1 for f in sources if file.get(f))
        if n_valid_fields < 1:
            raise ValueError(f'required "file" field must include one source: '
                             f'{", ".join(sources)}')
        if n_valid_fields > 1:
            raise ValueError(
                f'required "file" field has too many sources specified: '
                f'{", ".join(file.keys())}')
        if params.get('genetic_code'):
            if not (isinstance(params['genetic_code'], int)
                    and 0 < params['genetic_code'] < 32):
                raise ValueError(f"Invalid genetic code specified: {params}")

    def stage_input(self, params):
        """ Setup the input_directory by fetching the files and uncompressing if needed. """

        # construct the input directory where we stage files
        input_directory = os.path.join(
            self.cfg.sharedFolder, f'genome-upload-staging-{uuid.uuid4()}')
        os.makedirs(input_directory)

        # at this point, the 'file' input is validated, so we don't have to catch any special cases
        # we expect one and only one of path, shock_id, or ftp_url

        # determine how to get the file: if it is from shock, download it.  If it
        # is just sitting there, then use it.  Move the file to the staging input directory
        file = params['file']
        genbank_file_path = None
        if file.get('path') is not None:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = file['path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if 'shock_id' in file and file['shock_id'] is not None:
            # handle shock file
            logging.info(
                f'Downloading file from SHOCK node: {self.cfg.shockURL} - {file["shock_id"]}'
            )
            sys.stdout.flush()
            file_name = self.dfu.shock_to_file({
                'file_path': input_directory,
                'shock_id': file['shock_id']
            })['node_file_name']
            genbank_file_path = os.path.join(input_directory, file_name)

        if 'ftp_url' in file and file['ftp_url'] is not None:
            logging.info('Downloading file from: ' + str(file['ftp_url']))
            local_file_path = self.dfu.download_web_file({
                'file_url': file['ftp_url'],
                'download_type': 'FTP'
            })['copy_file_path']
            genbank_file_path = os.path.join(input_directory,
                                             os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        # extract the file if it is compressed
        if genbank_file_path is not None:
            logging.info("staged input file =" + genbank_file_path)
            self.dfu.unpack_file({'file_path': genbank_file_path})

        else:
            raise ValueError(
                'No valid files could be extracted based on the input')

        return input_directory

    def parse_genbank(self, file_path, params):
        logging.info("Saving original file to shock")
        shock_res = self.dfu.file_to_shock({
            'file_path': file_path,
            'make_handle': 1,
            'pack': 'gzip',
        })
        # Write and save assembly file
        assembly_ref = self._save_assembly(file_path, params)
        assembly_data = self.dfu.get_objects({
            'object_refs': [assembly_ref],
            'ignore_errors': 0
        })['data'][0]['data']
        genome = {
            "id": params['genome_name'],
            "original_source_file_name": os.path.basename(file_path),
            "assembly_ref": assembly_ref,
            "gc_content": assembly_data['gc_content'],
            "dna_size": assembly_data['dna_size'],
            "md5": assembly_data['md5'],
            "genbank_handle_ref": shock_res['handle']['hid'],
            "publications": set(),
            "contig_ids": [],
            "contig_lengths": [],
        }
        genome['source'], genome['genome_tiers'] = self.gi.determine_tier(
            params['source'])

        if params.get('genome_type'):
            genome['genome_type'] = params['genome_type']

        # Set taxonomy-related fields in the genome
        # Also validates the given taxon ID
        if params.get('taxon_id'):
            set_taxon_data(int(params['taxon_id']), self.re_api_url, genome)
        else:
            set_default_taxon_data(genome)

        dates = []
        # Parse data from genbank file
        contigs = Bio.SeqIO.parse(file_path, "genbank")
        for record in contigs:
            r_annot = record.annotations
            logging.info("parsing contig: " + record.id)
            try:
                dates.append(time.strptime(r_annot.get('date'), "%d-%b-%Y"))
            except (TypeError, ValueError):
                pass
            genome['contig_ids'].append(record.id)
            genome['contig_lengths'].append(len(record))
            genome["publications"] |= self._get_pubs(r_annot)

            # only do the following once(on the first contig)
            if "source_id" not in genome:
                genome["source_id"] = record.id.split('.')[0]
                organism = r_annot.get('organism', 'Unknown Organism')
                if params.get('scientific_name'):
                    genome['scientific_name'] = params['scientific_name']
                else:
                    genome['scientific_name'] = organism
                self.code_table = genome['genetic_code']
                genome["molecule_type"] = r_annot.get('molecule_type', 'DNA')
                genome['notes'] = r_annot.get('comment',
                                              "").replace('\\n', '\n')

            self._parse_features(record, genome['source'])

        genome.update(self.get_feature_lists())

        genome['num_contigs'] = len(genome['contig_ids'])
        # add dates
        dates.sort()
        if dates:
            genome['external_source_origination_date'] = time.strftime(
                "%d-%b-%Y", dates[0])
            if dates[0] != dates[-1]:
                genome['external_source_origination_date'] += " _ " + \
                    time.strftime("%d-%b-%Y", dates[-1])

        if self.ontologies_present:
            genome['ontologies_present'] = dict(self.ontologies_present)
            genome["ontology_events"] = self.ontology_events
        genome['feature_counts'] = dict(self.feature_counts)
        # can't serialize a set
        genome['publications'] = list(genome['publications'])

        if len(genome['cdss']) and (self.defects['cds_seq_not_matching'] /
                                    float(len(genome['cdss'])) > 0.02):
            self.genome_warnings.append(
                warnings["genome_inc_translation"].format(
                    self.defects['cds_seq_not_matching'], len(genome['cdss'])))
            self.genome_suspect = 1

        if self.defects['bad_parent_loc']:
            self.genome_warnings.append(
                f"There were {self.defects['bad_parent_loc']} parent/child "
                "relationships that were not able to be determined. Some of "
                "these may have splice variants that may be valid relationships."
            )

        if self.defects['spoofed_genes']:
            self.genome_warnings.append(warnings['spoofed_genome'].format(
                self.defects['spoofed_genes']))
            genome['suspect'] = 1

        if self.defects['not_trans_spliced']:
            self.genome_warnings.append(
                warnings['genome_not_trans_spliced'].format(
                    self.defects['not_trans_spliced']))
            genome['suspect'] = 1

        if self.genome_warnings:
            genome['warnings'] = self.genome_warnings
        if self.genome_suspect:
            genome['suspect'] = 1
        logging.info(f"Feature Counts: {genome['feature_counts']}")
        return genome

    def _save_assembly(self, genbank_file, params):
        """Convert genbank file to fasta and sve as assembly"""
        contigs = Bio.SeqIO.parse(genbank_file, "genbank")
        assembly_id = f"{params['genome_name']}_assembly"
        fasta_file = f"{self.cfg.sharedFolder}/{params['genome_name']}_assembly.fasta"

        out_contigs = []
        extra_info = defaultdict(dict)
        for in_contig in contigs:
            if in_contig.annotations.get('topology', "") == 'circular':
                extra_info[in_contig.id]['is_circ'] = 1
                self.circ_contigs.add(in_contig.id)
            elif in_contig.annotations.get('topology', "") == 'linear':
                extra_info[in_contig.id]['is_circ'] = 0
            out_contigs.append(in_contig)
            self.contig_seq[in_contig.id] = in_contig.seq.upper()

        assembly_ref = params.get("use_existing_assembly")
        if assembly_ref:
            if not re.match(r"\d+/\d+/\d+", assembly_ref):
                raise ValueError(
                    f"Assembly ref: {assembly_ref} is not a valid format. Must"
                    f" be in numerical <ws>/<object>/<version> format.")
            ret = self.dfu.get_objects({'object_refs':
                                        [assembly_ref]})['data'][0]
            if "KBaseGenomeAnnotations.Assembly" not in ret['info'][2]:
                raise ValueError(
                    f"{assembly_ref} is not a reference to an assembly")
            unmatched_ids = list()
            unmatched_ids_md5s = list()
            for current_contig in self.contig_seq.keys():
                current_contig_md5 = hashlib.md5(
                    str(self.contig_seq[current_contig]).encode(
                        'utf8')).hexdigest()
                if current_contig in ret['data']['contigs']:
                    if current_contig_md5 != ret['data']['contigs'][
                            current_contig]['md5']:
                        unmatched_ids_md5s.append(current_contig)
                else:
                    unmatched_ids.append(current_contig)
            if len(unmatched_ids) > 0:
                raise ValueError(warnings['assembly_ref_extra_contigs'].format(
                    ", ".join(unmatched_ids)))
            if len(unmatched_ids_md5s) > 0:
                raise ValueError(warnings["assembly_ref_diff_seq"].format(
                    ", ".join(unmatched_ids_md5s)))
            logging.info(f"Using supplied assembly: {assembly_ref}")
            return assembly_ref
        logging.info("Saving sequence as Assembly object")
        Bio.SeqIO.write(out_contigs, fasta_file, "fasta")
        assembly_ref = self.aUtil.save_assembly_from_fasta({
            'file': {
                'path': fasta_file
            },
            'workspace_name':
            params['workspace_name'],
            'assembly_name':
            assembly_id,
            'type':
            params.get('genome_type', 'isolate'),
            'contig_info':
            extra_info
        })
        logging.info(f"Assembly saved to {assembly_ref}")
        return assembly_ref
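    # A minimal sketch of the parameter shape passed to save_assembly_from_fasta
    # above; the path and names below are hypothetical placeholders:
    #
    #   self.aUtil.save_assembly_from_fasta({
    #       'file': {'path': '/kb/module/work/tmp/my_genome_assembly.fasta'},
    #       'workspace_name': 'my_workspace',
    #       'assembly_name': 'my_genome_assembly',
    #       'type': 'isolate',
    #       'contig_info': {'contig_1': {'is_circ': 1}, 'contig_2': {'is_circ': 0}}
    #   })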

    def _find_input_files(self, input_directory):
        logging.info("Scanning for Genbank Format files.")
        valid_extensions = [".gbff", ".gbk", ".gb", ".genbank", ".dat", ".gbf"]

        files = os.listdir(os.path.abspath(input_directory))
        logging.info("Genbank Files : " + ", ".join(files))
        genbank_files = [
            x for x in files
            if os.path.splitext(x)[-1].lower() in valid_extensions
        ]

        if len(genbank_files) == 0:
            raise Exception(
                f"The input directory does not have any files with one of the "
                f"following extensions {','.join(valid_extensions)}.")

        logging.info(f"Found {len(genbank_files)} genbank files")

        input_files = []
        for genbank_file in genbank_files:
            input_files.append(os.path.join(input_directory, genbank_file))

        return input_files

    def _join_files_skip_empty_lines(self, input_files):
        """ Applies strip to each line of each input file.
            Args:
                input_files: Paths to input files in Genbank format.
            Returns:
                Path to resulting file (currenly it's the same file as input).
            """
        if len(input_files) == 0:
            raise ValueError("NO GENBANK FILE")
        temp_dir = os.path.join(os.path.dirname(input_files[0]), "combined")
        if not os.path.exists(temp_dir):
            os.makedirs(temp_dir)
        ret_file = os.path.join(temp_dir, os.path.basename(input_files[0]))

        # take in the GenBank files and remove all empty lines from them
        with open(ret_file, 'w', buffering=2**20) as f_out:
            for input_file in input_files:
                with open(input_file, 'r') as f_in:
                    for line in f_in:
                        line = line.rstrip('\r\n')
                        if line.strip():
                            f_out.write(line + '\n')
        return ret_file

    def _get_pubs(self, r_annotations):
        """Get a contig's publications"""
        pub_list = []
        for in_pub in r_annotations.get('references', []):
            # don't add blank pubs
            if not in_pub.authors:
                continue
            out_pub = [
                0,  # pmid
                "",  # source
                in_pub.title,
                "",  # web address
                "",  # date
                in_pub.authors,
                in_pub.journal,
            ]
            date_match = re.match("\((\d{4})\)", in_pub.journal)
            if date_match:
                out_pub[4] = date_match.group(1)
            if in_pub.pubmed_id:
                out_pub[0:4] = [
                    int(in_pub.pubmed_id), "PubMed", in_pub.title,
                    f"http://www.ncbi.nlm.nih.gov/pubmed/{in_pub.pubmed_id}"
                ]
            pub_list.append(tuple(out_pub))
        logging.info(f"Parsed {len(pub_list)} publication records")
        return set(pub_list)
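    # Illustrative sketch of a publication tuple built above; the seven fields
    # are (pmid, source, title, web address, date, authors, journal) and the
    # values shown are hypothetical:
    #
    #   (12345678, 'PubMed', 'Complete genome sequence of strain X',
    #    'http://www.ncbi.nlm.nih.gov/pubmed/12345678', '2004',
    #    'Smith J., Doe A.', 'J. Bacteriol. 186 (2004)')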

    def _get_id(self, feat, tags=None):
        """Assign a id to a feature based on the first tag that exists"""
        _id = ""
        if not tags:
            tags = ['locus_tag', 'kbase_id']
        for t in tags:
            _id = feat.qualifiers.get(t, [""])[0]
            if _id:
                break

        if not _id:
            if feat.type == 'gene':
                if not self.generate_ids:
                    raise ValueError(
                        f"Unable to find a valid id for gene "
                        f"among these tags: {', '.join(tags)}. Correct the "
                        f"file or rerun with generate_ids\n {feat}")
                self.orphan_types['gene'] += 1
                _id = f"gene_{self.orphan_types['gene']}"
            if 'rna' in feat.type.lower() or feat.type in {
                    'CDS', 'sig_peptide', 'five_prime_UTR', 'three_prime_UTR'
            }:
                _id = f"gene_{self.orphan_types['gene']}"

        return _id

    def _parse_features(self, record, source):
        def _location(feat):
            """Convert to KBase style location objects"""
            strand_trans = ("", "+", "-")
            loc = []
            for part in feat.location.parts:
                contig_id = part.ref if part.ref else record.id
                if part.strand >= 0:
                    begin = int(part.start) + 1
                else:
                    begin = int(part.end)
                loc.append(
                    (contig_id, begin, strand_trans[part.strand], len(part)))
            return loc
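        # Illustrative example of the conversion above (hypothetical contig id):
        # a plus-strand part covering bases 0..99 (0-based, exclusive end) of
        # contig_1 becomes ('contig_1', 1, '+', 99), while the same span on the
        # minus strand becomes ('contig_1', 99, '-', 99) -- the begin coordinate
        # is the 1-based start relative to the strand.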

        def _warn(message):
            if message not in out_feat.get('warnings', []):
                out_feat['warnings'] = out_feat.get('warnings', []) + [message]

        def _check_suspect_location(parent=None):
            if 'trans_splicing' in out_feat.get('flags', []):
                return

            if out_feat['location'] == sorted(
                    out_feat['location'],
                    reverse=(in_feature.location.strand == -1)):
                return

            if record.id in self.circ_contigs and \
                    in_feature.location.start == 0 \
                    and in_feature.location.end == len(record):
                self.features_spaning_zero.add(out_feat['id'])
                return

            if parent and parent['id'] in self.features_spaning_zero:
                return

            _warn(warnings['not_trans_spliced'])
            self.defects['not_trans_spliced'] += 1

        for in_feature in record.features:
            if in_feature.type in self.excluded_features:
                self.skiped_features[in_feature.type] += 1
                continue
            feat_seq = self._get_seq(in_feature, record.id)
            if source == "Ensembl":
                _id = self._get_id(in_feature, ['gene', 'locus_tag'])
            else:
                _id = self._get_id(in_feature)

            # The following is common to all the feature types
            out_feat = {
                "id": "_".join([_id, in_feature.type]),
                "location": _location(in_feature),
                "dna_sequence": str(feat_seq),
                "dna_sequence_length": len(feat_seq),
                "md5": hashlib.md5(str(feat_seq).encode('utf8')).hexdigest(),
            }
            if not _id:
                out_feat['id'] = in_feature.type

            # validate input feature
            # note that end is the larger number regardless of strand
            if int(in_feature.location.end) > len(record):
                self.genome_warnings.append(
                    warnings["coordinates_off_end"].format(out_feat['id']))
                self.genome_suspect = 1
                continue

            for piece in in_feature.location.parts:
                if not isinstance(piece.start, ExactPosition) \
                        or not isinstance(piece.end, ExactPosition):
                    _warn(warnings["non_exact_coordinates"])

            self.feature_counts[in_feature.type] += 1

            # add optional fields
            if 'note' in in_feature.qualifiers:
                out_feat['note'] = in_feature.qualifiers["note"][0]

            out_feat.update(self._get_aliases_flags_functions(in_feature))

            ont, db_xrefs = self._get_ontology_db_xrefs(in_feature)
            if ont:
                out_feat['ontology_terms'] = ont
            if db_xrefs:
                out_feat['db_xrefs'] = db_xrefs

            if 'inference' in in_feature.qualifiers:
                out_feat['inference_data'] = parse_inferences(
                    in_feature.qualifiers['inference'])

            _check_suspect_location(self.genes.get(_id))

            # add type specific features
            if in_feature.type == 'CDS':
                self.process_cds(_id, feat_seq, in_feature, out_feat)

            elif in_feature.type == 'gene':
                self.process_gene(_id, out_feat)

            elif in_feature.type == 'mRNA':
                self.process_mrna(_id, out_feat)

            else:
                self.noncoding.append(
                    self.process_noncoding(_id, in_feature.type, out_feat))

    def get_feature_lists(self):
        """sort genes into their final arrays"""
        coding = []
        for g in self.genes.values():
            if len(g['cdss']):
                if g['mrnas'] and len(g['mrnas']) != len(g['cdss']):
                    msg = "The length of the mrna and cdss arrays are not equal"
                    g['warnings'] = g.get('warnings', []) + [msg]

                # remove duplicates that may arise from CDS info propagation
                for key in ('functions', 'aliases', 'db_xrefs'):
                    if key in g:
                        g[key] = list(set(g[key]))
                if not g['mrnas']:
                    del g['mrnas']
                del g['type']
                coding.append(g)
                self.feature_counts["protein_encoding_gene"] += 1
            else:
                del g['mrnas'], g['cdss']
                self.noncoding.append(g)
                self.feature_counts["non_coding_genes"] += 1

        self.feature_counts["non_coding_features"] = len(self.noncoding)
        return {
            'features': coding,
            'non_coding_features': self.noncoding,
            'cdss': list(self.cdss.values()),
            'mrnas': list(self.mrnas.values())
        }

    def _get_seq(self, feat, contig):
        """Extract the DNA sequence for a feature"""
        seq = []
        for part in feat.location.parts:
            strand = part.strand
            # handle trans-splicing across contigs
            if part.ref:
                part_contig = part.ref
            else:
                part_contig = contig

            if strand >= 0:
                seq.append(
                    str(self.contig_seq[part_contig][part.start:part.end]))
            else:
                seq.append(
                    str(self.contig_seq[part_contig]
                        [part.start:part.end].reverse_complement()))
        return "".join(seq)

    def _create_ontology_event(self, ontology_type):
        """Creates the ontology_event if necessary
        Returns the index of the ontology event."""
        if ontology_type not in self.ont_mappings:
            raise ValueError(f"{ontology_type} is not a supported ontology")

        if "event_index" not in self.ont_mappings[ontology_type]:
            self.ont_mappings[ontology_type]['event_index'] = len(
                self.ontology_events)
            if ontology_type == "GO":
                ontology_ref = "KBaseOntology/gene_ontology"
            elif ontology_type == "PO":
                ontology_ref = "KBaseOntology/plant_ontology"
            else:
                ontology_ref = f"KBaseOntology/{ontology_type.lower()}_ontology"
            self.ontology_events.append({
                "method": "GenomeFileUtils Genbank uploader from annotations",
                "method_version": self.version,
                "timestamp": self.time_string,
                "id": ontology_type,
                "ontology_ref": ontology_ref
            })

        return self.ont_mappings[ontology_type]['event_index']

    def _get_ontology_db_xrefs(self, feature):
        """Splits the ontology info from the other db_xrefs"""
        ontology = defaultdict(dict)
        db_xrefs = []
        for key in ("GO_process", "GO_function", "GO_component"):
            ontology_event_index = self._create_ontology_event("GO")
            for term in feature.qualifiers.get(key, []):
                sp = term.split(" - ")
                ontology['GO'][sp[0]] = [ontology_event_index]
                self.ontologies_present['GO'][
                    sp[0]] = self.ont_mappings['GO'].get(sp[0], '')

        for ref in feature.qualifiers.get('db_xref', []):
            if ref.startswith('GO:'):
                ontology['GO'][ref] = [self._create_ontology_event("GO")]
                self.ontologies_present['GO'][ref] = self.ont_mappings[
                    'GO'].get(ref, '')
            elif ref.startswith('PO:'):
                ontology['PO'][ref] = [self._create_ontology_event("PO")]
                self.ontologies_present['PO'][ref] = self.ont_mappings[
                    'PO'].get(ref, '')
            elif ref.startswith('KO:'):
                ontology['KO'][ref] = [self._create_ontology_event("KO")]
                self.ontologies_present['KO'][ref] = self.ont_mappings[
                    'KO'].get(ref, '')
            elif ref.startswith('COG'):
                ontology['COG'][ref] = [self._create_ontology_event("COG")]
                self.ontologies_present['COG'][ref] = self.ont_mappings[
                    'COG'].get(ref, '')
            elif ref.startswith('PF'):
                ontology['PFAM'][ref] = [self._create_ontology_event("PFAM")]
                self.ontologies_present['PFAM'][ref] = self.ont_mappings[
                    'PFAM'].get(ref, '')
            elif ref.startswith('TIGR'):
                ontology['TIGRFAM'][ref] = [
                    self._create_ontology_event("TIGRFAM")
                ]
                self.ontologies_present['TIGRFAM'][ref] = self.ont_mappings[
                    'TIGRFAM'].get(ref, '')
            elif ":" not in ref:
                db_xrefs.append(tuple(["Unknown_Source", ref]))
            else:
                db_xrefs.append(tuple(ref.split(":", 1)))

        return dict(ontology), sorted(db_xrefs)

    @staticmethod
    def _get_aliases_flags_functions(feat):
        """Get the values for aliases flags and features from qualifiers"""
        alias_keys = {
            'locus_tag', 'old_locus_tag', 'protein_id', 'transcript_id',
            'gene', 'EC_number', 'gene_synonym'
        }
        result = defaultdict(list)
        for key, val_list in feat.qualifiers.items():
            if key in alias_keys:
                result['aliases'].extend([(key, val) for val in val_list])
            # flags have no other information associated with them
            if val_list == ['']:
                result['flags'].append(key)
            if key == 'function':
                result['functional_descriptions'].extend(
                    val_list[0].split('; '))
            if key == 'product':
                result['functions'] = val_list

        return result

    def _find_parent_gene(self, potential_id, feature):
        """Unfortunately, Genbank files don't have a parent ID and the features can be out of
        order at times. To account for this, the this function works backwards from the end of
        list of IDs and stops when if finds a parent with valid coordinates or it hits the maximum
        number of tries"""
        if potential_id in self.genes:
            lookup_attempts = 0
            while lookup_attempts < MAX_PARENT_LOOKUPS:
                if is_parent(self.genes[potential_id], feature):
                    return potential_id

                lookup_attempts += 1
                try:
                    potential_id = list(
                        self.genes.keys())[-(lookup_attempts + 1)]
                except IndexError:
                    break  # no more genes that could match exist

            self.defects['bad_parent_loc'] += 1
        return None

    def assign_new_id(self, _id):
        """given a feature id that has already been used, add a unique modifier to it"""
        _id_modifier = self.used_twice_identifiers.get(_id, 1)
        self.used_twice_identifiers[_id] = _id_modifier + 1
        return _id + "." + str(_id_modifier)

    def process_gene(self, _id, out_feat):
        out_feat.update({
            "id": _id,
            "type": 'gene',
            "mrnas": [],
            'cdss': [],
        })
        if _id in self.genes:
            _id = self.assign_new_id(_id)
            out_feat.update({"id": _id})
            # raise ValueError(f"Duplicate gene ID: {_id}")
        self.genes[_id] = out_feat

    def process_noncoding(self, gene_id, feat_type, out_feat):
        out_feat["type"] = feat_type

        # this prevents big misc_features from blowing up the genome size
        if out_feat['dna_sequence_length'] > MAX_MISC_FEATURE_SIZE:
            del out_feat['dna_sequence']

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            if 'children' not in self.genes[gene_id]:
                self.genes[gene_id]['children'] = []
            out_feat['id'] += "_" + str(
                len(self.genes[gene_id]['children']) + 1)
            self.genes[gene_id]['children'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types[feat_type] += 1
            out_feat['id'] += "_" + str(self.orphan_types[feat_type])

        return out_feat

    def process_mrna(self, gene_id, out_feat):
        if gene_id not in self.genes and self.generate_parents:
            self.process_gene(gene_id, copy.copy(out_feat))

        gene_id = self._find_parent_gene(gene_id, out_feat)
        if gene_id:
            out_feat['id'] = "_".join(
                (gene_id, "mRNA", str(len(self.genes[gene_id]['mrnas']) + 1)))
            self.genes[gene_id]['mrnas'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['mrna'] += 1
            out_feat['id'] = f"mRNA_{self.orphan_types['mrna']}"
            out_feat['warnings'] = out_feat.get('warnings', []) + [
                'Unable to find parent gene for ' + str(out_feat['id'])
            ]

        self.mrnas[out_feat['id']] = out_feat

    def process_cds(self, gene_id, feat_seq, in_feature, out_feat):
        # Associate CDS with parents
        cds_warnings = out_feat.get('warnings', [])
        validated_gene_id = self._find_parent_gene(gene_id, out_feat)
        if validated_gene_id:
            out_feat['id'] = "_".join(
                (validated_gene_id, "CDS",
                 str(len(self.genes[validated_gene_id]['cdss']) + 1)))
            self.genes[validated_gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = validated_gene_id
        elif self.generate_parents and gene_id not in self.genes:
            new_feat = copy.copy(out_feat)
            new_feat['id'] = gene_id
            new_feat['warnings'] = [warnings['spoofed_gene']]
            self.orphan_types['gene'] += 1
            self.defects['spoofed_genes'] += 1
            self.process_gene(new_feat['id'], new_feat)

            out_feat['id'] = "_".join(
                (gene_id, "CDS", str(len(self.genes[gene_id]['cdss']) + 1)))
            self.genes[gene_id]['cdss'].append(out_feat['id'])
            out_feat['parent_gene'] = gene_id
        else:
            self.orphan_types['cds'] += 1
            out_feat['id'] = f"CDS_{self.orphan_types['cds']}"
            cds_warnings.append(
                f"Unable to find parent gene for {out_feat['id']}")

        # there is a 1 to 1 relationship of mRNA to CDS so XXX_mRNA_1 will match XXX_CDS_1
        mrna_id = out_feat["id"].replace('CDS', 'mRNA')
        if mrna_id in self.mrnas:
            if not is_parent(self.mrnas[mrna_id], out_feat):
                cds_warnings.append(warnings['cds_mrna_cds'].format(mrna_id))
                self.mrnas[mrna_id]['warnings'] = self.mrnas[mrna_id].get(
                    'warnings', []) + [warnings['cds_mrna_mrna']]
                self.defects['bad_parent_loc'] += 1
            else:
                out_feat['parent_mrna'] = mrna_id
                self.mrnas[mrna_id]['cds'] = out_feat['id']

        # process protein
        prot_seq = in_feature.qualifiers.get("translation", [""])[0]

        # allow a little slack to account for frameshift and stop codon
        if prot_seq and abs(len(prot_seq) * 3 - len(feat_seq)) > 4:
            cds_warnings.append(warnings["inconsistent_CDS_length"].format(
                len(feat_seq), len(prot_seq)))
            self.genome_warnings.append(
                warnings['genome_inc_CDS_length'].format(
                    out_feat['id'], len(feat_seq), len(prot_seq)))
            self.genome_suspect = 1

        try:
            if prot_seq and prot_seq != Seq.translate(
                    feat_seq, self.code_table, cds=True).strip("*"):
                cds_warnings.append(warnings["inconsistent_translation"])
                self.defects['cds_seq_not_matching'] += 1

        except TranslationError as e:
            cds_warnings.append("Unable to verify protein sequence:" + str(e))

        if not prot_seq:
            try:
                prot_seq = Seq.translate(feat_seq, self.code_table,
                                         cds=True).strip("*")
                cds_warnings.append(warnings["no_translation_supplied"])

            except TranslationError as e:
                cds_warnings.append(warnings["no_translation_supplied"] +
                                    str(e))

        out_feat.update({
            "protein_translation":
            prot_seq,
            "protein_md5":
            hashlib.md5(prot_seq.encode('utf8')).hexdigest(),
            "protein_translation_length":
            len(prot_seq),
        })

        if out_feat.get('parent_gene'):
            propagate_cds_props_to_gene(out_feat,
                                        self.genes[out_feat['parent_gene']])

        if cds_warnings:
            out_feat['warnings'] = cds_warnings

        self.cdss[out_feat['id']] = out_feat
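    # A minimal standalone sketch of the translation-consistency check used in
    # process_cds above; the DNA and protein strings are hypothetical:
    #
    #   from Bio import Seq
    #   dna = "ATGGCCTAA"                                  # Met-Ala-Stop
    #   supplied_protein = "MA"
    #   translated = Seq.translate(dna, table=11, cds=True).strip("*")
    #   assert translated == supplied_protein              # a mismatch is recorded as a warning
    #
    # With cds=True Biopython also checks the start codon, the stop codon, and
    # that the length is a multiple of three, raising TranslationError otherwise.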
Example #24
0
    def finish_run(self, params):
        """
        Finish up the run by uploading output and
        creating the report
        """
        console = []
        self.log(console, 'Running post')

        # run hipmer, capture output as it happens
        self.log(console, 'running hipmer:')

        # grab path of output contigs
        output_contigs = ''
        for root, subdirs, files in os.walk(self.scratch):
            for f in files:
                if f == 'final_assembly.fa':
                    output_contigs = os.path.join(root,f)
                    print("found OUTPUT CONTIGS {}".format(output_contigs))
                    continue

        output_name = params['output_contigset_name']
        slurm_out = os.path.join(self.scratch, 'slurm.out')

        if not os.path.exists(output_contigs):
            self.log(console, "It looks like HipMER failed. Could not find the output contigs.")
            self.log(console, "Show errors in log file")
            with open(slurm_out, 'r') as f:
                for line in f:
                    if line.lower().find('error') >= 0:
                        self.log(console, line)
            raise RuntimeError("Error in HipMER execution")

        wsname = params['workspace_name']

        self.log(console, 'Filtering short length contigs from HipMer assembly')

        assemblyUtil = AssemblyUtil(self.callbackURL, token=self.token)

        assembly_size_filter = params['assembly_size_filter']

        filtered_fasta_file_path = self.filter_contigs_by_length(output_contigs, assembly_size_filter)

        if os.stat(filtered_fasta_file_path).st_size == 0:
            raise ValueError("Error: Using input parameters, you have filtered all contigs from the HipMer \
                             assembly. Decrease the minimum contig size and try again.")
        else:
            output_contigs = filtered_fasta_file_path

        self.log(console, 'Uploading FASTA file to Assembly')

        save_input = {'file': {'path': output_contigs},
                      'workspace_name': wsname,
                      'assembly_name': output_name
                      }

        output_data_ref = assemblyUtil.save_assembly_from_fasta(save_input)

        # create a Report
        # compute a simple contig length distribution for the report
        lengths = []
        for seq_record in SeqIO.parse(output_contigs, 'fasta'):
            lengths.append(len(seq_record.seq))

        report = ''
        report += 'ContigSet saved to: ' + params['workspace_name'] + '/'
        report += params['output_contigset_name'] + '\n'
        report += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report += 'Contig Length Distribution (# of contigs -- min to max basepairs):\n'
        for c in range(bins):
            report += '   %d\t--\t%d' % (counts[c], edges[c])
            report += ' to %d bp\n' % (edges[c + 1])
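        # Each iteration above appends one histogram line of the form
        # (hypothetical values): "   12\t--\t1000 to 2500 bp"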

        print('Running QUAST')
        kbq = kb_quast(self.callbackURL)
        try:
            quastret = kbq.run_QUAST({'files': [{'path': output_contigs,
                                                 'label': params['output_contigset_name']}]})
        except Exception as e:
            # not really any way to test this, all inputs have been checked
            # earlier and should be ok
            print('Logging exception from running QUAST')
            print((str(e)))
            # TODO delete shock node
            raise

        print('Saving report')
        kbr = KBaseReport(self.callbackURL)
        try:
            report_info = kbr.create_extended_report(
                {'message': report,
                 'objects_created': [{'ref': output_data_ref,
                                      'description': 'Assembled contigs'}],
                 'direct_html_link_index': 0,
                 'html_links': [{'shock_id': quastret['shock_id'],
                                 'name': 'report.html',
                                 'label': 'QUAST report'}
                                ],
                 'report_object_name': 'kb_megahit_report_' + str(uuid.uuid4()),
                 'workspace_name': params['workspace_name']
                 })
        except Exception as e:
            # not really any way to test this, all inputs have been checked earlier and should be
            # ok
            print('Logging exception from creating report object')
            print((str(e)))
            # TODO delete shock node
            raise

        # STEP 6: construct the output to send back
        output = {'report_name': report_info['name'],
                  'report_ref': report_info['ref']
                  }
        return output
Example #25
0
    def setUpClass(cls):
        token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        config = configparser.ConfigParser()
        config.read(config_file)
        cls.cfg = {n[0]: n[1] for n in config.items('GenomeAnnotationAPI')}
        authServiceUrl = cls.cfg.get('auth-service-url',
                                     "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'GenomeAnnotationAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        cls.ws = Workspace(cls.cfg['workspace-url'], token=token)
        cls.impl = GenomeAnnotationAPI(cls.cfg)
        test_gbk_file = "/kb/module/test/data/kb_g.399.c.1.gbk"
        temp_gbk_file = "/kb/module/work/tmp/kb_g.399.c.1.gbk"
        shutil.copy(test_gbk_file, temp_gbk_file)
        suffix = int(time.time() * 1000)
        wsName = "test_GenomeAnnotationAPI_" + str(suffix)
        cls.ws.create_workspace({'workspace': wsName})
        cls.wsName = wsName

        data = json.load(open('data/rhodobacter_contigs.json'))
        # save to ws
        save_info = {
            'workspace': wsName,
            'objects': [{
                'type': 'KBaseGenomes.ContigSet',
                'data': data,
                'name': 'rhodo_contigs'
            }]
        }
        info = cls.ws.save_objects(save_info)[0]
        contigset_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        data = json.load(open('data/rhodobacter.json'))
        data['contigset_ref'] = contigset_ref
        # save to ws
        info = cls.impl.save_one_genome_v1(cls.ctx, {
            'workspace': wsName,
            'name': "rhodobacter",
            'data': data,
        })[0]['info']
        cls.old_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
            info[4])
        print('created old test genome')

        assembly_file_path = os.path.join(cls.cfg['scratch'],
                                          'e_coli_assembly.fasta')
        shutil.copy('data/e_coli_assembly.fasta', assembly_file_path)
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': cls.wsName,
            'assembly_name': 'ecoli.assembly',
            'file': {'path': assembly_file_path}
        })
        data = json.load(open('data/new_ecoli_genome.json'))
        data['assembly_ref'] = assembly_ref
        # save to ws
        save_info = {
            'workspace': wsName,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': 'new_ecoli'
            }]
        }
        info = cls.ws.save_objects(save_info)[0]
        cls.new_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        print('created new test genome')
Example #26
0
    def setUpClass(cls):
        print('Setting up class')
        token = os.environ.get('KB_AUTH_TOKEN', None)
        config_file = os.environ.get('KB_DEPLOYMENT_CONFIG', None)
        config = configparser.ConfigParser()
        config.read(config_file)
        cls.cfg = {n[0]: n[1] for n in config.items('GenomeAnnotationAPI')}
        authServiceUrl = cls.cfg.get('auth-service-url',
                "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'GenomeAnnotationAPI',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        cls.ws = Workspace(cls.cfg['workspace-url'], token=token)
        cls.impl = GenomeAnnotationAPI(cls.cfg)

        # Second user
        test_cfg_file = '/kb/module/work/test.cfg'
        test_cfg_text = "[test]\n"
        with open(test_cfg_file, "r") as f:
            test_cfg_text += f.read()
        config = configparser.ConfigParser()
        config.read_file(io.StringIO(test_cfg_text))
        test_cfg_dict = dict(config.items("test"))
        if ('test_token2' not in test_cfg_dict):
            raise ValueError("Configuration in <module>/test_local/test.cfg file should " +
                             "include second user credentials ('test_token2')")
        token2 = test_cfg_dict['test_token2']
        user2 = auth_client.get_user(token2)
        cls.ctx2 = MethodContext(None)
        cls.ctx2.update({'token': token2,
                         'user_id': user2,
                         'provenance': [
                            {'service': 'NarrativeService',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                         'authenticated': 1})
        
        # create one WS for all tests
        suffix = int(time.time() * 1000)
        wsName = "test_GenomeAnnotationAPI_" + str(suffix)
        ret = cls.ws.create_workspace({'workspace': wsName})
        cls.wsName = wsName

        # preload with reference data
        with open('data/rhodobacter.json', 'r') as file:
            data_str = file.read()
        data = json.loads(data_str)
        # save old genome
        info = cls.impl.save_one_genome_v1(cls.ctx, {
               'workspace': wsName,
               'name': "rhodobacter",
               'data': data,
           })[0]['info']
        cls.rhodobacter_ref = str(info[6]) +'/' + str(info[0]) + '/' + str(info[4])
        print('created rhodobacter test genome: ' + cls.rhodobacter_ref)

        assembly_file_path = os.path.join(cls.cfg['scratch'],
                                          'e_coli_assembly.fasta')
        shutil.copy('data/e_coli_assembly.fasta', assembly_file_path)
        au = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        assembly_ref = au.save_assembly_from_fasta({
            'workspace_name': cls.wsName,
            'assembly_name': 'ecoli.assembly',
            'file': {'path': assembly_file_path}
        })
        data = json.load(open('data/new_ecoli_genome.json'))
        data['assembly_ref'] = assembly_ref
        # save new genome
        save_info = {
            'workspace': wsName,
            'objects': [{
                'type': 'KBaseGenomes.Genome',
                'data': data,
                'name': 'new_ecoli'
            }]
        }
        info = cls.ws.save_objects(save_info)[0]
        cls.new_genome_ref = str(info[6]) + '/' + str(info[0]) + '/' + str(
            info[4])
        print('created new test genome')
Example #27
0
class VirSorterUtils:
    def __init__(self, config):
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.mgu = MetagenomeUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.ws = Workspace(config['workspace-url'], token=config['token'])

    def VirSorter_help(self):
        command = 'wrapper_phage_contigs_sorter_iPlant.pl --help'
        self._run_command(command)

    def get_fasta(self, ref):
        # check type of object, i.e KBaseGenomeAnnotations.Assembly-3.0
        obj_type = self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0][2]
        if 'assembly' in obj_type.lower():
            genome_ref = ref
        elif 'kbasegenomes' in obj_type.lower():
            data = self.ws.get_objects2({
                'objects': [{
                    'ref': ref,
                    'included': ['assembly_ref'],
                    'strict_maps': 1
                }]
            })['data'][0]['data']
            genome_ref = data['assembly_ref']
        else:
            raise ValueError(
                f"Input reference {ref} is of type {obj_type}. Type KBaseGenomes.Genome or "
                f"KBaseGenomeAnnotations.Assembly required.")
        return self.au.get_assembly_as_fasta({'ref': genome_ref})['path']
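    # Minimal usage sketch of get_fasta above; the object reference is a
    # hypothetical numerical <ws>/<object>/<version> string:
    #
    #   fasta_path = self.get_fasta('123/45/6')  # accepts an Assembly or a Genome ref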

    def run_VirSorter(self, params):

        params['SDK_CALLBACK_URL'] = self.callback_url
        params['KB_AUTH_TOKEN'] = os.environ['KB_AUTH_TOKEN']

        # Get contigs from 'assembly'
        genome_fp = self.get_fasta(params['genomes'])

        command = 'wrapper_phage_contigs_sorter_iPlant.pl --data-dir /data/virsorter-data'

        # Add in first args
        command += f' -f {genome_fp} --db {params["database"]}'

        # Check if additional genomes were submitted
        if params.get('add_genomes'):
            add_genomes_fp = self.get_fasta(params['add_genomes'])
            print(f'Added genomes DETECTED: {add_genomes_fp}')
            command += f' --cp {add_genomes_fp}'

        bool_args = ['virome', 'diamond', 'keep_db',
                     'no_c']  # keep_db = keep-db

        for bool_arg in bool_args:
            if params[bool_arg] == 1:  # a value of 1 means the flag was selected in the UI
                if bool_arg == 'keep_db':
                    bool_arg = 'keep-db'

                command += f' --{bool_arg}'

        self._run_command(command)

        report = self._generate_report(
            params)  # Basically, do everything that's after the tool runs

        return report

    def _run_command(self, command):
        """

        :param command:
        :return:
        """

        log('Start executing command:\n{}'.format(command))
        pipe = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
        output, err = pipe.communicate()
        exitCode = pipe.returncode

        if exitCode == 0:
            log('Executed command:\n{}\n'.format(command) +
                'Exit Code: {}\nOutput:\n{}'.format(exitCode, output))
        else:
            error_msg = 'Error running command:\n{}\n'.format(command)
            error_msg += 'Exit Code: {}\nOutput:\n{}\nError: {}'.format(
                exitCode, output, err)
            raise RuntimeError(error_msg)

    def _parse_summary(self, virsorter_global_fp, affi_contigs_shock_id):
        columns = [
            'Contig_id',
            'Nb genes contigs',
            'Fragment',
            'Nb genes',
            'Category',
            'Nb phage hallmark genes',
            'Phage gene enrichment sig',
            'Non-Caudovirales phage gene enrichment sig',
            'Pfam depletion sig',
            'Uncharacterized enrichment sig',
            'Strand switch depletion sig',
            'Short genes enrichment sig',
        ]

        try:
            with open(virsorter_global_fp, 'r') as vir_fh:
                data = {}
                category = ''
                for line in vir_fh:
                    if line.startswith('## Contig_id'):
                        continue
                    elif line.startswith(
                            '## '
                    ):  # If 'header' lines are consumed by 1st if, then remaining should be good
                        category = line.split('## ')[-1].split(' -')[0]
                    else:
                        values = line.strip().split(',')
                        data[values[0]] = dict(zip(columns[1:], values[1:]))
        except OSError:
            vir_path = os.path.join(os.getcwd(), 'virsorter-out')
            files = os.listdir(vir_path)
            raise RuntimeError(
                f"{virsorter_global_fp} is not a file. existing files {files}."
            )

        df = pd.DataFrame().from_dict(data, orient='index')
        df.index.name = columns[0]
        df.reset_index(inplace=True)

        html = df.to_html(index=False,
                          classes='my_class table-striped" id = "my_id')

        # Need to file write below
        direct_html = html_template.substitute(
            html_table=html, affi_contigs_shock_id=affi_contigs_shock_id)

        # Find header so it can be copied to footer, as dataframe.to_html doesn't include footer
        start_header = Literal("<thead>")
        end_header = Literal("</thead>")

        text = start_header + SkipTo(end_header)

        new_text = ''
        for data, start_pos, end_pos in text.scanString(direct_html):
            new_text = ''.join(data).replace(
                ' style="text-align: right;"', '').replace(
                    'thead>', 'tfoot>\n  ') + '\n</tfoot>'

        # Get start and end positions to insert new text
        end_tbody = Literal("</tbody>")
        end_table = Literal("</table>")

        insertion_pos = end_tbody + SkipTo(end_table)

        final_html = ''
        for data, start_pos, end_pos in insertion_pos.scanString(direct_html):
            final_html = direct_html[:start_pos +
                                     8] + '\n' + new_text + direct_html[
                                         start_pos + 8:]

        return final_html

    def get_assembly_contig_ids(self, assembly_ref):
        """get contig ids from assembly_ref"""
        contigs = self.ws.get_objects2(
            {'objects': [{
                'ref': assembly_ref,
                'included': ['contigs']
            }]})['data'][0]['data']['contigs']
        return contigs.keys()

    def _generate_report(self, params):
        """

        :param params:
        :return:
        """

        # Get URL
        self.dfu = dfu(params['SDK_CALLBACK_URL'])

        # Output directory should be $PWD/virsorter-out - ASSUMES that's the output location
        virsorter_outdir = os.path.join(os.getcwd(), 'virsorter-out')

        print(
            f'VIRSorter output directory contents: {os.listdir(virsorter_outdir)}'
        )

        # Replacing individual download files with BinnedContigs

        # kb_deseq adds output files, then builds report files and sends all of them to the workspace
        output_files = []  # Appended list of dicts containing attributes

        # Collect all the files needed to report to end-user
        # Get all predicted viral sequences
        pred_fnas = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.fasta'))
        pred_gbs = glob.glob(
            os.path.join(virsorter_outdir,
                         'Predicted_viral_sequences/VIRSorter_*.gb'))
        # Summary 'table'
        glob_signal = os.path.join(virsorter_outdir,
                                   'VIRSorter_global-phage-signal.csv')

        print('Identified the following predicted viral sequences:\n{}'.format(
            '\n\t'.join(pred_fnas)))

        if len(pred_fnas) == 0:
            print(
                f"Unable to find predicted viral sequences, here are the directory's content:\n"
                f"{os.listdir(os.path.join(virsorter_outdir, 'Predicted_viral_sequences'))}"
            )

        if os.path.exists(glob_signal):

            print(f'Identified the global phage signal: {glob_signal}')

            lines = -1  # Don't count header
            with open(glob_signal) as fh:
                for ln in fh:
                    lines += 1

            if lines == 0:
                print('But it is EMPTY!')

        else:
            print(
                'Unable to find the global phage signal file. Was there an error during the run?'
            )

        # Append error and out files from VIRSorter
        err_fp = os.path.join(virsorter_outdir, 'logs/err')
        # if os.path.exists(err_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/err'),
        #         'name': 'VIRSorter_err',
        #         'label': 'VIRSorter_err',
        #         'description': 'VIRSorter error log file, generated from the tool itself.'
        #     })
        out_fp = os.path.join(virsorter_outdir, 'logs/out')
        # if os.path.exists(out_fp):
        #     output_files.append({
        #         'path': os.path.join(virsorter_outdir, 'logs/out'),
        #         'name': 'VIRSorter_out',
        #         'label': 'VIRSorter_out',
        #         'description': 'VIRSorter output log file, generated from the tool itself.'
        #     })

        if not (os.path.exists(err_fp) or os.path.exists(out_fp)):
            print(
                'Unable to find err and/or out files in LOG directory, contents:'
            )
            print(os.listdir(os.path.join(virsorter_outdir, 'logs')))

        # Make output directory
        output_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(output_dir)

        # Deal with nucleotide and protein fasta
        pred_fna_tgz_fp = os.path.join(output_dir,
                                       'VIRSorter_predicted_viral_fna.tar.gz')
        with tarfile.open(
                pred_fna_tgz_fp,
                'w:gz') as pred_fna_tgz_fh:  # Compress to minimize disk usage
            for pred_fna in pred_fnas:
                pred_fna_tgz_fh.add(pred_fna,
                                    arcname=os.path.basename(pred_fna))
        output_files.append({
            'path':
            pred_fna_tgz_fp,
            'name':
            os.path.basename(pred_fna_tgz_fp),
            'label':
            os.path.basename(pred_fna_tgz_fp),
            'description':
            'FASTA-formatted nucleotide sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_fna_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in FASTA format: '
                f'{pred_fna_tgz_fp}')

        pred_gb_tgz_fp = os.path.join(output_dir,
                                      'VIRSorter_predicted_viral_gb.tar.gz')
        with tarfile.open(pred_gb_tgz_fp, 'w:gz') as pred_gb_tgz_fh:
            for pred_gb in pred_gbs:
                pred_gb_tgz_fh.add(pred_gb, arcname=os.path.basename(pred_gb))
        output_files.append({
            'path':
            pred_gb_tgz_fp,
            'name':
            os.path.basename(pred_gb_tgz_fp),
            'label':
            os.path.basename(pred_gb_tgz_fp),
            'description':
            'Genbank-formatted sequences of VIRSorter predicted viruses'
        })

        if os.path.exists(pred_gb_tgz_fp):
            print(
                f'Generated gzipped version of the predicted viral sequences in Genbank format: '
                f'{pred_gb_tgz_fp}')

        # To create BinnedContig, need to create another directory with each of the "bins" as separate files?
        binned_contig_output_dir = os.path.join(self.scratch,
                                                str(uuid.uuid4()))
        self._mkdir_p(binned_contig_output_dir)

        # Before creating final HTML output, need to create BinnedContig object so other tools/users can take advantage
        # of its features, but also to feed more easily into other tools (e.g. vConTACT)
        created_objects = []  # Will store the objects that go to the workspace

        # load contig ids from the assembly input
        # assembly_contig_ids = self.get_assembly_contig_ids(self.assembly_ref)
        assembly_contig_ids = self.get_assembly_contig_ids(
            params['genomes'])  # Will fail for Genome

        summary_fp = os.path.join(
            binned_contig_output_dir,
            'VIRSorter.summary')  # Anything that ends in .summary
        with open(summary_fp, 'w') as summary_fh:

            summary_writer = csv.writer(summary_fh,
                                        delimiter='\t',
                                        quoting=csv.QUOTE_MINIMAL)
            summary_writer.writerow(
                ['Bin name', 'Completeness', 'Genome size', 'GC content'])

            for category_fp in pred_fnas:
                # _get_bin_ids from MetaGenomeUtils requires files to follow the header.0xx.fasta convention
                category = os.path.basename(category_fp).split(
                    'cat-')[-1].split('.')[0]
                dest_fn = 'VirSorter.{}.fasta'.format(category.zfill(3))
                dest_fp = os.path.join(output_dir, dest_fn)
                binned_contig_fp = os.path.join(binned_contig_output_dir,
                                                dest_fn)

                genome_size = 0
                gc_content = []

                # Need stats for summary file
                # Also need to adjust sequence name so binnedContig object can retrieve sequences
                adjusted_sequences = []
                with open(category_fp, 'r') as category_fh:
                    for record in SeqIO.parse(category_fh, 'fasta'):
                        seq = record.seq
                        gc_content.append(SeqUtils.GC(seq))
                        genome_size += len(seq)

                        # This is very dirty, but need to change name to match original contigs
                        record.id = record.id.replace('VIRSorter_',
                                                      '').replace(
                                                          '-circular',
                                                          '').split('-cat_')[0]
                        if 'gene' in record.id:  # Prophage
                            record.id = record.id.split('_gene')[0]
                        record.id = record.id.rsplit('_', 1)[0]

                        # here we make sure that the id's line up with contig ids in the input assembly object
                        if record.id not in assembly_contig_ids:
                            for assembly_contig_id in assembly_contig_ids:
                                # first check if record.id is substring of current contig id,
                                # then check if current contig id is substring of record.id
                                # NOTE: this is not a perfect way of checking and will likely
                                #       fail in some circumstances.
                                #       A more complete check would be to make sure there is a 1:1
                                #       mapping of contig id's in the assembly object as compared to
                                #       the binned contig object (the fasta files defined here).
                                if (record.id in assembly_contig_id) or (
                                        assembly_contig_id in record.id):
                                    record.id = assembly_contig_id
                                    break

                        record.description = ''
                        record.name = ''
                        adjusted_sequences.append(record)

                if genome_size != 0:  # Empty file

                    summary_writer.writerow([
                        dest_fn, '100%', genome_size,
                        (sum(gc_content) / len(gc_content))
                    ])

                    print('Copying {} to results directory'.format(
                        os.path.basename(category_fp)))
                    # Yes, need both. One is to get file_links in report. Second is for binnedContigs object
                    shutil.copyfile(category_fp, dest_fp)

                    # Write renamed sequences
                    with open(binned_contig_fp, 'w') as binned_contig_fh:
                        SeqIO.write(adjusted_sequences, binned_contig_fh,
                                    'fasta')

                    result = self.au.save_assembly_from_fasta({
                        'file': {
                            'path': dest_fp
                        },
                        'workspace_name':
                        params['workspace_name'],
                        'assembly_name':
                        'VirSorter-Category-{}'.format(category)
                    })

                    created_objects.append({
                        "ref":
                        result,
                        "description":
                        "KBase Assembly object from VIRSorter"
                    })

        # Create BinnedContigs object, but 1st, a little metadata
        generate_binned_contig_param = {
            'file_directory': binned_contig_output_dir,
            'assembly_ref':
            params['genomes'],  # params.get('genomes'), self.assembly_ref
            'binned_contig_name': params['binned_contig_name'],
            'workspace_name': params['workspace_name']
        }
        binned_contig_object_ref = self.mgu.file_to_binned_contigs(
            generate_binned_contig_param).get('binned_contig_obj_ref')

        # Add binned contigs reference here, as it was already created above
        created_objects.append({
            "ref": binned_contig_object_ref,
            "description": "BinnedContigs from VIRSorter"
        })

        # Save VIRSorter_affi-contigs.tab for DRAM-v
        affi_contigs_fp = os.path.join(virsorter_outdir, 'Metric_files',
                                       'VIRSorter_affi-contigs.tab')
        affi_contigs_shock_id = self.dfu.file_to_shock(
            {'file_path': affi_contigs_fp})['shock_id']

        # Use global signal (i.e. summary) file and create HTML-formatted version
        raw_html = self._parse_summary(glob_signal, affi_contigs_shock_id)

        html_fp = os.path.join(output_dir, 'index.html')

        with open(html_fp, 'w') as html_fh:
            html_fh.write(raw_html)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': output_dir,
            'pack': 'zip'
        })['shock_id']

        html_report = [{
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(html_fp),
            'label':
            os.path.basename(html_fp),
            'description':
            'HTML summary report for VIRSorter-predicted viral genomes.'
        }]

        report_params = {
            'message':
            'Here are the results from your VIRSorter run. Above, you\'ll find a report with '
            'all the identified (putative) viral genomes, and below, links to the report as '
            'well as files generated.',
            'workspace_name':
            params['workspace_name'],
            'html_links':
            html_report,
            'direct_html_link_index':
            0,
            'report_object_name':
            'VIRSorter_report_{}'.format(str(uuid.uuid4())),
            'file_links':
            output_files,
            'objects_created':
            created_objects,
        }

        kbase_report_client = KBaseReport(params['SDK_CALLBACK_URL'],
                                          token=params['KB_AUTH_TOKEN'])
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref'],
            'result_directory': binned_contig_output_dir,
            'binned_contig_obj_ref': binned_contig_object_ref
        }

        return report_output

    def _mkdir_p(self, path):
        """
        :param path:
        :return:
        """

        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise
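    # Note: on Python 3, roughly the same effect can be had with
    # os.makedirs(path, exist_ok=True); the explicit errno check above is kept
    # to preserve the original error-handling behaviour.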
Example #28
0
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('SetAPI'):
            cls.cfg[nameval[0]] = nameval[1]
        authServiceUrl = cls.cfg.get(
            'auth-service-url',
            "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            'token':
            token,
            'user_id':
            user_id,
            'provenance': [{
                'service': 'SetAPI',
                'method': 'please_never_use_it_in_production',
                'method_params': []
            }],
            'authenticated':
            1
        })
        cls.wsURL = cls.cfg['workspace-url']
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = SetAPI(cls.cfg)

        # setup data at the class level for now (so that the code is run
        # once for all tests, not before each test case.  Not sure how to
        # do that outside this function..)
        suffix = int(time.time() * 1000)
        wsName = "test_SetAPI_" + str(suffix)
        ret = cls.wsClient.create_workspace({'workspace': wsName})
        #        wsName = 'pranjan77:1477441032423'
        cls.wsName = wsName
        # copy test file to scratch area
        fna_filename = "seq.fna"
        fna_path = os.path.join(cls.cfg['scratch'], fna_filename)
        shutil.copy(os.path.join("data", fna_filename), fna_path)

        ru = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])
        ws_obj_name = 'MyNewAssembly'
        cls.assembly1ref = ru.save_assembly_from_fasta({
            'file': {
                'path': fna_path
            },
            'workspace_name':
            wsName,
            'assembly_name':
            'assembly_obj_1'
        })
        cls.assembly2ref = ru.save_assembly_from_fasta({
            'file': {
                'path': fna_path
            },
            'workspace_name':
            wsName,
            'assembly_name':
            'assembly_obj_2'
        })
Example #29
0
    def run_ContigFilter_max(self, ctx, params):
        """
        New app which filters contigs in an assembly using both a minimum and a maximum contig length
        :param params: instance of mapping from String to unspecified object
        :returns: instance of type "ReportResults" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN run_ContigFilter_max
        # Check that the parameters are valid
        for name in [
                'min_length', 'max_length', 'assembly_ref', 'workspace_name'
        ]:
            if name not in params:
                raise ValueError('Parameter "' + name +
                                 '" is required but missing')
        if not isinstance(params['min_length'],
                          int) or (params['min_length'] < 0):
            raise ValueError('Min length must be a non-negative integer')
        if not isinstance(params['max_length'],
                          int) or (params['max_length'] < 0):
            raise ValueError('Max length must be a non-negative integer')
        if not isinstance(params['assembly_ref'], str) or not len(
                params['assembly_ref']):
            raise ValueError('Pass in a valid assembly reference string')

        print(params['min_length'], params['max_length'],
              params['assembly_ref'])
        output = {}

        assembly_util = AssemblyUtil(self.callback_url)
        fasta_file = assembly_util.get_assembly_as_fasta(
            {'ref': params['assembly_ref']})
        print(fasta_file)

        # Parse the downloaded file in FASTA format
        parsed_assembly = SeqIO.parse(fasta_file['path'], 'fasta')
        min_length = params['min_length']
        max_length = params['max_length']

        # Keep a list of contigs whose length is between min_length and max_length
        good_contigs = []
        # total contigs regardless of length
        n_total = 0
        # total contigs within the length range
        n_remaining = 0
        for record in parsed_assembly:
            n_total += 1
            if len(record.seq) >= min_length and len(record.seq) <= max_length:
                good_contigs.append(record)
                n_remaining += 1
        # Create a file to hold the filtered data
        workspace_name = params['workspace_name']
        filtered_path = os.path.join(self.shared_folder, 'filtered.fasta')
        SeqIO.write(good_contigs, filtered_path, 'fasta')
        # Upload the filtered data to the workspace
        new_ref = assembly_util.save_assembly_from_fasta({
            'file': {
                'path': filtered_path
            },
            'workspace_name':
            workspace_name,
            'assembly_name':
            fasta_file['assembly_name']
        })
        # Create an output summary message for the report
        text_message = "".join([
            'Filtered assembly to ',
            str(n_remaining), ' contigs out of ',
            str(n_total)
        ])
        # Data for creating the report, referencing the assembly we uploaded
        report_data = {
            'objects_created': [{
                'ref': new_ref,
                'description': 'Filtered contigs'
            }],
            'text_message':
            text_message
        }
        # Initialize the report
        kbase_report = KBaseReport(self.callback_url)
        report = kbase_report.create({
            'report': report_data,
            'workspace_name': workspace_name
        })
        # Return the report reference and name in our results
        output = {
            'report_ref': report['ref'],
            'report_name': report['name'],
            'n_total': n_total,
            'n_remaining': n_remaining,
            'filtered_assembly_ref': new_ref
        }
        #END run_ContigFilter_max

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method run_ContigFilter_max return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
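
    # A hedged sketch of invoking this method from a test; 'impl', 'ctx' and the
    # parameter values are hypothetical placeholders, not part of this module:
    #
    #   result = impl.run_ContigFilter_max(ctx, {
    #       'workspace_name': 'my_workspace',
    #       'assembly_ref': '12345/6/7',
    #       'min_length': 1000,
    #       'max_length': 100000,
    #   })[0]
    #   # result['n_remaining'] <= result['n_total'], and
    #   # result['filtered_assembly_ref'] points at the newly saved Assembly object.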
Example #30
0
class SPAdesUtils:
    """
    Define the SPAdesUtils functions
    """
    SPADES_VERSION = '3.13.0'
    SPADES_BIN = '/opt/SPAdes-' + SPADES_VERSION + '-Linux/bin'

    DISABLE_SPADES_OUTPUT = False  # should be False in production

    # Basic options
    PARAM_IN_SINGLE_CELL = 'single_cell'  # --sc
    PARAM_IN_METAGENOME = 'metagenomic'  # --meta
    PARAM_IN_PLASMID = 'plasmid'  # --plasmid
    PARAM_IN_RNA = 'rna'  # --rna
    PARAM_IN_IONTORRENT = 'iontorrent'  # --iontorrent

    # Pipeline options
    PARAM_IN_ONLY_ERROR_CORR = 'only-error-correction'  # --only-error-correction
    PARAM_IN_ONLY_ASSEMBLER = 'only-assembler'  # --only-assembler
    PARAM_IN_CAREFUL = 'careful'  # --careful
    PARAM_IN_CONTINUE = 'continue'  # --continue
    PARAM_IN_DISABLE_GZIP = 'disable-gzip-output'  # --disable-gzip-output

    # Input parameters
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_CS_NAME = 'output_contigset_name'
    PARAM_IN_READS = 'reads_libraries'
    PARAM_IN_LONG_READS = 'long_reads_libraries'
    PARAM_IN_KMER_SIZES = 'kmer_sizes'
    PARAM_IN_SKIP_ERR_CORRECT = 'skip_error_correction'
    PARAM_IN_MIN_CONTIG_LENGTH = 'min_contig_length'
    PARAM_IN_DNA_SOURCE = 'dna_source'
    PARAM_IN_PIPELINE_OPTION = 'pipeline_options'
    ASSEMBLE_RESULTS_DIR = 'assemble_results'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    THREADS_PER_CORE = 3
    MAX_THREADS = 64  # per email thread with Anton Korobeynikov
    MAX_THREADS_META = 128  # Increase threads for metagenomic assemblies
    MEMORY_OFFSET_GB = 1  # 1GB
    MIN_MEMORY_GB = 5
    MAX_MEMORY_GB_SPADES = 500
    MAX_MEMORY_GB_META_SPADES = 1000
    GB = 1000000000

    # private method definition
    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token, service_ver='release')
        self.au = AssemblyUtil(self.callback_url, token=self.token, service_ver='release')
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir

        self.spades_version = 'SPAdes-' + os.environ['SPADES_VERSION']

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding
        reads info as deinterleaved fastq files and returns a list of reads data in the
        following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file,  # only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})

        if not obj_ids:
            return []

        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                        'read_libraries': reads_params,
                        'interleaved': 'false'
                        })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None):
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        _mkdir_p(output_directory)
        spades_output = os.path.join(output_directory, 'spades_output.zip')
        self._zip_folder(out_dir, spades_output)

        output_files.append({'path': spades_output,
                             'name': os.path.basename(spades_output),
                             'label': os.path.basename(spades_output),
                             'description': 'Output file(s) generated by {}'.format(
                                 self.spades_version)})

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders could be included in the archive as well
        if the commented portion is used.
        """
        with zipfile.ZipFile(output_path, 'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        log('Starting conversion of FASTA to KBaseGenomeAnnotations.Assembly')
        log('Building Object.')
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if (current_line[0] == '>'):
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, ValueError, KeyError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        else:
            fasta_dict[contig_id] = sequence_len
        return fasta_dict
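
    # Illustration (not from the original source) of what _load_stats returns for a
    # small FASTA file; whitespace inside sequence lines is stripped before counting:
    #
    #   >contig_1 some description
    #   ACGTACGTAC
    #   GGCC
    #   >contig_2
    #   AAAA
    #
    # would yield {'contig_1': 14, 'contig_2': 4}.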

    def _parse_single_reads(self, reads_type, reads_list):
        """
        _parse_single_reads: given the reads_type and a list of reads, return an object
        defining the type and a list of fastq files.
        """
        single_reads_fqs = []
        ret_obj = {}
        if reads_list and isinstance(reads_list, list):
            for rds in reads_list:
                single_reads_fqs.append(rds['fwd_file'])
        if single_reads_fqs:
            ret_obj = {
                "type": reads_type,
                "single reads": single_reads_fqs
            }

        return ret_obj

    def _parse_pair_reads(self, reads_type, reads_list):
        """
        _parse_pair_reads: given the reads_type and a list of reads, return an object
        defining the type and a list of fastq files.
        """
        right_reads_fqs = []
        left_reads_fqs = []
        ret_obj = {}
        if reads_list and isinstance(reads_list, list):
            for rds in reads_list:
                right_reads_fqs.append(rds['fwd_file'])
                if rds.get('rev_file', None):
                    left_reads_fqs.append(rds['rev_file'])
            orent = reads_list[0]['orientation']

        if right_reads_fqs:
            ret_obj["right reads"] = right_reads_fqs
            ret_obj["orientation"] = orent
            ret_obj["type"] = reads_type
        if left_reads_fqs:
            ret_obj["left reads"] = left_reads_fqs

        return ret_obj
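
    # Hedged sketch of what _parse_pair_reads builds for two hypothetical paired-end
    # entries (paths and orientation are placeholders, not real data); note that the
    # forward files feed "right reads" and the reverse files feed "left reads":
    #
    #   reads_list = [
    #       {'fwd_file': '/scratch/lib1_fwd.fq', 'rev_file': '/scratch/lib1_rev.fq',
    #        'orientation': 'fr'},
    #       {'fwd_file': '/scratch/lib2_fwd.fq', 'rev_file': '/scratch/lib2_rev.fq',
    #        'orientation': 'fr'},
    #   ]
    #   self._parse_pair_reads('paired-end', reads_list) == {
    #       'right reads': ['/scratch/lib1_fwd.fq', '/scratch/lib2_fwd.fq'],
    #       'orientation': 'fr',
    #       'type': 'paired-end',
    #       'left reads': ['/scratch/lib1_rev.fq', '/scratch/lib2_rev.fq'],
    #   }
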
    # end of private methods

    # public method definitions

    def check_spades_params(self, params):
        """
        check_spades_params: checks params passed to the run_HybridSPAdes method and sets
        default values
        """
        # log('Start validating run_HybridSPAdes parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_WS))
        if self.INVALID_WS_NAME_RE.search(params[self.PARAM_IN_WS]):
            raise ValueError('Invalid workspace name: {}.'.format(params[self.PARAM_IN_WS]))

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if params.get(self.PARAM_IN_READS, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(self.PARAM_IN_READS))
        if type(params[self.PARAM_IN_READS]) != list:
            raise ValueError('Input reads {} must be a list.'.format(self.PARAM_IN_READS))
        if len(params[self.PARAM_IN_READS]) == 0:
            raise ValueError('Input parameter {} should have at least one reads library.'.format(
                             self.PARAM_IN_READS))

        if self.PARAM_IN_MIN_CONTIG_LENGTH in params:
            if not isinstance(params[self.PARAM_IN_MIN_CONTIG_LENGTH], int):
                raise ValueError('{} must be of type int.'.format(self.PARAM_IN_MIN_CONTIG_LENGTH))

        if not params.get(self.PARAM_IN_KMER_SIZES, None):
            params[self.PARAM_IN_KMER_SIZES] = [21, 33, 55]
        kmer_sstr = ",".join(str(num) for num in params[self.PARAM_IN_KMER_SIZES])
        params[self.PARAM_IN_KMER_SIZES] = kmer_sstr
        print("KMER_SIZES: " + kmer_sstr)

        if params.get(self.PARAM_IN_SKIP_ERR_CORRECT, None):
            print("SKIP ERR CORRECTION: " + str(params[self.PARAM_IN_SKIP_ERR_CORRECT]))

        # check for basic option parameters
        if params.get(self.PARAM_IN_DNA_SOURCE, None):
            dna_src = params[self.PARAM_IN_DNA_SOURCE]
            if dna_src not in [self.PARAM_IN_SINGLE_CELL,
                               self.PARAM_IN_METAGENOME,
                               self.PARAM_IN_PLASMID,
                               self.PARAM_IN_RNA,
                               self.PARAM_IN_IONTORRENT]:
                params[self.PARAM_IN_DNA_SOURCE] = None
        else:
            params[self.PARAM_IN_DNA_SOURCE] = None

        # a list of basic options
        params['basic_options'] = ['-o', self.ASSEMBLE_RESULTS_DIR]
        dna_src = params.get(self.PARAM_IN_DNA_SOURCE)
        if dna_src == self.PARAM_IN_SINGLE_CELL:
            params['basic_options'].append('--sc')
        elif dna_src == self.PARAM_IN_METAGENOME:
            params['basic_options'].append('--meta')
        elif dna_src == self.PARAM_IN_PLASMID:
            params['basic_options'].append('--plasmid')
        elif dna_src == self.PARAM_IN_RNA:
            params['basic_options'].append('--rna')
        elif dna_src == self.PARAM_IN_IONTORRENT:
            params['basic_options'].append('--iontorrent')

        # processing pipeline option parameters
        if params.get(self.PARAM_IN_PIPELINE_OPTION, None):
            pipe_opts = params[self.PARAM_IN_PIPELINE_OPTION]
            opts = [self.PARAM_IN_ONLY_ERROR_CORR,
                    self.PARAM_IN_ONLY_ASSEMBLER,
                    self.PARAM_IN_CONTINUE,
                    self.PARAM_IN_DISABLE_GZIP,
                    self.PARAM_IN_CAREFUL]
            if any(elem in opts for elem in pipe_opts):
                pass
            else:
                params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL]
        else:
            params[self.PARAM_IN_PIPELINE_OPTION] = [self.PARAM_IN_CAREFUL]

        if '--meta' in params['basic_options']:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode!
            for opt in (self.PARAM_IN_CAREFUL, 'mismatch-correction', 'cov-cutoff'):
                try:
                    params[self.PARAM_IN_PIPELINE_OPTION].remove(opt)
                except ValueError:
                    pass

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params
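
    # A hedged example (workspace and object names are placeholders) of the defaults
    # check_spades_params fills in for a metagenome run:
    #
    #   params = {'workspace_name': 'my_ws',
    #             'output_contigset_name': 'my_contigs',
    #             'reads_libraries': [{'lib_ref': '1/2/3', 'lib_type': 'paired-end'}],
    #             'dna_source': 'metagenomic'}
    #   checked = self.check_spades_params(params)
    #   # checked['kmer_sizes'] == '21,33,55'   (default k-mers joined into a string)
    #   # checked['basic_options'] == ['-o', 'assemble_results', '--meta']
    #   # checked['pipeline_options'] == []     ('careful' is stripped in --meta mode)
    #   # checked['create_report'] == 0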

    def generate_report(self, fa_file_name, params, out_dir, wsname):
        """
        Generating and saving report
        """
        log('Generating and saving report')

        fa_file_with_path = os.path.join(out_dir, fa_file_name)
        fasta_stats = self._load_stats(fa_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = wsname + '/' + params[self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'SPAdes results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
        report_text += 'Contig Length Distribution (# of contigs -- min to max ' + 'basepairs):\n'
        for c in range(bins):
            report_text += ('   ' + str(counts[c]) + '\t--\t' + str(edges[c]) + ' to ' +
                            str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST(
            {'files': [{'path': fa_file_with_path, 'label': params[self.PARAM_IN_CS_NAME]}]})

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report(
            {'message': report_text,
             'objects_created': [{'ref': assembly_ref, 'description': 'Assembled contigs'}],
             'direct_html_link_index': 0,
             'file_links': output_files,
             'html_links': [{'shock_id': quastret['shock_id'],
                             'name': 'report.html',
                             'label': 'QUAST report'}
                            ],
             'report_object_name': 'kb_spades_report_' + str(uuid.uuid4()),
             'workspace_name': params[self.PARAM_IN_WS]})

        return report_output['name'], report_output['ref']
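
    # Hedged illustration (values are made up) of the distribution lines this builds:
    # np.histogram(lengths, 10) returns per-bin counts and 11 bin edges, so for
    # lengths = [1200, 1500, 5000, 9800] the first report line would read roughly
    # "   2\t--\t1200.0 to 2060.0 bp".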

    def get_hybrid_reads_info(self, input_params):
        """
        get_hybrid_reads_info--from a list of ReadsParams structures, fetches the corresponding
        reads info using ReadsParams['lib_ref'] and
        returns an empty tuple, or a tuple of nine lists of reads data; each list entry
        has one of the following structures:
        {
                'fwd_file': path_to_fastq_file,
                'orientation': (default value is "fr" (forward-reverse) for paired-end libraries
                                "rf" (reverse-forward) for mate-pair libraries), None for others
                'lib_type': ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio",
                              "nanopore", "sanger", "trusted-contigs", "untrusted-contigs"),
                'type': reads_type, # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file  # only if paired end
        }
        OR:
        {
                'fwd_file': path_to_fastq_file,
                'long_reads_type': ("pacbio-ccs", "pacbio-clr", "nanopore", "sanger",
                                    "trusted-contigs", "untrusted-contigs"),
                'type': reads_type, # 'interleaved', 'paired', or 'single'
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience
        }
        """
        rds_params = copy.deepcopy(input_params)
        if rds_params.get(self.PARAM_IN_READS, None) is None:
            return ()  # an empty tuple

        wsname = rds_params[self.PARAM_IN_WS]

        sgl_rds_data = []  # single
        pe_rds_data = []   # paired-end
        mp_rds_data = []   # mate-pairs
        pb_ccs_data = []   # pacbio-ccs
        pb_clr_data = []   # pacbio-clr
        np_rds_data = []   # nanopore
        sgr_rds_data = []  # sanger
        tr_ctg_data = []   # trusted-contigs
        ut_ctg_data = []   # untrusted-contigs

        # a list of Illumina or IonTorrent paired-end/high-quality mate-pairs/unpaired reads
        rds_refs = []

        rds_libs = rds_params[self.PARAM_IN_READS]
        for rds_lib in rds_libs:
            if rds_lib.get('lib_ref', None):
                rds_refs.append(rds_lib['lib_ref'])
        kb_rds_data = self._get_kbreads_info(wsname, rds_refs)

        for rds_lib in rds_libs:
            for kb_d in kb_rds_data:
                if 'lib_ref' in rds_lib and rds_lib['lib_ref'] == kb_d['reads_ref']:
                    if rds_lib['lib_type'] == 'single':  # single end reads grouped params
                        kb_d['orientation'] = None
                        kb_d['lib_type'] = 'single'
                        sgl_rds_data.append(kb_d)
                    elif rds_lib['lib_type'] == 'paired-end':  # pairedEnd reads grouped params
                        kb_d['orientation'] = ('fr' if rds_lib.get('orientation', None) is None
                                               else rds_lib['orientation'])
                        kb_d['lib_type'] = 'paired-end'
                        pe_rds_data.append(kb_d)
                    elif rds_lib['lib_type'] == 'mate-pairs':
                        # mate-pairs reads grouped params
                        kb_d['orientation'] = ('rf' if rds_lib.get('orientation', None) is None
                                               else rds_lib['orientation'])
                        kb_d['lib_type'] = 'mate-pairs'
                        mp_rds_data.append(kb_d)

        # a list of PacBio (CCS or CLR), Oxford Nanopore, Sanger reads
        # and/or additional contigs
        long_rds_refs = []
        if rds_params.get(self.PARAM_IN_LONG_READS, None):
            long_rds_libs = rds_params[self.PARAM_IN_LONG_READS]
            for lrds_lib in long_rds_libs:
                if lrds_lib.get('long_reads_ref', None):
                    long_rds_refs.append(lrds_lib['long_reads_ref'])
            kb_lrds_data = self._get_kbreads_info(wsname, long_rds_refs)

            for lrds_lib in long_rds_libs:
                for kb_ld in kb_lrds_data:
                    if ('long_reads_ref' in lrds_lib and
                            lrds_lib['long_reads_ref'] == kb_ld['reads_ref']):
                        if lrds_lib['long_reads_type'] == 'pacbio-ccs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            pb_ccs_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'pacbio-clr':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            pb_clr_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'nanopore':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            np_rds_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'sanger':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            sgr_rds_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'trusted-contigs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            tr_ctg_data.append(kb_ld)
                        elif lrds_lib['long_reads_type'] == 'untrusted-contigs':
                            kb_ld['long_reads_type'] = lrds_lib['long_reads_type']
                            ut_ctg_data.append(kb_ld)

        return (sgl_rds_data, pe_rds_data, mp_rds_data, pb_ccs_data, pb_clr_data, np_rds_data,
                sgr_rds_data, tr_ctg_data, ut_ctg_data)

    def construct_yaml_dataset_file(self, sgl_libs=None, pe_libs=None, mp_libs=None,
                                    pb_ccs=None, pb_clr=None, np_libs=None,
                                    sgr_libs=None, tr_ctgs=None, ut_ctgs=None):
        """
        construct_yaml_dataset_file: Specifying input data with YAML data set file (advanced)
        An alternative way to specify an input data set for SPAdes is to create a YAML
        data set file.
        By using a YAML file you can provide an unlimited number of paired-end, mate-pair
        and unpaired libraries. Basically, a YAML data set file is a text file in which input
        libraries are provided as a comma-separated list in square brackets. Each library is
        provided in braces as a comma-separated list of attributes.

        The following attributes are available:

            - orientation ("fr", "rf", "ff")
            - type ("paired-end", "mate-pairs", "hq-mate-pairs", "single", "pacbio", "nanopore",
                "sanger", "trusted-contigs", "untrusted-contigs")
            - interlaced reads (comma-separated list of files with interlaced reads)
            - left reads (comma-separated list of files with left reads)
            - right reads (comma-separated list of files with right reads)
            - single reads (comma-separated list of files with single reads or unpaired reads from
                paired library)
            - merged reads (comma-separated list of files with merged reads)

        To properly specify a library you should provide its type and at least one file with reads.
        For ONT, PacBio, Sanger and contig libraries you can provide only single reads. Orientation
        is an optional attribute. Its default value is "fr" (forward-reverse) for paired-end
        libraries and "rf" (reverse-forward) for mate-pair libraries.

        The value for each attribute is given after a colon. Comma-separated lists of files should
        be given in square brackets.
        For each file you should provide its full path in double quotes. Make sure that files with
        right reads are given in the same order as corresponding files with left reads.

        For example, if you have one paired-end library split into two pairs of files:
            lib_pe1_left_1.fastq
            lib_pe1_right_1.fastq
            lib_pe1_left_2.fastq
            lib_pe1_right_2.fastq

        one mate-pair library:
            lib_mp1_left.fastq
            lib_mp1_right.fastq

        and PacBio CCS and CLR reads:
            pacbio_ccs.fastq
            pacbio_clr.fastq

        YAML file should look like this:
        ------------------------------------------------
        [
            {
                orientation: "fr",
                type: "paired-end",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_right_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_right_2.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_pe1_left_1.fastq",
                "/FULL_PATH_TO_DATASET/lib_pe1_left_2.fastq"
                ]
            },
            {
                orientation: "rf",
                type: "mate-pairs",
                right reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_right.fastq"
                ],
                left reads: [
                "/FULL_PATH_TO_DATASET/lib_mp1_left.fastq"
                ]
            },
            {
                type: "single",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_ccs.fastq"
                ]
            },
            {
                type: "pacbio",
                single reads: [
                "/FULL_PATH_TO_DATASET/pacbio_clr.fastq"
                ]
            }
        ]
        ------------------------------------------------

        Once you have created a YAML file save it with .yaml extension (e.g. as my_data_set.yaml)
        and run SPAdes using the --dataset option:
        e.g., <SPAdes_bin_dir>/spades.py --dataset <your YAML file> -o spades_output

        """
        # STEP 1: get the working folder housing the .yaml file and the SPAdes results
        if not os.path.exists(self.proj_dir):
            os.makedirs(self.proj_dir)
        yaml_file_path = os.path.join(self.proj_dir, 'input_data_set.yaml')

        # STEP 2: construct and save the 'input_data_set.yaml' file
        # generate the object array
        input_data_set = []

        if pe_libs:
            pair_libs = self._parse_pair_reads('paired-end', pe_libs)
            if pair_libs:
                input_data_set.append(pair_libs)

        if mp_libs:
            pair_libs = self._parse_pair_reads('mate-pairs', mp_libs)
            if pair_libs:
                input_data_set.append(pair_libs)

        # for reads_type = 'single'
        if sgl_libs:
            single_libs = self._parse_single_reads("single", sgl_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'pacbio-ccs', treated as type of 'single'
        if pb_ccs:
            single_libs = self._parse_single_reads("single", pb_ccs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'pacbio-clr'
        if pb_clr:
            single_libs = self._parse_single_reads("pacbio", pb_clr)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'nanopore'
        if np_libs:
            single_libs = self._parse_single_reads("nanopore", np_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'sanger'
        if sgr_libs:
            single_libs = self._parse_single_reads("sanger", sgr_libs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'trusted-contigs'
        if tr_ctgs:
            single_libs = self._parse_single_reads("trusted-contigs", tr_ctgs)
            if single_libs:
                input_data_set.append(single_libs)

        # for long_reads_type = 'untrusted-contigs'
        if ut_ctgs:
            single_libs = self._parse_single_reads("untrusted-contigs", ut_ctgs)
            if single_libs:
                input_data_set.append(single_libs)

        if input_data_set == []:
            print('Empty input data set!!')
            return ''

        pprint(input_data_set)
        try:
            with open(yaml_file_path, 'w') as yaml_file:
                json.dump(input_data_set, yaml_file)
        except IOError as ioerr:
            log('Creation of the {} file raised error:\n'.format(yaml_file_path))
            pprint(ioerr)
            return ''
        else:
            return yaml_file_path
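
    # Hedged sketch: the json.dump above writes JSON, which is also valid YAML flow
    # syntax.  For one hypothetical paired-end library plus one nanopore library the
    # resulting input_data_set.yaml would look roughly like:
    #
    #   [{"right reads": ["/scratch/pe_fwd.fq"], "orientation": "fr",
    #     "type": "paired-end", "left reads": ["/scratch/pe_rev.fq"]},
    #    {"type": "nanopore", "single reads": ["/scratch/ont.fq"]}]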

    def run_assemble(self, yaml_file, kmer_sizes, dna_source=None,
                     basic_opts=None, pipeline_opts=['careful']):
        """
        run_assemble: run the SPAdes assemble with given input parameters/options
        """
        exit_code = 1
        if not os.path.isfile(yaml_file):
            log("The input data set yaml file DOES NOT exist at {}\n".format(yaml_file))
            return exit_code

        log("The input data set yaml file exists at {}\n".format(yaml_file))
        yf_dir, yf_nm = os.path.split(yaml_file)

        # use integer GB: spades.py expects a whole number for --memory
        mem = (psutil.virtual_memory().available // self.GB - self.MEMORY_OFFSET_GB)
        if mem < self.MIN_MEMORY_GB:
            raise ValueError(
                'Only ' + str(psutil.virtual_memory().available) +
                ' bytes of memory are available. The SPAdes wrapper will' +
                ' not run without at least ' +
                str(self.MIN_MEMORY_GB + self.MEMORY_OFFSET_GB) +
                ' gigabytes available')

        if dna_source and dna_source == self.PARAM_IN_METAGENOME:
            max_mem = self.MAX_MEMORY_GB_META_SPADES
            max_threads = self.MAX_THREADS_META
        else:
            max_mem = self.MAX_MEMORY_GB_SPADES
            max_threads = self.MAX_THREADS

        threads = min(max_threads, psutil.cpu_count() * self.THREADS_PER_CORE)

        if mem > max_mem:
            mem = max_mem

        tmpdir = os.path.join(self.proj_dir, 'spades_tmp_dir')
        if not os.path.exists(tmpdir):
            os.makedirs(tmpdir)

        a_cmd = [os.path.join(self.SPADES_BIN, 'spades.py')]
        a_cmd += ['--threads', str(threads), '--memory', str(mem)]
        a_cmd += ['--tmp-dir', tmpdir]
        a_cmd += ['--dataset', yaml_file]

        if kmer_sizes is not None:
            a_cmd += ['-k', kmer_sizes]

        if basic_opts is None:
            basic_opts = ['-o', self.ASSEMBLE_RESULTS_DIR]
        if isinstance(basic_opts, list):
            a_cmd += basic_opts

        if pipeline_opts and isinstance(pipeline_opts, list):
            for p_opt in pipeline_opts:
                if p_opt == self.PARAM_IN_CAREFUL:
                    a_cmd += ['--careful']
                if p_opt == self.PARAM_IN_ONLY_ERROR_CORR:
                    a_cmd += ['--only-error-correction']
                if p_opt == self.PARAM_IN_ONLY_ASSEMBLER:
                    a_cmd += ['--only-assembler']
                if p_opt == self.PARAM_IN_CONTINUE:
                    a_cmd += ['--continue']
                if p_opt == self.PARAM_IN_DISABLE_GZIP:
                    a_cmd += ['--disable-gzip-output']

        # Last check of command options before the call
        if '--meta' in a_cmd:
            # you cannot specify --careful, --mismatch-correction
            # or --cov-cutoff in metagenomic mode!
            for opt in ('--careful', '--mismatch-correction', '--cov-cutoff'):
                try:
                    a_cmd.remove(opt)
                except ValueError:
                    pass

        log("**************The HybridSPAdes assembling command is:\n{}".format(' '.join(a_cmd)))
        assemble_out_dir = os.path.join(self.proj_dir, self.ASSEMBLE_RESULTS_DIR)
        if not os.path.exists(assemble_out_dir):
            os.makedirs(assemble_out_dir)

        p = subprocess.Popen(a_cmd, cwd=yf_dir, shell=False)
        exit_code = p.wait()
        log('Return code: ' + str(exit_code))

        if p.returncode != 0:
            raise ValueError('Error running spades.py, return code: ' + str(p.returncode) + '\n')
        else:
            exit_code = p.returncode
        return exit_code
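
    # For illustration only (paths and numbers are placeholders; real values depend
    # on the host): with kmer_sizes='21,33,55' and the defaults above, the assembled
    # command looks roughly like
    #
    #   <SPADES_BIN>/spades.py --threads <N> --memory <GB> \
    #       --tmp-dir <proj_dir>/spades_tmp_dir --dataset <yaml_file> \
    #       -k 21,33,55 -o assemble_results --careful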

    def save_assembly(self, fa_file_path, wsname, a_name, min_ctg_length=0):
        """
        save_assembly: save the assembly to KBase workspace
        """
        if os.path.isfile(fa_file_path):
            log('Uploading FASTA file to Assembly...')
            if min_ctg_length > 0:
                self.au.save_assembly_from_fasta(
                            {'file': {'path': fa_file_path},
                             'workspace_name': wsname,
                             'assembly_name': a_name,
                             'min_contig_length': min_ctg_length})
            else:
                self.au.save_assembly_from_fasta(
                            {'file': {'path': fa_file_path},
                             'workspace_name': wsname,
                             'assembly_name': a_name})
        else:
            log("The resulting sequence file {} is not found.".format(fa_file_path))
Example #31
0
class masurca_utils:
    """
    masurca_utils: defining a system of utils for running masurca
    """
    MaSuRCA_VERSION = 'MaSuRCA-3.2.9'
    MaSuRCA_BIN = '/kb/module/' + MaSuRCA_VERSION + '/bin/masurca'
    PARAM_IN_WS = 'workspace_name'
    PARAM_IN_THREADN = 'num_threads'
    PARAM_IN_READS_LIBS = 'reads_libraries'
    PARAM_IN_JUMP_LIBS = 'jump_libraries'
    PARAM_IN_JF_SIZE = 'jf_size'
    PARAM_IN_CS_NAME = 'output_contigset_name'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')

    def __init__(self, prj_dir, config):
        self.workspace_url = config['workspace-url']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        if 'shock-url' in config:
            self.shock_url = config['shock-url']
        if 'handle-service-url' in config:
            self.handle_url = config['handle-service-url']

        self.ws_client = Workspace(self.workspace_url, token=self.token)
        self.ru = ReadsUtils(self.callback_url, token=self.token)
        self.au = AssemblyUtil(self.callback_url, token=self.token)
        self.kbr = KBaseReport(self.callback_url)
        self.kbq = kb_quast(self.callback_url)
        self.proj_dir = prj_dir
        self.prog_runner = Program_Runner(self.MaSuRCA_BIN, self.proj_dir)

    def _has_long_reads(self, params):
        """
        _has_long_reads: check if a long reads input exists in the parameters
        """
        return (params.get('pacbio_reads', None)
                or params.get('nanopore_reads', None)
                or params.get('other_frg_file', None))

    def _get_data_portion(self,
                          pe_reads_data,
                          jp_reads_data=None,
                          pacbio_reads_file='',
                          nanopore_reads_file='',
                          other_frg_file=''):
        """
        _get_data_portion: build the 'DATA...END' portion for the config.txt file
        """
        data_str = ''
        if pe_reads_data:
            # log('PE reads data details:\n{}'.format(json.dumps(pe_reads_data, indent=1)))
            for pe in pe_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'PE= ' + pe['pe_prefix'] + ' ' + str(pe['pe_mean']) + ' ' + \
                            str(pe['pe_stdev']) + ' ' + pe['fwd_file']
                if pe.get('rev_file', None):
                    data_str += ' ' + pe['rev_file']

        if jp_reads_data:
            # log('JUMP reads data details:\n{}'.format(json.dumps(jp_reads_data, indent=1)))
            for jp in jp_reads_data:
                if data_str != '':
                    data_str += '\n'
                data_str += 'JUMP= ' + jp['jp_prefix'] + ' ' + str(jp['jp_mean']) + ' ' + \
                            str(jp['jp_stdev']) + ' ' + jp['fwd_file']
                if jp.get('rev_file', None):
                    data_str += ' ' + jp['rev_file']

        # Adding the pacbio_reads
        # Note that pcbio reads must be in a single fasta file!
        # For example:
        # data_str +='\nPACBIO= /pool/genomics/frandsenp/masurca/PacBio/pacbio_reads.fasta'
        # ***if you have both types of reads supply them both as NANOPORE type***
        if pacbio_reads_file != '':
            if data_str != '':
                data_str += '\n'
            if nanopore_reads_file != '':
                data_str += 'NANOPORE=' + pacbio_reads_file
            else:
                data_str += 'PACBIO=' + pacbio_reads_file

        # Adding the nanopore_reads and note that nanopore reads must be in a single fasta file!
        # For example:
        # data_str +='\nNANOPORE= /pool/genomics/frandsenp/masurca/NanoPore/nanopore_reads.fasta'
        if nanopore_reads_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'NANOPORE= ' + nanopore_reads_file

        # Adding the other_frg_file inputs if any
        # any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first converted into
        # Celera Assembler compatible .frg file
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        if other_frg_file != '':
            if data_str != '':
                data_str += '\n'
            data_str += 'OTHER=' + other_frg_file

        return data_str
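
    # Hedged example of a DATA section this helper could emit for one paired-end
    # library plus a nanopore FASTA (paths, prefixes and numbers are placeholders):
    #
    #   PE= p1 500 50 /scratch/pe_fwd.fq /scratch/pe_rev.fq
    #   NANOPORE= /scratch/nanopore_reads.fasta
    #
    # The caller presumably splices this text between the DATA and END lines of the
    # MaSuRCA config.txt via _replaceSectionText below.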

    def _get_parameters_portion(self, params):
        """
        build the 'PARAMETERS...END' portion for the config.txt file
        """
        # set the default parameters as suggested in the example configuration file
        param_str = (
            "EXTEND_JUMP_READS=0\nUSE_GRID=0\nGRID_QUEUE=all.q\nGRID_BATCH_SIZE"
            + "=300000000\nLHE_COVERAGE=25\nMEGA_READS_ONE_PASS=0")
        if (params.get('graph_kmer_size', None)
                and type(params['graph_kmer_size']) == int):
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=' + str(params['graph_kmer_size'])
        else:
            if param_str != '':
                param_str += '\n'
            param_str += 'GRAPH_KMER_SIZE=auto'
        if params.get('use_linking_mates', None):
            if param_str != '':
                param_str += '\n'
            if params['use_linking_mates'] == 1 and not self._has_long_reads(
                    params):
                param_str += 'USE_LINKING_MATES=1'
            else:
                param_str += 'USE_LINKING_MATES=0'
        if params.get('limit_jump_coverage', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'LIMIT_JUMP_COVERAGE = ' + str(
                params['limit_jump_coverage'])
        if params.get('cgwErrorRate', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'CA_PARAMETERS = cgwErrorRate=' + str(
                params['cgwErrorRate'])
        if params.get(self.PARAM_IN_THREADN, None):
            if param_str != '':
                param_str += '\n'
            param_str += 'NUM_THREADS = ' + str(params[self.PARAM_IN_THREADN])
        if params.get('jf_size', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'JF_SIZE=' + str(params['jf_size'])
        if params.get('kmer_count_threshold', None):
            if param_str != '':
                param_str += '\n'
            param_str += 'KMER_COUNT_THRESHOLD=' + str(
                params['kmer_count_threshold'])
        if params.get('do_homopolymer_trim', None):
            if param_str != '':
                param_str += '\n'
            if params['do_homopolymer_trim'] == 1:
                param_str += 'DO_HOMOPOLYMER_TRIM=1'
            else:
                param_str += 'DO_HOMOPOLYMER_TRIM=0'
        if params.get('close_gaps', None):
            if param_str != '':
                param_str += '\n'
            if params['close_gaps'] == 1:
                param_str += 'CLOSE_GAPS=1'
            else:
                param_str += 'CLOSE_GAPS=0'
        if params.get('soap_assembly', None):
            if param_str != '':
                param_str += '\n'
            if params['soap_assembly'] == 1:
                param_str += 'SOAP_ASSEMBLY=1'
            else:
                param_str += 'SOAP_ASSEMBLY=0'
        return param_str
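
    # Hedged sketch of the PARAMETERS text built for a hypothetical params dict
    # {'graph_kmer_size': 99, 'num_threads': 16, 'jf_size': 100000000}:
    #
    #   EXTEND_JUMP_READS=0
    #   USE_GRID=0
    #   GRID_QUEUE=all.q
    #   GRID_BATCH_SIZE=300000000
    #   LHE_COVERAGE=25
    #   MEGA_READS_ONE_PASS=0
    #   GRAPH_KMER_SIZE=99
    #   NUM_THREADS = 16
    #   JF_SIZE=100000000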

    def _replaceSectionText(self, orig_txt, begin_patn, end_patn, repl_txt):
        """
        replace a section of text of orig_txt between lines begin-patn and end-patn with repl_text
        examples of parameters:
            begin_patn1 = "DATA\n"
            begin_patn2 = "PARAMETERS\n"
            end_patn1 = "END\nPARAMETERS\n"
            end_patn2 = "END\n"
            repl_txt1 = ('PE= pe 500 50 /kb/module/work/testReads/small.forward.fq' +
                          ' /kb/module/work/testReads/small.reverse.fq\n')
            repl_txt2 = ('GRAPH_KMER_SIZE=auto\nUSE_LINKING_MATES=1\nLIMIT_JUMP_COVERAGE = 60\n' +
                         'CA_PARAMETERS = cgwErrorRate=0.15\nNUM_THREADS= 64\nJF_SIZE=100000000\n' +
                         'DO_HOMOPOLYMER_TRIM=0\n')
        """
        if repl_txt != '':
            # create regular expression pattern
            repl = re.compile(begin_patn + '.*?' + end_patn, re.DOTALL)
            repl_txt = begin_patn + repl_txt + '\n' + end_patn
            # replace the text between begin_patn and end_patn with repl_txt
            txt_replaced = repl.sub(repl_txt, orig_txt)
            # pprint(txt_replaced)
            return txt_replaced
        else:
            return orig_txt
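
    # Minimal usage sketch (the variables are illustrative, not from this module):
    #
    #   new_cfg = self._replaceSectionText(orig_cfg, 'DATA\n', 'END\n',
    #                                      self._get_data_portion(pe_reads_data))
    #   # the text between the 'DATA' and 'END' lines of orig_cfg is replaced with
    #   # the freshly built data portion; the rest of the text is returned unchanged.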

    def _unique_prefix_check(self, pfix, refs):
        prefix_lookup = {}
        for ref in refs:
            pre = ref[pfix][0:2]
            if pre not in prefix_lookup:
                prefix_lookup[pre] = 1
            else:
                raise ValueError('The first two characters in \'' + ref[pfix] +
                                 '\' have already been used.')

    def _get_pereads_info(self, input_params):
        """
        _get_pereads_info--from a list of paired_readsParams structures, fetches the
        corresponding reads info using paired_readsParams['pe_id'] and
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'pe_prefix': the two-letter prefix for the reads library,
                'pe_mean': the average insert size for the reads library,
                'pe_stdev': the insert size standard deviation for the reads library,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file,  # only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # reads_libraries grouped params
        if rds_params.get(self.PARAM_IN_READS_LIBS, None):
            pe_reads_libs = rds_params[self.PARAM_IN_READS_LIBS]

            for pe_lib in pe_reads_libs:
                if pe_lib.get('pe_id', None):
                    rds_refs.append(pe_lib['pe_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for pe_lib in pe_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'pe_id' in pe_lib and pe_lib['pe_id'] == rds[
                            'reads_ref']:
                        if pe_lib.get('pe_prefix', None):
                            rds['pe_prefix'] = pe_lib['pe_prefix'][0]
                        else:
                            rds['pe_prefix'] = 'p'
                        rds['pe_prefix'] += str(i)
                        pe_lib['pe_prefix'] = rds['pe_prefix']

                        if pe_lib.get('pe_mean', None) is None:
                            pe_lib['pe_mean'] = 500
                        rds['pe_mean'] = pe_lib['pe_mean']

                        if pe_lib.get('pe_stdev', None) is None:
                            pe_lib['pe_stdev'] = 50
                        rds['pe_stdev'] = pe_lib['pe_stdev']

            self._unique_prefix_check('pe_prefix', pe_reads_libs)
        else:
            raise ValueError("Parameter {} is required.".format(
                self.PARAM_IN_READS_LIBS))
        return rds_data

    def _get_jpreads_info(self, input_params):
        """
        _get_jpreads_info--from a list of jump_readsParams structures, fetches the corresponding
        reads info using jump_readsParams['jp_id'] and
        returns a list of reads data in the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'jp_prefix': the two-letter prefix for the reads library,
                'jp_mean': the average insert size for the reads library,
                'jp_stdev': the insert size standard deviation for the reads library,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file,  # only if paired end
        }
        """
        rds_params = copy.deepcopy(input_params)
        wsname = rds_params[self.PARAM_IN_WS]
        rds_refs = []
        rds_data = []

        # jump_libraries grouped params
        if rds_params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_libs = rds_params[self.PARAM_IN_JUMP_LIBS]
            for jp_lib in jp_reads_libs:
                if jp_lib.get('jp_id', None):
                    rds_refs.append(jp_lib['jp_id'])
            rds_data = self._get_kbreads_info(wsname, rds_refs)

            for jp_lib in jp_reads_libs:
                i = 0
                for rds in rds_data:
                    i += 1
                    if 'jp_id' in jp_lib and jp_lib['jp_id'] == rds[
                            'reads_ref']:
                        if jp_lib.get('jp_prefix', None):
                            rds['jp_prefix'] = jp_lib['jp_prefix'][0]
                        else:
                            rds['jp_prefix'] = 's'
                        rds['jp_prefix'] += str(i)
                        jp_lib['jp_prefix'] = rds['jp_prefix']

                        if jp_lib.get('jp_mean', None) is None:
                            jp_lib['jp_mean'] = 3600
                        rds['jp_mean'] = jp_lib['jp_mean']

                        if jp_lib.get('jp_stdev', None) is None:
                            jp_lib['jp_stdev'] = 200
                        rds['jp_stdev'] = jp_lib['jp_stdev']

            self._unique_prefix_check('jp_prefix', jp_reads_libs)
        return rds_data

    def _get_kbreads_info(self, wsname, reads_refs):
        """
        _get_kbreads_info--from a set of given KBase reads refs, fetches the corresponding
        reads info as deinterleaved fastq files and returns a list of reads data in
        the following structure:
        reads_data = {
                'fwd_file': path_to_fastq_file,
                'type': reads_type,  # ('interleaved', 'paired', or 'single')
                'seq_tech': sequencing_tech,
                'reads_ref': KBase object ref for downstream convenience,
                'reads_name': KBase object name for downstream convenience,
                'rev_file': path_to_fastq_file,  # only if paired end
        }
        """
        obj_ids = []
        for r in reads_refs:
            if r:
                obj_ids.append({'ref': r if '/' in r else (wsname + '/' + r)})

        if not obj_ids:
            return []

        ws_info = self.ws_client.get_object_info_new({'objects': obj_ids})
        reads_params = []

        reftoname = {}
        for wsi, oid in zip(ws_info, obj_ids):
            ref = oid['ref']
            reads_params.append(ref)
            obj_name = wsi[1]
            reftoname[ref] = wsi[7] + '/' + obj_name

        typeerr = ('Supported types: KBaseFile.SingleEndLibrary ' +
                   'KBaseFile.PairedEndLibrary ' +
                   'KBaseAssembly.SingleEndLibrary ' +
                   'KBaseAssembly.PairedEndLibrary')
        try:
            reads = self.ru.download_reads({
                'read_libraries': reads_params,
                'interleaved': 'false'
            })['files']
        except ServerError as se:
            log('logging stacktrace from dynamic client error')
            log(se.data)
            if typeerr in se.message:
                prefix = se.message.split('.')[0]
                raise ValueError(
                    prefix + '. Only the types ' +
                    'KBaseAssembly.SingleEndLibrary ' +
                    'KBaseAssembly.PairedEndLibrary ' +
                    'KBaseFile.SingleEndLibrary ' +
                    'and KBaseFile.PairedEndLibrary are supported')
            else:
                raise

        # log('Downloaded reads data from KBase:\n' + pformat(reads))
        reads_data = []
        for ref in reads_refs:
            reads_name = reftoname[ref]
            f = reads[ref]['files']
            seq_tech = reads[ref]['sequencing_tech']
            rds_info = {
                'fwd_file': f['fwd'],
                'reads_ref': ref,
                'type': f['type'],
                'seq_tech': seq_tech,
                'reads_name': reads_name
            }
            if f.get('rev', None) is not None:
                rds_info['rev_file'] = f['rev']
            reads_data.append(rds_info)

        return reads_data
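
    # For orientation, a sketch of what _get_kbreads_info() returns for one paired-end
    # and one single-end library (all refs, names and paths below are hypothetical):
    # [
    #     {'fwd_file': '/kb/module/work/tmp/lib1.fwd.fq',
    #      'rev_file': '/kb/module/work/tmp/lib1.rev.fq',
    #      'type': 'paired', 'seq_tech': 'Illumina', 'reads_ref': '12345/6/7',
    #      'reads_name': 'my_workspace/lib1_reads'},
    #     {'fwd_file': '/kb/module/work/tmp/lib2.fq',
    #      'type': 'single', 'seq_tech': 'Illumina', 'reads_ref': '12345/8/1',
    #      'reads_name': 'my_workspace/lib2_reads'},
    # ]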

    def _generate_output_file_list(self, out_dir):
        """
        _generate_output_file_list: zip result files and generate file_links for report
        """
        log('start packing result files')

        output_files = list()

        output_directory = os.path.join(self.proj_dir, str(uuid.uuid4()))
        mkdir_p(output_directory)
        masurca_output = os.path.join(output_directory, 'masurca_output.zip')
        self._zip_folder(out_dir, masurca_output)

        output_files.append({
            'path': masurca_output,
            'name': os.path.basename(masurca_output),
            'label': os.path.basename(masurca_output),
            'description': 'Output file(s) generated by MaSuRCA'
        })

        return output_files

    def _zip_folder(self, folder_path, output_path):
        """
        _zip_folder: Zip the contents of an entire folder (with that folder included
        in the archive). Empty subfolders could be included in the archive as well
        if the commented portion is used.
        """
        with zipfile.ZipFile(output_path,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as ziph:
            for root, folders, files in os.walk(folder_path):
                for f in files:
                    absolute_path = os.path.join(root, f)
                    relative_path = os.path.join(os.path.basename(root), f)
                    # print "Adding {} to archive.".format(absolute_path)
                    ziph.write(absolute_path, relative_path)

        print("{} created successfully.".format(output_path))
        # with zipfile.ZipFile(output_path, "r") as f:
        #    print 'Checking the zipped file......\n'
        #    for info in f.infolist():
        #        print info.filename, info.date_time, info.file_size, info.compress_size

    def _load_stats(self, input_file_name):
        """
        _load_stats: parse a FASTA file and return a dict mapping each contig id to
        its sequence length.
        """
        log('Parsing FASTA file {} to collect contig length statistics'.format(
            input_file_name))
        if not os.path.isfile(input_file_name):
            raise Exception('The input file name {0} is not a file!'.format(
                input_file_name))
        with open(input_file_name, 'r') as input_file_handle:
            contig_id = None
            sequence_len = 0
            fasta_dict = dict()
            first_header_found = False
            # Pattern for replacing white space
            pattern = re.compile(r'\s+')
            for current_line in input_file_handle:
                if current_line[0] == '>':
                    # found a header line
                    # Wrap up previous fasta sequence
                    if not first_header_found:
                        first_header_found = True
                    else:
                        fasta_dict[contig_id] = sequence_len
                        sequence_len = 0
                    fasta_header = current_line.replace('>', '').strip()
                    try:
                        contig_id = fasta_header.strip().split(' ', 1)[0]
                    except (IndexError, KeyError, ValueError):
                        contig_id = fasta_header.strip()
                else:
                    sequence_len += len(re.sub(pattern, '', current_line))
        # wrap up last fasta sequence
        if not first_header_found:
            raise Exception("There are no contigs in this file")
        else:
            fasta_dict[contig_id] = sequence_len
        return fasta_dict
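
    # A small illustration (hypothetical input): a FASTA file containing
    #     >contig_1 some description
    #     ACGTACGTAC
    #     ACGT
    #     >contig_2
    #     ACGTACGT
    # would make _load_stats() return {'contig_1': 14, 'contig_2': 8}.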

    def _check_reference(self, ref):
        """
        Tests the given ref string to make sure it conforms to the expected
        object reference format. Returns True if it passes, False otherwise.
        """
        obj_ref_regex = re.compile(
            r"^(?P<wsid>\d+)/(?P<objid>\d+)(/(?P<ver>\d+))?$")
        ref_path = ref.strip().split(";")
        for step in ref_path:
            if not obj_ref_regex.match(step):
                return False
        return True
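
    # Illustrative inputs for _check_reference() (not drawn from real objects):
    #     '12345/6'        -> True   (workspace id / object id)
    #     '12345/6/7'      -> True   (with an explicit version)
    #     '12345/6;78/9'   -> True   (a two-step reference path)
    #     'my_ws/my_obj'   -> False  (names rather than numeric ids)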

    def _check_ref_type(self, ref, allowed_types):
        """
        Validates the object type of ref against the list of allowed types. If it passes, this
        returns True, otherwise False.
        Really, all this does is verify that at least one of the strings in allowed_types is
        a substring of the ref object type name.
        Ex1:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "KBaseFile.Assembly"]
        returns False
        Ex2:
        ref = "KBaseGenomes.Genome-4.0"
        allowed_types = ["assembly", "genome"]
        returns True
        """
        obj_type = self._get_object_type(ref).lower()
        for t in allowed_types:
            if t.lower() in obj_type:
                return True
        return False

    def _get_object_type(self, ref):
        """
        Fetches and returns the typed object name of ref from the given workspace url.
        If that object doesn't exist, or there's another Workspace error, this raises a
        RuntimeError exception.
        """
        info = self.ws_client.get_object_info3({'objects': [{'ref': ref}]})
        obj_info = info.get('infos', [[]])[0]
        if len(obj_info) == 0:
            raise RuntimeError(
                "An error occurred while fetching type info from the Workspace. "
                "No information returned for reference {}".format(ref))
        return obj_info[2]
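
    # Note: the value returned above is the full Workspace type string of the object,
    # e.g. something along the lines of 'KBaseGenomeAnnotations.Assembly-6.0'
    # (the exact version suffix varies).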

    def _get_fasta_from_assembly(self, assembly_ref):
        """
        From an assembly or contigset, this uses a data file to build a FASTA file
        and return the path to it.
        """
        allowed_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        if not self._check_ref_type(assembly_ref, allowed_types):
            raise ValueError(
                "The reference {} cannot be used to fetch a FASTA file".format(
                    assembly_ref))
        au = AssemblyUtil(self.callback_url)
        return au.get_assembly_as_fasta({'ref': assembly_ref})
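
    # The AssemblyUtil call above returns a small dict describing the dumped file,
    # roughly {'path': '/kb/module/work/tmp/assembly.fa', ...} (path hypothetical);
    # callers in this module read the 'path' key from it.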

    def generate_report(self, contig_file_name, params, out_dir, wsname):
        """
        generate_report: reporting results
        """
        log('Generating and saving report')

        contig_file_with_path = os.path.join(out_dir, contig_file_name)
        fasta_stats = self._load_stats(contig_file_with_path)
        lengths = [fasta_stats[contig_id] for contig_id in fasta_stats]

        assembly_ref = params[self.PARAM_IN_WS] + '/' + params[
            self.PARAM_IN_CS_NAME]

        report_text = ''
        report_text += 'MaSuRCA results saved to: ' + wsname + '/' + out_dir + '\n'
        report_text += 'Assembly saved to: ' + assembly_ref + '\n'
        report_text += 'Assembled into ' + str(len(lengths)) + ' contigs.\n'
        report_text += 'Avg Length: ' + str(
            sum(lengths) / float(len(lengths))) + ' bp.\n'

        # compute a simple contig length distribution
        bins = 10
        counts, edges = np.histogram(lengths, bins)
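        # For example (hypothetical), lengths = [1200, 3500, 8000] with bins = 2 gives
        # counts = [2, 1] and edges = [1200.0, 4600.0, 8000.0]; edges has one more
        # element than counts, which the loop below relies on.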
        report_text += ('Contig Length Distribution '
                        '(# of contigs -- min to max basepairs):\n')
        for c in range(bins):
            report_text += ('   ' + str(counts[c]) + '\t--\t' + str(edges[c]) +
                            ' to ' + str(edges[c + 1]) + ' bp\n')
        print('Running QUAST')
        quastret = self.kbq.run_QUAST({
            'files': [{
                'path': contig_file_with_path,
                'label': params[self.PARAM_IN_CS_NAME]
            }]
        })

        output_files = self._generate_output_file_list(out_dir)

        print('Saving report')
        report_output = self.kbr.create_extended_report({
            'message': report_text,
            'objects_created': [{
                'ref': assembly_ref,
                'description': 'Assembled contigs'
            }],
            'direct_html_link_index': 0,
            'file_links': output_files,
            'html_links': [{
                'shock_id': quastret['shock_id'],
                'name': 'report.html',
                'label': 'QUAST report'
            }],
            'report_object_name': 'kb_masurca_report_' + str(uuid.uuid4()),
            'workspace_name': params[self.PARAM_IN_WS]
        })
        report_name = report_output['name']
        report_ref = report_output['ref']
        return report_name, report_ref

    def validate_params(self, params):
        """
        validate_params: checks params passed to run_masurca_app method and set default values
        """
        # log('Start validating run_masurca_app parameters:\n{}'.format(
        # json.dumps(params, indent=1)))

        # check for mandatory parameters
        if params.get(self.PARAM_IN_WS, None) is None:
            raise ValueError(self.PARAM_IN_WS + ' parameter is mandatory')
        if self.PARAM_IN_THREADN not in params:
            raise ValueError(self.PARAM_IN_THREADN + ' parameter is mandatory')

        if params.get(self.PARAM_IN_JF_SIZE, None) is None:
            raise ValueError(self.PARAM_IN_JF_SIZE + ' parameter is mandatory')
        if params.get(self.PARAM_IN_READS_LIBS, None) is None:
            raise ValueError(self.PARAM_IN_READS_LIBS +
                             ' parameter is mandatory')
        if not isinstance(params[self.PARAM_IN_READS_LIBS], list):
            raise ValueError(self.PARAM_IN_READS_LIBS + ' must be a list')

        if params.get(self.PARAM_IN_CS_NAME, None) is None:
            raise ValueError('Parameter {} is mandatory!'.format(
                self.PARAM_IN_CS_NAME))
        if self.INVALID_WS_OBJ_NAME_RE.search(params[self.PARAM_IN_CS_NAME]):
            raise ValueError('Invalid workspace object name: {}.'.format(
                params[self.PARAM_IN_CS_NAME]))

        if 'dna_source' in params:
            dna_src = params.get('dna_source')
            if dna_src == 'bacteria':
                params['limit_jump_coverage'] = 60
                params['cgwErrorRate'] = 0.25
            else:
                params['limit_jump_coverage'] = 300
                params['cgwErrorRate'] = 0.15

        if params.get('create_report', None) is None:
            params['create_report'] = 0

        return params
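
    # A minimal params dict that would pass validate_params() (shown with the class
    # constants as keys; the concrete key strings and values are only illustrative):
    #     {PARAM_IN_WS: 'my_workspace', PARAM_IN_THREADN: 16,
    #      PARAM_IN_JF_SIZE: 2000000000, PARAM_IN_READS_LIBS: [ ... ],
    #      PARAM_IN_CS_NAME: 'masurca_contigs'}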

    def construct_masurca_assembler_cfg(self, params):
        # STEP 1: get the working folder housing the config.txt file and the masurca results
        wsname = params[self.PARAM_IN_WS]
        config_file_path = os.path.join(self.proj_dir, 'config.txt')

        # STEP 2.1: retrieve the reads data from input parameter
        pe_reads_data = self._get_pereads_info(params)
        jp_reads_data = []
        if params.get(self.PARAM_IN_JUMP_LIBS, None):
            jp_reads_data = self._get_jpreads_info(params)
            if 'jp_mean' not in params or not isinstance(params['jp_mean'], int):
                params['jp_mean'] = 3600
            if 'jp_stdev' not in params or not isinstance(params['jp_stdev'], int):
                params['jp_stdev'] = 200

        # STEP 2.2: PACBIO reads must be in a single FASTA file and supplied as PACBIO=reads.fa;
        assbl_types = [
            'KBaseFile.Assembly', 'KBaseGenomeAnnotations.Assembly',
            'KBaseGenomes.ContigSet'
        ]
        reads_types = [
            'KBaseAssembly.SingleEndLibrary', 'KBaseFile.SingleEndLibrary',
            'KBaseAssembly.PairedEndLibrary', 'KBaseFile.PairedEndLibrary'
        ]
        pb_reads_file = ''
        if params.get('pacbio_reads', None):
            pb_ref = params['pacbio_reads']
            if self._check_ref_type(pb_ref, assbl_types):
                pb_reads_file = (self._get_fasta_from_assembly(pb_ref)).get(
                    'path', '')
            elif self._check_ref_type(pb_ref, reads_types):
                pb_rd = self._get_kbreads_info(wsname, [pb_ref])
                pb_reads_file = pb_rd[0]['fwd_file']
                if pb_rd[0].get('rev_file', None):
                    pb_reads_file += ' ' + pb_rd[0]['rev_file']

        # STEP 2.3: NANOPORE reads must be in a single FASTA/FASTQ file and supplied
        # as NANOPORE=reads.fa
        np_reads_file = ''
        if params.get('nanopore_reads', None):
            np_ref = params['nanopore_reads']
            if self._check_ref_type(np_ref, assbl_types):
                np_reads_file = (self._get_fasta_from_assembly(np_ref)).get(
                    'path', '')
            elif self._check_ref_type(np_ref, reads_types):
                np_rd = self._get_kbreads_info(wsname, [np_ref])
                np_reads_file = np_rd[0]['fwd_file']
                if np_rd[0].get('rev_file', None):
                    np_reads_file += ' ' + np_rd[0]['rev_file']

        # STEP 2.4: any OTHER sequence data (454, Sanger, Ion torrent, etc) must be first
        # converted into Celera Assembler compatible .frg files
        # (see http://wgsassembler.sourceforge.com) and supplied as OTHER=file.frg
        other_frg = ''
        if params.get('other_frg_file', None):
            other_frg = params['other_frg_file']

        # STEP 3: construct and save the config.txt file for running masurca
        try:
            # STEP 3.1: replace the 'DATA...END' portion of the config_template.txt file
            data_str = self._get_data_portion(pe_reads_data, jp_reads_data,
                                              pb_reads_file, np_reads_file,
                                              other_frg)
            if data_str == '':  # no reads libraries are specified, no further actions
                return ''

            config_template = ''
            with codecs.open(os.path.join(os.path.dirname(__file__),
                                          'config_template.txt'),
                             mode='r',
                             encoding='utf-8') as config_template_file:
                config_template = config_template_file.read()

            begin_patn1 = "DATA\n"
            end_patn1 = "END\nPARAMETERS\n"
            config_with_data = self._replaceSectionText(
                config_template, begin_patn1, end_patn1, data_str)
            # log("\n***After DATA section replacement:\n{}\nSaved at {}".format(
            #             config_with_data.encode('utf-8').decode('utf-8'), config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(config_with_data)

            # STEP 3.2: replace the 'PARAMETERS...END' portion of the config_file file saved above
            param_str = self._get_parameters_portion(params)
            if param_str == '':  # no parameters are specified, no further actions
                return ''

            previous_config = ''
            with codecs.open(config_file_path, mode='r',
                             encoding='utf-8') as previous_config_file:
                previous_config = previous_config_file.read()

            begin_patn2 = "PARAMETERS\n"
            end_patn2 = "END\n"
            final_config = self._replaceSectionText(previous_config,
                                                    begin_patn2, end_patn2,
                                                    param_str)
            log("\n***Configuration file content:\n{}\nSaved at {}".format(
                final_config.encode('utf-8').decode('utf-8'),
                config_file_path))

            with codecs.open(config_file_path, mode='w',
                             encoding='utf-8') as config_file:
                config_file.write(final_config)
        except IOError as ioerr:
            log('Creation of the config.txt file raised error:\n')
            pprint(ioerr)
            return ''
        else:
            return config_file_path
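
    # For orientation, the config.txt written above typically ends up shaped roughly
    # like the following MaSuRCA configuration (prefixes, insert sizes and paths are
    # hypothetical):
    #     DATA
    #     PE= pe 500 50 /path/fwd.fastq /path/rev.fastq
    #     JUMP= s1 3600 200 /path/jump_fwd.fastq /path/jump_rev.fastq
    #     PACBIO=/path/pacbio_reads.fa
    #     END
    #     PARAMETERS
    #     NUM_THREADS = 16
    #     JF_SIZE = 200000000
    #     END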

    def generate_assemble_script(self, config_file):
        if os.path.isfile(config_file):
            f_dir, f_nm = os.path.split(config_file)
            m_cmd = [self.MaSuRCA_BIN]
            m_cmd.append(config_file)
            try:
                self.prog_runner.run(m_cmd, f_dir)
                assemble_file = os.path.join(f_dir, 'assemble.sh')
                log('Created the assemble.sh file at {}.\n'.format(
                    assemble_file))
                return assemble_file
            except ValueError as ve:
                log('Error generating assemble.sh file: \n{}'.format(ve))
                raise ValueError('Failed to generate assemble.sh file!')
        else:
            log("The config file {} is not found.\n".format(config_file))
            log('NO assemble.sh file created.\n')
        return ''
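
    # Conceptually, the step above corresponds to running the MaSuRCA binary on the
    # generated configuration, e.g. (paths hypothetical):
    #     masurca /path/to/config.txt
    # which writes an assemble.sh driver script next to config.txt; run_assemble()
    # below then executes that script with bash.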

    def run_assemble(self, asmbl_file):
        exit_code = 1
        if os.path.isfile(asmbl_file):
            log("The assemble.sh file exists at {}\n".format(asmbl_file))
            f_dir, f_nm = os.path.split(asmbl_file)
            a_cmd = ['/bin/bash']
            a_cmd.append(asmbl_file)
            log("The working directory is {}\n".format(f_dir))
            log("The assembling command is {}\n".format(' '.join(a_cmd)))
            try:
                exit_code = self.prog_runner.run(a_cmd, f_dir)
            except ValueError as ve:
                log('Error running assemble: \n{}'.format(ve))
        else:
            log("The assemble.sh file {} is not found.".format(asmbl_file))
        return exit_code

    def save_assembly(self, contig_fa, wsname, a_name):
        if os.path.isfile(contig_fa):
            log('Uploading FASTA file to Assembly...')
            self.au.save_assembly_from_fasta({
                'file': {
                    'path': contig_fa
                },
                'workspace_name': wsname,
                'assembly_name': a_name
            })
        else:
            log("The contig file {} is not found.".format(contig_fa))