Example #1
# Client import paths below follow the current KBase SDK convention; older
# generated apps import them as e.g. AssemblyUtil.AssemblyUtilClient instead.
import os

from installed_clients.AssemblyUtilClient import AssemblyUtil
from installed_clients.GenomeFileUtilClient import GenomeFileUtil
from installed_clients.VariationUtilClient import VariationUtil


class DownloadUtils:
    def __init__(self, callbackURL):
        # use the callback URL passed in by the caller instead of silently
        # overriding it from the environment
        self.callbackURL = callbackURL
        self.au = AssemblyUtil(self.callbackURL)
        self.vu = VariationUtil(self.callbackURL)
        self.gfu = GenomeFileUtil(self.callbackURL)

    def download_genome(self, genomeref, output_dir):
        '''
        Downloads the genome assembly as a FASTA file.
        :param genomeref: workspace reference of the genome
        :param output_dir: directory to write the FASTA file to
        :return: dict describing the downloaded file
        '''

        file = self.au.get_assembly_as_fasta({
            'ref': genomeref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file

    def get_variation(self, variation_ref):
        '''
        Downloads variation data as a VCF file.
        :param variation_ref: workspace reference of the variation object
        :return: path to the downloaded VCF file
        '''

        filepath = self.vu.get_variation_as_vcf(
            {'variation_ref': variation_ref})['path']
        return filepath

    def get_gff(self, genome_ref):
        '''
        :param genome_ref: workspace reference of the genome
        :return: gff file path
        '''

        file = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        :param assembly_ref: workspace reference of the assembly
        :param output_dir: directory to write the FASTA file to
        :return: assembly file path
        '''

        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': os.path.join(output_dir, "ref_genome.fa")
        })
        return file['path']
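A minimal usage sketch for the class above, assuming it runs inside a KBase SDK job environment; the object references and the output directory are hypothetical placeholders.

# Hypothetical usage inside an SDK app method; refs and paths are placeholders.
import os

utils = DownloadUtils(os.environ['SDK_CALLBACK_URL'])
genome_fa_info = utils.download_genome('12345/6/7', '/kb/module/work/tmp')
vcf_path = utils.get_variation('12345/8/1')
gff_path = utils.get_gff('12345/6/7')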
Example #2
# Uses the same imports as Example #1 (os plus the GenomeFileUtil and
# VariationUtil clients).
class DownloadDataUtils:
    def __init__(self):
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        self.gfu = GenomeFileUtil(self.callbackURL)
        # attribute name must match the one used in download_vcf below
        self.vu = VariationUtil(self.callbackURL)

    def download_genome(self, params):
        # despite the name, this returns the genome's GFF file info
        file = self.gfu.genome_to_gff({'genome_ref': params['gff_ref']})
        return file

    def download_vcf(self, params):
        params['input_var_ref'] = params['vcf_ref']
        return self.vu.export_variation_as_vcf(params)
Example #3
class DownloadUtils:
    def __init__(self):
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.au = AssemblyUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)

    def get_gff(self, genome_ref, output_dir):
        '''
        Downloads the GFF file for a genome into the snpEff data layout.
        :param genome_ref: workspace reference of the genome
        :param output_dir: base output directory
        :return: path to the downloaded GFF file
        '''

        gff_filename = os.path.join(output_dir, "snp_eff", "data",
                                    "kbase_v1", "gene.gff")
        file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'filename': gff_filename
        })
        return file['file_path']

    def get_assembly(self, assembly_ref, output_dir):
        '''
        Downloads the assembly FASTA into the snpEff data layout.
        :param assembly_ref: workspace reference of the assembly
        :param output_dir: base output directory
        :return: path to the downloaded FASTA file
        '''
        assembly_filename = os.path.join(output_dir, "snp_eff", "data",
                                         "kbase_v1", "sequences.fa")
        file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref,
            'filename': assembly_filename
        })
        return file['path']
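A hedged usage sketch for the class above, which lays files out for a snpEff-style data directory; the refs are placeholders, and it is assumed the snp_eff/data/kbase_v1 directory already exists under output_dir.

# Hypothetical usage; refs and paths are placeholders.
utils = DownloadUtils()
gff_path = utils.get_gff('12345/6/7', '/kb/module/work/tmp')
fasta_path = utils.get_assembly('12345/3/1', '/kb/module/work/tmp')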
Example #4
# Requires: os, signal, subprocess, time, traceback, uuid, and
# "from pprint import pprint", plus the KBase clients (ReadsAlignmentUtils,
# KBaseReport, DataFileUtil, GenomeFileUtil, SetAPI, Workspace) imported as
# in Example #1.
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):

        _, file_extension = os.path.splitext(file_path)
        multiplier = 0

        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    # column 2 of the config file is the BAM path; strip the
                    # trailing newline so os.path.getsize gets a clean path
                    bam_file_path = line.strip().split('\t')[1]
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            multiplier = total_file_size // self.LARGE_BAM_FILE_SIZE
        else:
            multiplier = self._get_file_size(file_path) // self.LARGE_BAM_FILE_SIZE

        print('setting number of windows multiplier to: {}'.format(multiplier))

        return multiplier
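For concreteness, a small worked example of the arithmetic above (the sizes are illustrative): a 50 GB BAM floors to a multiplier of 2, which the bamqc runners below turn into a window-size setting of 800.

# Illustrative multiplier arithmetic; the BAM size is hypothetical.
LARGE_BAM_FILE_SIZE = 20 * 1024 ** 3           # 20 GB threshold
bam_size = 50 * 1024 ** 3                      # pretend 50 GB BAM
multiplier = bam_size // LARGE_BAM_FILE_SIZE   # -> 2
window_size = multiplier * 400                 # -> 800, passed to QualiMap as -nw 800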

    def _timeout_handler(self, signum, frame):
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        run_error = False
        try:
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)
        except Exception:
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self,
                      result,
                      output_workspace,
                      run_error=None,
                      input_ref=None):

        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message': ' ',
                'objects_created': objects_created,
                'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
                'workspace_name': output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message': '',
            'objects_created': [],
            'direct_html_link_index': 0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name': 'qualimap_report' + str(uuid.uuid4()),
            'workspace_name': output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):

        print('Start fetching GFF file from genome')

        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref': input_ref,
                'include_item_info': 1
            })
            input_ref = set_data['data']['items'][0]['ref']

        obj_data = self.dfu.get_objects(
            {"object_refs": [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')

        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            # fall back to running QualiMap without an annotation file
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            # flag and value must be separate argv tokens under shell=False;
            # a single '-nw <n>' string would not be parsed by QualiMap
            options.extend(['-nw', str(window_size)])  # increase size of windows

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print(f'using larger window size: {window_size} and Java memory: '
                  f'{self.JAVA_MEM_DEFAULT_SIZE}')
            options.extend(['-nw', str(window_size)])  # increase size of windows
            options.append(f'--java-mem-size={self.JAVA_MEM_DEFAULT_SIZE}')

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref': alignment_set_ref,
            'include_item_info': 1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = alignment.get('label')
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if at least one alignment defines one
        use_labels = any(alignment['label'] for alignment in reads_alignment_info)

        # write the config file, closing it even if a write fails
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        name_lookup = {}
        with open(input_file_path, 'w') as input_file:
            for alignment in reads_alignment_info:
                name = alignment['info'][1]
                if name in name_lookup:
                    name_lookup[name] += 1
                    name = name + '_' + str(name_lookup[name])
                else:
                    name_lookup[name] = 1

                input_file.write(name + '\t' + alignment['bam_file_path'])
                if use_labels:
                    if alignment['label']:
                        input_file.write('\t' + alignment['label'])
                    else:
                        input_file.write('\tunlabeled')
                input_file.write('\n')
        return input_file_path
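The file written above is QualiMap's multi-bamqc sample sheet: one tab-separated line per sample with a name, a BAM path, and an optional group label. A hypothetical two-sample sheet (names and paths are placeholders) might look like this:

# multi_input.txt (columns are tab-separated; values are placeholders):
#   sample_A    /kb/module/work/tmp/a/sample_A.bam    treated
#   sample_B    /kb/module/work/tmp/b/sample_B.bam    unlabeled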

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if int(params.get('create_report', 0)) == 1:
            # output_workspace must be present and non-empty
            if not params.get('output_workspace'):
                raise ValueError(
                    'If "create_report" was set, then "output_workspace" is required'
                )
            create_report = True
        params['create_report'] = create_report
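For reference, a minimal params dict that passes this validation and drives run_app; the ref and the workspace name are placeholders.

# Hypothetical input for run_app; values are placeholders.
params = {
    'input_ref': '12345/2/1',            # alignment or alignment-set reference
    'create_report': 1,                  # request a KBase report
    'output_workspace': 'my_workspace',  # required whenever create_report is set
}
# result = runner.run_app(params)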

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exit_code = p.wait()

        if exit_code == 0:
            print('Success, exit code was: ' + str(exit_code))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exit_code))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        """ Simple utility for packaging a folder and saving to shock """
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3(
            {'objects': [{'ref': ref}]})['infos'][0]
Example #5
# Requires: os, json, gzip, uuid, shutil, logging, subprocess, requests,
# collections.Counter, and the KBase clients (Workspace, DataFileUtil,
# GenomeFileUtil) imported as in Example #1.
class JbrowseUtil:
    def __init__(self, config):
        callback_url = os.environ['SDK_CALLBACK_URL']
        ws_url = config['ws_url']
        self.wsc = Workspace(ws_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        # service-wizard url
        self.sw_url = config['sw_url']
        self.shock_url = config['shock_url']
        scratch = config['scratch']
        session = str(uuid.uuid4())
        self.session_dir = os.path.join(scratch, session)
        os.mkdir(self.session_dir)

    def get_variation_service_url(self, sw_url):
        '''
        Get the most recent VariationFileServ url from the service wizard.
        sw_url: service wizard url
        '''
        # TODO: switch the requested version from "dev" to beta/release
        json_obj = {
            "method": "ServiceWizard.get_service_status",
            "id": "",
            "params": [{"module_name": "VariationFileServ", "version": "dev"}]
        }
        sw_resp = requests.post(url=sw_url, data=json.dumps(json_obj))
        vfs_resp = sw_resp.json()
        shock_host = self.shock_url.replace("https://", "")
        vfs_url = vfs_resp['result'][0]['url'] + "/jbrowse_query/" + shock_host + "/node"
        return vfs_url

    def _run_cmd(self, cmd):
        try:
            # capture stderr as well, otherwise the stderr branch below is dead
            process = subprocess.Popen(cmd, shell=True,
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.PIPE)
            stdout, stderr = process.communicate()
            if stdout:
                logging.info("ret> %s", process.returncode)
                logging.info("OK> output %s", stdout)
            if stderr:
                logging.info("ret> %s", process.returncode)
                logging.info("Error> error %s", stderr.strip())

        except OSError as e:
            logging.info("OSError > %s", e.errno)
            logging.info("OSError > %s", e.strerror)
            logging.info("OSError > %s", e.filename)

    def create_refseqs_data_from_assembly(self, assembly_ref):
        '''
        Builds the refSeqs entries JBrowse needs from an assembly's contigs.
        :param assembly_ref: workspace reference of the assembly
        :return: list of refseq dicts (name, start, end, length, seqChunkSize)
        '''
        refseqs_data = []
        # 1) Download assembly contig info and parse contig length information
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']
        for key in data['contigs']:
            refseqs_data.append(
                {"end": data['contigs'][key]["length"],
                 "length": data['contigs'][key]["length"],
                 "name": data['contigs'][key]["contig_id"],
                 "seqChunkSize": 20000,
                 "start": 0
                 }
            )
        return refseqs_data


    def prepare_genome_features_track(self, genome_ref, vfs_url):
        """
        Builds track for genome features

        :param genome_ref:
        :return:
        """
        shock_handles = list()

        # 1) Download gff using genomefileutil
        gff_file_info = self.gfu.genome_to_gff({'genome_ref': genome_ref})
        gff_file = gff_file_info["file_path"]

        # 2) sort gff
        outfile = gff_file + "_sorted"
        sorted_gff_cmd = " ".join(["sort -k1,1 -k4,4n",
                                  gff_file, ">", outfile])
        self._run_cmd(sorted_gff_cmd)

        # 3) compress gff
        zip_cmd = "bgzip " + outfile
        self._run_cmd(zip_cmd)

        # 4) index gff
        index_gff_cmd = "tabix -p gff " + gff_file + "_sorted.gz"
        self._run_cmd(index_gff_cmd)

        gff_gz_file_path = gff_file + "_sorted.gz"
        gff_index_file_path = gff_file + "_sorted.gz.tbi"

        # 5) Upload gff and gff index to shock, failing loudly if the
        #    bgzip/tabix steps did not produce them
        if not os.path.exists(gff_gz_file_path) or not os.path.exists(gff_index_file_path):
            raise FileNotFoundError(
                'Missing ' + gff_gz_file_path + ' or its .tbi index')
        gff_shock_ref = self.dfu.file_to_shock(
            {'file_path': gff_gz_file_path, 'make_handle': 1}
        )
        gff_index_shock_ref = self.dfu.file_to_shock(
            {'file_path': gff_index_file_path, 'make_handle': 1}
        )

        # 6) Create gff track text used for the genome features track
        gff_track = '''
        {
            "label": "Genome Features",
            "key": "GenomeFeatures",
            "storeClass": "JBrowse/Store/SeqFeature/GFF3Tabix",
            "urlTemplate":"<vfs_url>/<gff_shock_ref>",
            "tbiUrlTemplate": "<vfs_url>/<gff_index_shock_ref>",
            "type": "JBrowse/View/Track/CanvasFeatures"
        }
        '''
        gff_track = gff_track.replace("<gff_shock_ref>",
                                      gff_shock_ref['handle']['id'])
        gff_track = gff_track.replace("<gff_index_shock_ref>",
                                      gff_index_shock_ref['handle']['id'])
        gff_track = gff_track.replace("<vfs_url>", vfs_url)
        gff_track_dict = json.loads(gff_track)

        # 7) Capture shock handles
        shock_handles.append(gff_shock_ref['handle'])
        shock_handles.append(gff_index_shock_ref['handle'])

        # 8) return shock handles and gff track info
        return {"shock_handle_list": shock_handles, "track_item": gff_track_dict}



    def prepare_snp_frequency_track(self, vcf_filepath, assembly_ref, binsize, vfs_url):
        """

        :param vcf_filepath:
        :param assembly_ref:
        :param binsize:
        :return:
        """
        BEDGRAPHTOBIGWIG="/kb/deployment/bin/bedGraphToBigWig"
        shock_handles = list()

        chr_length_dict = {}
        chr_length_data = ""
        chr_length_path = None
        counts = Counter()

        # 1) Download assembly contig info and parse contig length information
        data = self.wsc.get_object_subset([{
            'included': ['/contigs'],
            'ref': assembly_ref
        }])[0]['data']

        contigs = data["contigs"]
        for contig in contigs:
            contig_data = data["contigs"][contig]
            chr_length_data += str(contig_data['contig_id']) + '\t' + str(contig_data['length']) + '\n'
            c_id = str(contig_data['contig_id'])
            c_length = str(contig_data['length'])
            chr_length_dict[c_id] = c_length

        # 2) Write contig lengths to a file (needed later by bedGraphToBigWig)
        if chr_length_data:
            chr_length_path = os.path.join(self.session_dir, "chr_length.txt")
            with open(chr_length_path, "w") as f:
                f.write(chr_length_data)

        # 3) Read and parse the vcf file (must be bgzip compressed)
        #    Calculate the number of SNPs in each bin and write in bedgraph format
        logging.info("Generating bedgraph file\n")
        with gzip.open(vcf_filepath, "rt") as reader:
            for record in reader:
                if record[0] == "#":
                    continue
                rs = record.split("\t")
                CHR, POS = rs[0], rs[1]
                bin_pos = int(POS) // binsize
                bin_id = str(CHR) + "\t" + str(bin_pos)
                counts[bin_id] += 1
        bedgraph_file = os.path.join(self.session_dir, "vcf_bedgraph.txt")
        try:
            with open(bedgraph_file, "w") as fout:
                for j, k in counts.items():
                    chromosome, bin_num = j.split("\t")
                    bin_start = int(bin_num) * binsize
                    bin_end = bin_start + binsize
                    chr_length = chr_length_dict[chromosome]
                    if bin_end <= int(chr_length):
                        fout.write(chromosome + "\t" + str(bin_start) + "\t" + str(bin_end) + "\t" + str(k) + "\n")
                    else:
                        fout.write(chromosome + "\t" + str(bin_start) + "\t" + str(chr_length) + "\t" + str(k) + "\n")
        except IOError:
            logging.info("Unable to write %s file on disk.", bedgraph_file)

        # 4) Sort bedgraph file by chromosome id and coordinates
        sorted_bedgraph_file = bedgraph_file + "_sorted"
        sort_cmd = "sort -k1,1 -k2,2n " + bedgraph_file + " > " + sorted_bedgraph_file
        self._run_cmd(sort_cmd)

        # 5) Convert sorted bedgraph to bigwig format using the bedGraphToBigWig tool
        output_bigwig_file = bedgraph_file + "_bigwig.bw"
        cmd = BEDGRAPHTOBIGWIG + " " + sorted_bedgraph_file + " " + chr_length_path + " " + output_bigwig_file
        logging.info("Generating bigwig ..\n" + cmd + "\n")
        self._run_cmd(cmd)

        # 6) Upload bigwig file to shock, failing loudly if conversion failed
        logging.info("Uploading Bigwig file to shock")
        if not os.path.exists(output_bigwig_file):
            raise FileNotFoundError('bedGraphToBigWig did not produce ' + output_bigwig_file)
        bigwig_shock_ref = self.dfu.file_to_shock(
            {'file_path': output_bigwig_file, 'make_handle': 1}
        )
        # 7) Append shock handle to genomic_indexes
        shock_handles.append(bigwig_shock_ref['handle'])

        # 8) Build snp frequency track
        output_bigwig_shock = bigwig_shock_ref['handle']['id']
        snp_frequency_track = '''
        {
            "label": "Variation Density",
            "key": "Variation_density",
            "storeClass": "JBrowse/Store/SeqFeature/BigWig",
            "urlTemplate": "<vfs_url>/<bigwig_shock_id>",
            "type": "JBrowse/View/Track/Wiggle/XYPlot"
        }
        '''
        snp_frequency_track = snp_frequency_track.replace("<bigwig_shock_id>", output_bigwig_shock)
        snp_frequency_track = snp_frequency_track.replace("<vfs_url>", vfs_url)
        snp_frequency_track_dict = json.loads(snp_frequency_track)
        # 9) Return shock handles and track info
        return {"shock_handle_list": shock_handles, "track_item": snp_frequency_track_dict}


    def prepare_snp_track(self, vcf_shock_id, vcf_index_shock_id, vfs_url):
        """

        :param vcf_shock_id:
        :param vcf_index_shock_id:
        :return:
        """
        shock_handles = list()

        snp_track = '''
            {
                "label": "Variation",
                "key": "Variation",
                "storeClass": "JBrowse/Store/SeqFeature/VCFTabix",
                "urlTemplate": "<vfs_url>/<vcf_shock_id>",
                "tbiUrlTemplate": "<vfs_url>/<vcf_index_shock_id>",
                "type": "JBrowse/View/Track/HTMLVariants"
            }
        '''
        snp_track = snp_track.replace("<vcf_shock_id>", vcf_shock_id)
        snp_track = snp_track.replace("<vcf_index_shock_id>", vcf_index_shock_id)
        snp_track = snp_track.replace("<vfs_url>", vfs_url)
        snp_track_dict = json.loads(snp_track)
        # shock handles should be empty list in return when built from shock ids
        return {"shock_handle_list": shock_handles, "track_item": snp_track_dict}

    def build_jbrowse_data_folder(self, jbrowse_path):
        shock_handles = list()
        data_folder_shock_ref = self.dfu.file_to_shock({'file_path': jbrowse_path,
                                            'pack': 'zip', 'make_handle': 1})
        shock_handles.append(data_folder_shock_ref['handle'])
        return {"shock_handle_list": shock_handles}

    def build_jbrowse(self, jbrowse_src, jbrowse_path, refseqs_data, genomic_indexes, tracklist_items):
        """
        Assembles the jbrowse report directory from the bundled jbrowse
        source plus the refseqs data, genomic indexes, and track list.
        :param jbrowse_src: path to the bundled jbrowse source tree
        :param jbrowse_path: destination directory for the report
        :param refseqs_data: contig entries for refSeqs.json
        :param genomic_indexes: shock handles collected so far
        :param tracklist_items: track dicts for trackList.json
        :return: jbrowse report dict
        """
        jbrowse_report = {}

        # 1) Copy the jbrowse source code to build the report
        shutil.copytree(jbrowse_src, jbrowse_path)

        # 2) Put tracklist.json in jbrowse data path
        tracklist_path = os.path.join(jbrowse_path, "data", "trackList.json")
        trackdata = {
            'formatVersion': 1,
            'tracks': tracklist_items
        }
        with open(tracklist_path, "w") as f:
            f.write(json.dumps(trackdata))

        # 3) Put refseq.json in jbrowse seq path
        refseqs_json_path = os.path.join(jbrowse_path, "data", "seq", "refSeqs.json")
        with open(refseqs_json_path, "w") as f:
            f.write(json.dumps(refseqs_data))

        # Build jbrowse data folder to support the jbrowse widget in the narrative
        res = self.build_jbrowse_data_folder(jbrowse_path)
        data_folder_index = res['shock_handle_list']
        genomic_indexes = genomic_indexes + data_folder_index

        # Build jbrowse report dict
        jbrowse_report["jbrowse_data_path"] = jbrowse_path
        jbrowse_report["genomic_indexes"] = genomic_indexes

        return jbrowse_report

    def prepare_jbrowse_report(self, jbrowse_params):
        """
        Builds genomic indexes and prepares the jbrowse report
        :param jbrowse_params:
        :return: jbrowse report dict
        """
        # Service wizard
        sw_url = self.sw_url
        # Variation file service url for serving jbrowse track files
        vfs_url = self.get_variation_service_url(sw_url)

        print(vfs_url)

        genomic_indexes = list()
        tracklist_items = list()
        refseqs_data = None

        # 1) Build refseqs_data
        #    This is used to build refseqs.json file for jbrowse
        #    Jbrowse report can not be built if assembly ref doesn't exist
        if 'assembly_ref' in jbrowse_params:
            assembly_ref = jbrowse_params['assembly_ref']
            refseqs_data = self.create_refseqs_data_from_assembly(assembly_ref)
        else:
            raise ValueError("assembly ref not found")

        # 2) Build genome features track
        if 'genome_ref' in jbrowse_params:
            genome_ref = jbrowse_params['genome_ref']
            output = self.prepare_genome_features_track(genome_ref, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], output["track_item"]
            genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping genome features track")


        # 3) Build SNP frequency track
        cond1 = 'vcf_path' in jbrowse_params
        cond2 = 'assembly_ref' in jbrowse_params
        cond3 = 'binsize' in jbrowse_params
        if cond1 and cond2 and cond3:
            vcf_path = jbrowse_params['vcf_path']
            assembly_ref = jbrowse_params['assembly_ref']
            binsize = jbrowse_params["binsize"]
            output = self.prepare_snp_frequency_track(vcf_path, assembly_ref, binsize, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], output["track_item"]
            if shock_handles:
                genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping SNP frequency track")

        # 4) Build SNP track
        cond1 = 'vcf_shock_id' in jbrowse_params
        cond2 = 'vcf_index_shock_id' in jbrowse_params
        if cond1 and cond2:
            vcf_shock_id = jbrowse_params['vcf_shock_id']
            vcf_index_shock_id = jbrowse_params['vcf_index_shock_id']
            output = self.prepare_snp_track(vcf_shock_id, vcf_index_shock_id, vfs_url)
            shock_handles, track_item = output["shock_handle_list"], output["track_item"]
            genomic_indexes = genomic_indexes + shock_handles
            tracklist_items.append(track_item)
        else:
            print("Skipping SNP track")
        # 5) Build jbrowse directory with index.html
        # jbrowse directory later on gets uploaded as html report
        jbrowse_src = "/kb/module/deps/jbrowse"
        jbrowse_path = os.path.join(self.session_dir, "jbrowse")

        if tracklist_items:
            jbrowse_report = self.build_jbrowse(jbrowse_src,
                                                jbrowse_path,
                                                refseqs_data,
                                                genomic_indexes,
                                                tracklist_items)
        else:
            raise ValueError("No tracks found")

        return jbrowse_report
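Collecting the keys this method reads into one place, a hypothetical jbrowse_params might look as follows; only assembly_ref is mandatory, and every ref, id, and path here is a placeholder.

# Hypothetical input; all values are placeholders.
jbrowse_params = {
    'assembly_ref': '12345/3/1',              # required: drives refSeqs.json
    'genome_ref': '12345/4/1',                # optional: genome features track
    'vcf_path': '/path/to/variation.vcf.gz',  # optional, with binsize: SNP density track
    'binsize': 10000,
    'vcf_shock_id': '<shock-node-id>',        # optional, with the index id: SNP track
    'vcf_index_shock_id': '<shock-node-id>',
}
# report = JbrowseUtil(config).prepare_jbrowse_report(jbrowse_params)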
Example #6
# Requires: os, subprocess, the KBase clients (DataFileUtil, AssemblyUtil,
# GenomeFileUtil) imported as in Example #1, and a filter_gff helper defined
# elsewhere in the module.
def download_gffs(cb_url, scratch, genome_set_ref):
    """
    Downloads a GFF (+ appended FASTA) file for every genome in a GenomeSet,
    handling both "KBaseSearch.GenomeSet" and "KBaseSets.GenomeSet" objects.
    Args:
    cb_url - callback server URL
    scratch - scratch work folder
    genome_set_ref - reference to genome_set object in workspace
    Returns the path to the folder containing .gff files, plus a dict mapping
    each written file path to its (ref, id-to-position map, ids) info.
    """

    # Get our utilities
    dfu = DataFileUtil(cb_url)
    au = AssemblyUtil(cb_url)
    gfu = GenomeFileUtil(cb_url)

    obj_data = dfu.get_objects({'object_refs': [genome_set_ref]})['data'][0]
    gs_obj = obj_data['data']
    obj_type = obj_data['info'][2]

    if 'KBaseSets.GenomeSet' in obj_type:
        refs = [gsi['ref'] for gsi in gs_obj['items']]
    elif 'KBaseSearch.GenomeSet' in obj_type:
        refs = [gse['ref'] for gse in gs_obj['elements'].values()]
    else:
        raise TypeError(
            'provided input must be of type KBaseSets.GenomeSet or '
            'KBaseSearch.GenomeSet, not ' + str(obj_type))

    if len(refs) < 2:
        raise ValueError("Must provide GenomeSet with at least 2 Genomes.")

    # name the output directories
    temp_dir = os.path.join(scratch, 'temp')
    final_dir = os.path.join(scratch, 'gff')

    os.mkdir(final_dir)
    os.mkdir(temp_dir)

    # write a separator file that lets us cat the gff and fasta files together
    cat_path = os.path.join(scratch, 'fast_cat.txt')

    with open(cat_path, 'w') as cat_file:
        cat_file.write("##FASTA\n")

    path_to_ref_and_ID_pos_dict = {}
    all_ids = set([])

    for ref in refs:
        gen_obj = dfu.get_objects({'object_refs': [ref]})['data'][0]['data']

        # no Eukaryotes, no Fungi;
        # yes bacteria, yes archaea, yes(?) viruses
        if gen_obj['domain'] not in ['Bacteria', 'Archaea']:
            raise TypeError(
                'Provided genomes are not labeled as Bacteria or Archaea. '
                'Roary is only equipped to handle Archaea or Bacteria.')

        fasta_path = os.path.join(temp_dir, gen_obj['id'] + ".fa")
        gff_file = gfu.genome_to_gff({
            'genome_ref': ref,
            'target_dir': temp_dir
        })
        if 'assembly_ref' not in gen_obj:
            raise TypeError("All genomes must contain an 'assembly_ref'")
        fasta_file = au.get_assembly_as_fasta({
            'ref': gen_obj['assembly_ref'],
            'filename': fasta_path
        })
        # check that the fasta file exists
        if not os.path.isfile(fasta_file['path']):
            raise ValueError(
                'An input Genome does not have an associated FASTA file.')

        # figure out whether the FASTA is already embedded in the gff file
        # (not sure this step is strictly needed)
        if 'path' in gff_file:
            gff_file_path = gff_file['path']
        elif 'file_path' in gff_file:
            gff_file_path = gff_file['file_path']
        elif 'gff_file' in gff_file:
            gff_file_path = gff_file['gff_file']['path']
        else:
            raise ValueError("No GFF File Path found.")

        assert os.path.isfile(gff_file_path), \
            "Could not find input GFF file for object with workspace reference: %s" % ref

        # here we want to make sure that the IDs in the genome object match up
        # with the IDs in the gff file. This is important because the pangenome
        # object uses the genome objects (in the pangenome viewer).

        gen_id_to_pos, contains_fasta, all_ids, gen_ids = filter_gff(
            gff_file_path, gen_obj, all_ids=all_ids)

        new_file_path = os.path.join(final_dir, gen_obj['id'] + '.gff')

        if contains_fasta:
            args = ['mv', gff_file_path, new_file_path]
            subprocess.call(args)
        else:
            # NOTE: We have to pipe output of cat call to the new_file_path
            # next we make a new 'gff' file that contains both the gff and fasta information

            args = ['cat', gff_file_path, cat_path, fasta_file['path']]
            catted_files = subprocess.check_output(args)
            with open(new_file_path, 'w') as f:
                f.write(catted_files.decode('utf-8'))

        path_to_ref_and_ID_pos_dict[new_file_path] = (ref, gen_id_to_pos,
                                                      gen_ids)

    return final_dir, path_to_ref_and_ID_pos_dict
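A hedged usage sketch, assuming a KBase job environment where SDK_CALLBACK_URL is set; the genome-set reference is a placeholder.

# Hypothetical call from an SDK app; the ref is a placeholder.
import os

gff_dir, ref_map = download_gffs(
    cb_url=os.environ['SDK_CALLBACK_URL'],
    scratch='/kb/module/work/tmp',
    genome_set_ref='12345/9/1',
)
print('GFF files written to', gff_dir)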