Example #1
def download_genome_to_json_files(token, genome_ref, target_dir):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    file_name_to_data_map = {}
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                       token=token,
                       service_ver='dev')
    genome_data = dfu.get_objects({'object_refs': [genome_ref]})['data'][0]
    genome_obj = genome_data['data']
    genome_meta = genome_data['info'][10]
    file_name_to_data_map["genome.json"] = genome_obj
    file_name_to_data_map["genome.meta.json"] = genome_meta
    if 'genbank_handle_ref' in genome_obj:
        gbk_file_name = "genome.gbk"
        dfu.shock_to_file({
            'handle_id': genome_obj['genbank_handle_ref'],
            'file_path': os.path.join(target_dir, gbk_file_name)
        })
        genome_obj['genbank_handle_ref'] = gbk_file_name
    if 'contigset_ref' in genome_obj:
        contigset_data = dfu.get_objects(
            {'object_refs': [genome_obj['contigset_ref']]})['data'][0]
        contigset_obj = contigset_data['data']
        contigset_meta = contigset_data['info'][10]
        file_name_to_data_map["contigset.json"] = contigset_obj
        file_name_to_data_map["contigset.meta.json"] = contigset_meta
        genome_obj['contigset_ref'] = "contigset.json"
    elif 'assembly_ref' in genome_obj:
        assembly_data = dfu.get_objects(
            {'object_refs': [genome_obj['assembly_ref']]})['data'][0]
        assembly_obj = assembly_data['data']
        assembly_meta = assembly_data['info'][10]
        file_name_to_data_map["assembly.json"] = assembly_obj
        file_name_to_data_map["assembly.meta.json"] = assembly_meta
        genome_obj['assembly_ref'] = "assembly.json"
        fasta_handle_ref = assembly_obj['fasta_handle_ref']
        fasta_file_name = "assembly.fa"
        dfu.shock_to_file({
            'handle_id':
            fasta_handle_ref,
            'file_path':
            os.path.join(target_dir, fasta_file_name)
        })
        assembly_obj['fasta_handle_ref'] = fasta_file_name
        assembly_obj['external_source_id'] = fasta_file_name
        if 'taxon_ref' in assembly_obj:
            taxon_obj = dfu.get_objects(
                {'object_refs':
                 [assembly_obj['taxon_ref']]})['data'][0]['data']
            file_name_to_data_map["taxon.json"] = taxon_obj
            assembly_obj['taxon_ref'] = "taxon.json"
            if 'taxon_ref' in genome_obj:
                genome_obj['taxon_ref'] = "taxon.json"
            taxon_obj['parent_taxon_ref'] = ""
    for target_file_name in file_name_to_data_map:
        with open(os.path.join(target_dir, target_file_name), 'w') as f:
            json.dump(file_name_to_data_map[target_file_name],
                      f,
                      sort_keys=True,
                      indent=4)
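A minimal call sketch (not part of the original listing): the token, genome reference, and target directory below are placeholders, and the function additionally expects SDK_CALLBACK_URL to be set in the job environment.

import os

# Placeholder inputs -- substitute real values from your SDK job context.
token = os.environ.get('KB_AUTH_TOKEN', 'my-kbase-token')   # hypothetical token source
genome_ref = '12345/6/7'                                    # workspace/object/version
target_dir = '/kb/module/work/tmp/genome_export'

# Writes genome.json and genome.meta.json, plus contigset/assembly/taxon
# JSON files and any linked GenBank/FASTA files, into target_dir.
download_genome_to_json_files(token, genome_ref, target_dir)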
Example #2
    def BuildFastaFromSequenceSet(self, ctx, params):
        """
        :param params: instance of type "BuildSeqIn" -> structure: parameter
           "workspace_name" of String, parameter "SequenceSetRef" of String,
           parameter "fasta_outpath" of String
        :returns: instance of type "BuildSeqOut" -> structure: parameter
           "fasta_outpath" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN BuildFastaFromSequenceSet
        dfu = DataFileUtil(self.callback_url)
        get_objects_params = {'object_refs': [params['SequenceSetRef']]}
        SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']

        outFile = open(params['fasta_outpath'], 'w')
        for s in SeqSet['sequences']:
            sname = '>' + s['sequence_id'] + '\n'
            outFile.write(sname)
            sseq = s['sequence'] + '\n'
            outFile.write(sseq)
        outFile.close()
        output = {'fasta_outpath': params['fasta_outpath']}

        #END BuildFastaFromSequenceSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
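A hedged usage sketch for the method above: 'impl' stands for an instance of the SDK Impl class that defines BuildFastaFromSequenceSet and 'ctx' for the job context passed in by the SDK server; the reference and output path are placeholders.

params = {
    'workspace_name': 'my_workspace',          # placeholder workspace
    'SequenceSetRef': '12345/8/1',             # placeholder SequenceSet reference
    'fasta_outpath': '/kb/module/work/tmp/sequence_set.fa',
}
# The SDK convention used above returns a single-element list.
output = impl.BuildFastaFromSequenceSet(ctx, params)[0]
print(output['fasta_outpath'])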
Example #3
    def test_load_with_filter_and_options(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "legacy_test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'FilteredAssembly'
        result = assemblyUtil.save_assembly_from_fasta(
            self.getContext(), {
                'file': {
                    'path': fasta_path
                },
                'workspace_name': self.getWsName(),
                'assembly_name': ws_obj_name,
                'min_contig_length': 9,
                'external_source': 'someplace',
                'external_source_id': 'id',
                'external_source_origination_date': 'sunday',
                'type': 'metagenome',
                'contig_info': {
                    's3': {
                        'is_circ': 0,
                        'description': 'somethin'
                    }
                }
            })

        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        assembly = dfu.get_objects({'object_refs':
                                    [result[0]]})['data'][0]['data']

        self.assertEqual(len(assembly['contigs']), 1)
        self.assertEqual(assembly['contigs']['s3']['md5'],
                         '4f339bd56e5f43ecb52e8682a790a111')
        self.assertEqual(assembly['contigs']['s3']['contig_id'], 's3')
        self.assertEqual(assembly['contigs']['s3']['length'], 18)
        self.assertEqual(assembly['contigs']['s3']['is_circ'], 0)
        self.assertEqual(assembly['contigs']['s3']['description'], 'somethin')

        self.assertEqual(assembly['dna_size'], 18)
        self.assertEqual(assembly['gc_content'], 0.44444)
        self.assertEqual(assembly['md5'], 'eba4d1771060e19671a56832d159526e')
        self.assertEqual(assembly['num_contigs'], 1)
        self.assertEqual(assembly['type'], 'metagenome')
        self.assertEqual(assembly['external_source'], 'someplace')
        self.assertEqual(assembly['external_source_id'], 'id')
        self.assertEqual(assembly['external_source_origination_date'],
                         'sunday')
Example #4
    def generate_report(self, obj_refs, workspace_name):
        """
        generate_report: generate summary report

        params:
        obj_refs: generated workspace object references (return of upload_fastq_file)
        workspace_name: workspace name/ID that reads will be stored to
        """

        uuid_string = str(uuid.uuid4())
        obj_refs_list = obj_refs.split(',')

        dfu = DataFileUtil(self.callback_url)

        upload_message = 'Upload Finished\nUploaded Reads:\n'

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }

            object_data = dfu.get_objects(get_objects_params)
            upload_message += "Reads Name: " + str(
                object_data.get('data')[0].get('info')[1]) + '\n'
            upload_message += "Reads Type: " + str(
                object_data.get('data')[0].get('info')[2]) + '\n'
            reads_info = object_data.get('data')[0].get('info')[-1]
            if isinstance(reads_info, dict):
                upload_message += "Reads Info: " + json.dumps(
                    reads_info, indent=1)[1:-1] + '\n'

        report_params = {
            'message': upload_message,
            'workspace_name': workspace_name,
            'report_object_name': 'kb_upload_mothods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
Example #5
    def test_filtered_everything(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "legacy_test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'FilteredAssembly'
        result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                       {'file': {'path': fasta_path},
                                                        'workspace_name': self.getWsName(),
                                                        'assembly_name': ws_obj_name,
                                                        'min_contig_length': 500
                                                        })

        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        assembly = dfu.get_objects({'object_refs': [result[0]]})['data'][0]['data']
        self.assertEqual(assembly['dna_size'], 0)
        self.assertEqual(assembly['gc_content'], None)
        self.assertEqual(assembly['num_contigs'], 0)
Example #6
    def test_load_with_filter_and_options(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "legacy_test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'FilteredAssembly'
        result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                       {'file': {'path': fasta_path},
                                                        'workspace_name': self.getWsName(),
                                                        'assembly_name': ws_obj_name,
                                                        'min_contig_length': 9,
                                                        'external_source': 'someplace',
                                                        'external_source_id': 'id',
                                                        'external_source_origination_date': 'sunday',
                                                        'type': 'metagenome',
                                                        'contig_info': {'s3': {'is_circ': 0, 'description': 'somethin'}}
                                                        })

        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        assembly = dfu.get_objects({'object_refs': [result[0]]})['data'][0]['data']

        self.assertEqual(len(assembly['contigs']), 1)
        self.assertEqual(assembly['contigs']['s3']['md5'], '4f339bd56e5f43ecb52e8682a790a111')
        self.assertEqual(assembly['contigs']['s3']['contig_id'], 's3')
        self.assertEqual(assembly['contigs']['s3']['length'], 18)
        self.assertEqual(assembly['contigs']['s3']['is_circ'], 0)
        self.assertEqual(assembly['contigs']['s3']['description'], 'somethin')

        self.assertEqual(assembly['dna_size'], 18)
        self.assertEqual(assembly['gc_content'], 0.44444)
        self.assertEqual(assembly['md5'], 'eba4d1771060e19671a56832d159526e')
        self.assertEqual(assembly['num_contigs'], 1)
        self.assertEqual(assembly['type'], 'metagenome')
        self.assertEqual(assembly['external_source'], 'someplace')
        self.assertEqual(assembly['external_source_id'], 'id')
        self.assertEqual(assembly['external_source_origination_date'], 'sunday')
Example #7
def download_fasta(refs, cb_url):
    """
    Args:
      refs - list of workspace references, each in the form 'workspace_id/object_id/obj_version'
      cb_url - callback server URL
    Returns a list of paths to the downloaded FASTA files
    """
    dfu = DataFileUtil(cb_url)
    assembly_util = AssemblyUtil(cb_url)
    ws_objects = dfu.get_objects({'object_refs': refs})
    paths = []
    for (obj, ref) in zip(ws_objects['data'], refs):
        ws_type = obj['info'][2]
        if 'KBaseGenomes.Genome' in ws_type:
            assembly_ref = get_assembly_ref_from_genome(ref, obj)
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            assembly_ref = ref
        else:
            raise TypeError('Invalid type ' + ws_type + '. Must be an Assembly or Genome.')
        path = assembly_util.get_assembly_as_fasta({'ref': assembly_ref})['path']
        paths.append(path)
    return paths
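For illustration, a sketch of how download_fasta might be driven; the references are placeholders and the callback URL comes from the usual SDK environment variable.

import os

refs = ['12345/2/1', '12345/3/1']        # placeholder Genome/Assembly references
cb_url = os.environ['SDK_CALLBACK_URL']  # set by the SDK job runner
for path in download_fasta(refs, cb_url):
    print('downloaded FASTA:', path)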
Example #8
    def DownloadMotifSet(self, ctx, params):
        """
        :param params: instance of type "DownloadParams" -> structure:
           parameter "ws_name" of String, parameter "source_ref" of String,
           parameter "format" of String
        :returns: instance of type "DownloadOutput" -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN DownloadMotifSet
        #fname = params[]
        dfu = DataFileUtil(self.callback_url)
        get_object_params = {
            'object_refs': [params['source_ref']]
        }  #grab motifset object
        MSO = dfu.get_objects(get_object_params)['data'][0]['data']
        output = ''
        if params['format'] == 'MEME':
            output = MD.MotifSetToMEME(MSO)
        else:
            print('FORMAT IS NOT RECOGNIZED OR SUPPORTED')
            print('Supported Formats: MEME JASPAR TRANSFAC')
            print('Implemented: MEME')
        outFilePath = '/kb/module/work/tmp/' + params['outname']
        with open(outFilePath, 'w') as outFile:
            outFile.write(output)
        output = {'destination_path': outFilePath}

        #TODO: add this...
        #END DownloadMotifSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method DownloadMotifSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
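A placeholder-only sketch of calling DownloadMotifSet; as above, 'impl' and 'ctx' stand for the Impl instance and job context supplied by the SDK server. Note that the method reads params['outname'], which is not listed in the typespec shown in the docstring.

params = {
    'ws_name': 'my_workspace',     # placeholder workspace
    'source_ref': '12345/11/1',    # placeholder MotifSet reference
    'format': 'MEME',              # only MEME is implemented above
    'outname': 'motifs.meme',      # file written under /kb/module/work/tmp/
}
output = impl.DownloadMotifSet(ctx, params)[0]
print(output['destination_path'])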
Example #9
class PangenomeDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.pga = PanGenomeAPI(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "pangenome_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        files = {}
        working_dir = os.path.join(self.scratch,
                                   'pangenome-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)

        pg_id, id_name_map, genome_df = self.make_genomes_df(
            params['pangenome_ref'])
        files['genomes_path'] = os.path.join(working_dir, pg_id + "_Genomes.tsv")
        genome_df.to_csv(files['genomes_path'], sep="\t")

        ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map)
        files['orthologs_path'] = os.path.join(working_dir,
                                             pg_id + "_Orthologs.tsv")
        ortho_df.to_csv(files['orthologs_path'], sep="\t")

        return pg_id, files

    def to_excel(self, params):
        files = {}
        working_dir = os.path.join(self.scratch,
                                   'pangenome-download-' + str(uuid.uuid4()))
        os.makedirs(working_dir)

        pg_id, id_name_map, genome_df = self.make_genomes_df(
            params['pangenome_ref'])
        files['path'] = os.path.join(working_dir, pg_id + ".xlsx")
        writer = pandas.ExcelWriter(files['path'])
        genome_df.to_excel(writer, "Genomes")

        ortho_df = self.make_ortholog_df(params['pangenome_ref'], id_name_map)
        ortho_df.to_excel(writer, "Orthologs")
        writer.save()

        return pg_id, files

    def make_genomes_df(self, pg_ref):
        summary = self.pga.compute_summary_from_pangenome({
            "pangenome_ref": pg_ref})
        return summary['pangenome_id'], summary['genome_ref_name_map'], \
               pandas.DataFrame(summary['shared_family_map'])

    def make_ortholog_df(self, pg_ref, id_name_map):
        pangen = self.dfu.get_objects({'object_refs': [pg_ref]}
                                      )['data'][0]['data']
        ortho = {}
        for cluster in pangen['orthologs']:
            ortho[cluster['id']] = {
                "representative function": cluster.get('function', ""),
                "type": cluster.get("type", ""),
                "protein sequence": cluster.get("protein_translation", ""),
            }
            for gid, name in id_name_map.items():
                ortho[cluster['id']][name] = ";".join(
                    [x[0] for x in cluster['orthologs'] if x[2] == gid])

        return pandas.DataFrame.from_dict(ortho, 'index')[
            ["representative function", "type", "protein sequence"] +
            sorted([x for x in id_name_map.values()])]

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['pangenome_ref']]
        })

        return {'shock_id': package_details['shock_id']}
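A sketch, under assumed inputs, of driving PangenomeDownload end to end; the config dict carries only the 'scratch' key read above, and the pangenome reference is a placeholder.

config = {'scratch': '/kb/module/work/tmp'}   # minimal config for this class
downloader = PangenomeDownload(config)        # needs SDK_CALLBACK_URL in the env

params = {'workspace_name': 'my_workspace',
          'pangenome_name': 'my_pangenome',
          'pangenome_ref': '12345/9/1'}       # placeholder Pangenome reference
PangenomeDownload.validate_params(params)

pg_id, files = downloader.to_tsv(params)                    # writes the two TSV files
result = downloader.export(files.values(), pg_id, params)   # zips them into Shock
print(result['shock_id'])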
Example #10
class QualiMapRunner:

    QUALIMAP_PATH = '/kb/module/qualimap-bin/qualimap'
    JAVA_MEM_DEFAULT_SIZE = '16G'
    LARGE_BAM_FILE_SIZE = 20 * 1024 * 1024 * 1024  # 20 GB
    TIMEOUT = 72 * 60 * 60  # 72 hours

    def _get_file_size(self, file_path):
        file_size = os.path.getsize(file_path)
        print('File size: {} -- {}'.format(file_size, file_path))
        return file_size

    def _large_file(self, file_path):

        filename, file_extension = os.path.splitext(file_path)
        multiplier = 0

        if file_extension == '.txt':
            total_file_size = 0
            with open(file_path, 'r') as f:
                for line in f:
                    bam_file_path = line.split('\t')[1]
                    total_file_size += self._get_file_size(bam_file_path)
            print('Total file size: {}'.format(total_file_size))
            # integer multiple of the 20 GB threshold (0 for small files)
            multiplier = int(total_file_size) // int(self.LARGE_BAM_FILE_SIZE)
        else:
            multiplier = int(self._get_file_size(file_path)) // int(
                self.LARGE_BAM_FILE_SIZE)

        print('setting number of windows multiplier to: {}'.format(multiplier))

        return multiplier

    def _timeout_handler(self, signum, frame):
        print('Signal handler called with signal', signum)
        raise ValueError('QualiMap takes too long')

    def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
        self.scratch_dir = scratch_dir
        self.rau = ReadsAlignmentUtils(callback_url)
        self.kbr = KBaseReport(callback_url)
        self.dfu = DataFileUtil(callback_url)
        self.gfu = GenomeFileUtil(callback_url)
        self.set_api = SetAPI(srv_wiz_url)
        self.ws = Workspace(workspace_url)
        self.valid_commands = ['bamqc', 'multi-bamqc']

    def run_app(self, params):
        self.validate_params(params)
        print('Validated Params = ')
        pprint(params)
        run_info = self.get_run_info(params)

        if run_info.get('mode') not in ['single', 'multi']:
            raise ValueError(
                'Error in fetching the type to determine run settings.')

        run_error = False
        try:
            signal.signal(signal.SIGALRM, self._timeout_handler)
            signal.alarm(self.TIMEOUT)
            if run_info['mode'] == 'single':
                result = self.run_bamqc(params['input_ref'],
                                        run_info['input_info'])
            elif run_info['mode'] == 'multi':
                result = self.run_multi_sample_qc(params['input_ref'],
                                                  run_info['input_info'])
            signal.alarm(0)
        except Exception:
            run_error = True

            workdir = os.path.join(self.scratch_dir,
                                   'qualimap_' + str(int(time.time() * 10000)))
            os.makedirs(workdir)

            with open(os.path.join(workdir, 'qualimapReport.html'),
                      'w') as report:
                report.write('<html><body><p></p></body></html>')

            package_info = self.package_output_folder(
                workdir, 'QualiMap_report',
                'EMPTY HTML report directory for QualiMap BAM QC',
                'qualimapReport.html')

            result = {
                'qc_result_folder_path': workdir,
                'qc_result_zip_info': package_info,
                'shock_id': None
            }
            error_msg = 'Running QualiMap returned an error:\n{}\n'.format(
                traceback.format_exc())
            error_msg += 'Generating simple report instead\n'
            print(error_msg)

        if params['create_report']:
            result = self.create_report(result, params['output_workspace'],
                                        run_error, params['input_ref'])

        return result

    def create_report(self,
                      result,
                      output_workspace,
                      run_error=None,
                      input_ref=None):

        if run_error:
            objects_created = []
            info = self.get_obj_info(input_ref)
            obj_type = self.get_type_from_obj_info(info)
            if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'Alignment'
                })

            if obj_type in [
                    'KBaseRNASeq.RNASeqAlignmentSet',
                    'KBaseSets.ReadsAlignmentSet'
            ]:
                objects_created.append({
                    'ref': input_ref,
                    'description': 'AlignmentSet'
                })
                reads_alignment_info = self.get_alignments_from_set(input_ref)
                for alignment in reads_alignment_info:
                    alignment_ref = alignment.get('ref')
                    objects_created.append({
                        'ref': alignment_ref,
                        'description': 'Alignment'
                    })

            report_info = self.kbr.create_extended_report({
                'message':
                ' ',
                'objects_created':
                objects_created,
                'report_object_name':
                'qualimap_report' + str(uuid.uuid4()),
                'workspace_name':
                output_workspace
            })
            result['report_name'] = report_info['name']
            result['report_ref'] = report_info['ref']
            return result

        qc_result_zip_info = result['qc_result_zip_info']
        report_info = self.kbr.create_extended_report({
            'message':
            '',
            'objects_created': [],
            'direct_html_link_index':
            0,
            'html_links': [{
                'shock_id': qc_result_zip_info['shock_id'],
                'name': qc_result_zip_info['index_html_file_name'],
                'label': qc_result_zip_info['name']
            }],
            'report_object_name':
            'qualimap_report' + str(uuid.uuid4()),
            'workspace_name':
            output_workspace
        })
        result['report_name'] = report_info['name']
        result['report_ref'] = report_info['ref']
        return result

    def get_gtf_file(self, input_ref, set_op=False):

        print('Start fetching GFF file from genome')

        if set_op:
            set_data = self.set_api.get_reads_alignment_set_v1({
                'ref':
                input_ref,
                'include_item_info':
                1
            })
            input_ref = set_data['data']['items'][0]['ref']

        obj_data = self.dfu.get_objects({"object_refs":
                                         [input_ref]})['data'][0]['data']

        genome_ref = obj_data.get('genome_id')

        if not genome_ref:
            raise ValueError(
                'Alignment is not associated with a Genome object')

        result_directory = os.path.join(self.scratch_dir, str(uuid.uuid4()))
        os.makedirs(result_directory)

        genome_gtf_file = self.gfu.genome_to_gff({
            'genome_ref': genome_ref,
            'is_gtf': True,
            'target_dir': result_directory
        })['file_path']

        return genome_gtf_file

    def run_bamqc(self, input_ref, input_info):
        # download the input and setup a working dir
        alignment_info = self.rau.download_alignment({'source_ref': input_ref})
        bam_file_path = self.find_my_bam_file(
            alignment_info['destination_dir'])
        try:
            gtf_file = self.get_gtf_file(input_ref)
        except Exception:
            gtf_file = ''

        workdir = os.path.join(self.scratch_dir,
                               'qualimap_' + str(int(time.time() * 10000)))

        options = [
            '-bam', bam_file_path, '-c', '-outdir', workdir, '-outformat',
            'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        options.append('--java-mem-size={}'.format(
            self.JAVA_MEM_DEFAULT_SIZE))  # always use large mem
        multiplier = self._large_file(bam_file_path)
        if multiplier:
            window_size = multiplier * 400
            print('using larger window size: {} and Java memory: {}'.format(
                window_size, self.JAVA_MEM_DEFAULT_SIZE))
            # pass the flag and its value as separate argv entries for Popen(shell=False)
            options.extend(['-nw', str(window_size)])  # increase size of windows

        self.run_cli_command('bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap BAM QC', 'qualimapReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def run_multi_sample_qc(self, input_ref, input_info):
        # download the input and setup a working dir
        reads_alignment_info = self.get_alignments_from_set(input_ref)
        try:
            gtf_file = self.get_gtf_file(input_ref, set_op=True)
        except Exception:
            gtf_file = ''
        suffix = 'qualimap_' + str(int(time.time() * 10000))
        workdir = os.path.join(self.scratch_dir, suffix)
        os.makedirs(workdir)

        input_file_path = self.create_multi_qualimap_cfg(
            reads_alignment_info, workdir)

        options = [
            '-d', input_file_path, '-r', '-c', '-outdir', workdir,
            '-outformat', 'html'
        ]

        if gtf_file:
            options += ['-gff', gtf_file]

        multiplier = self._large_file(input_file_path)
        if multiplier:
            window_size = multiplier * 400
            print('using larger window size: {} and Java memory: {}'.format(
                window_size, self.JAVA_MEM_DEFAULT_SIZE))
            # pass the flag and its value as separate argv entries for Popen(shell=False)
            options.extend(['-nw', str(window_size)])  # increase size of windows
            options.append('--java-mem-size={}'.format(
                self.JAVA_MEM_DEFAULT_SIZE))

        self.run_cli_command('multi-bamqc', options)

        package_info = self.package_output_folder(
            workdir, 'QualiMap_report',
            'HTML report directory for QualiMap Multi-sample BAM QC',
            'multisampleBamQcReport.html')

        return {
            'qc_result_folder_path': workdir,
            'qc_result_zip_info': package_info
        }

    def get_alignments_from_set(self, alignment_set_ref):
        set_data = self.set_api.get_reads_alignment_set_v1({
            'ref':
            alignment_set_ref,
            'include_item_info':
            1
        })
        items = set_data['data']['items']

        reads_alignment_data = []
        for alignment in items:
            alignment_info = self.rau.download_alignment(
                {'source_ref': alignment['ref']})
            bam_file_path = self.find_my_bam_file(
                alignment_info['destination_dir'])
            label = None
            if 'label' in alignment:
                label = alignment['label']
            reads_alignment_data.append({
                'bam_file_path': bam_file_path,
                'ref': alignment['ref'],
                'label': label,
                'info': alignment['info']
            })
        return reads_alignment_data

    def create_multi_qualimap_cfg(self, reads_alignment_info, workdir):
        # Group by labels if there is at least one defined
        use_labels = False
        for alignment in reads_alignment_info:
            if alignment['label']:
                use_labels = True
                break

        # write the file
        input_file_path = os.path.join(workdir, 'multi_input.txt')
        input_file = open(input_file_path, 'w')
        name_lookup = {}
        for alignment in reads_alignment_info:
            name = alignment['info'][1]
            if name in name_lookup:
                name_lookup[name] += 1
                name = name + '_' + str(name_lookup[name])
            else:
                name_lookup[name] = 1

            input_file.write(name + '\t' + alignment['bam_file_path'])
            if use_labels:
                if alignment['label']:
                    input_file.write('\t' + alignment['label'])
                else:
                    input_file.write('\tunlabeled')
            input_file.write('\n')
        input_file.close()
        return input_file_path

    def get_run_info(self, params):
        info = self.get_obj_info(params['input_ref'])
        obj_type = self.get_type_from_obj_info(info)
        if obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            return {'mode': 'single', 'input_info': info}
        if obj_type in [
                'KBaseRNASeq.RNASeqAlignmentSet', 'KBaseSets.ReadsAlignmentSet'
        ]:
            return {'mode': 'multi', 'input_info': info}
        raise ValueError('Object type of input_ref is not valid, was: ' +
                         str(obj_type))

    def validate_params(self, params):
        if 'input_ref' not in params:
            raise ValueError(
                'required parameter field "input_ref" was not set')

        create_report = False
        if 'create_report' in params:
            if int(params['create_report']) == 1:
                if 'output_workspace' not in params:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                if not params['output_workspace']:
                    raise ValueError(
                        'If "create_report" was set, then "output_workspace" is required'
                    )
                create_report = True
        params['create_report'] = create_report

    def run_cli_command(self, command, options, cwd=None):
        if command not in self.valid_commands:
            raise ValueError('Invalid QualiMap command: ' + str(command))
        command = [self.QUALIMAP_PATH, command] + options
        print('Running: ' + ' '.join(command))

        if not cwd:
            cwd = self.scratch_dir

        p = subprocess.Popen(command, cwd=cwd, shell=False)
        exitCode = p.wait()

        if (exitCode == 0):
            print('Success, exit code was: ' + str(exitCode))
        else:
            raise ValueError('Error running command: ' + ' '.join(command) +
                             '\n' + 'Exit Code: ' + str(exitCode))

    def find_my_bam_file(self, dirpath):
        bam_path = None
        for f in os.listdir(dirpath):
            fullpath = os.path.join(dirpath, f)
            if os.path.isfile(fullpath) and f.lower().endswith('.bam'):
                if bam_path is not None:
                    raise ValueError(
                        'Error! Too many BAM files were downloaded for this alignment!'
                    )
                bam_path = fullpath
        if bam_path is None:
            raise ValueError(
                'Error! No BAM files were downloaded for this alignment!')
        return bam_path

    def package_output_folder(self, folder_path, zip_file_name,
                              zip_file_description, index_html_file):
        ''' Simple utility for packaging a folder and saving to shock '''
        output = self.dfu.file_to_shock({
            'file_path': folder_path,
            'make_handle': 0,
            'pack': 'zip'
        })
        return {
            'shock_id': output['shock_id'],
            'name': zip_file_name,
            'description': zip_file_description,
            'index_html_file_name': index_html_file
        }

    def get_type_from_obj_info(self, info):
        return info[2].split('-')[0]

    def get_obj_info(self, ref):
        return self.ws.get_object_info3({'objects': [{
            'ref': ref
        }]})['infos'][0]
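An illustrative construction of QualiMapRunner; every URL, directory, and reference here is a placeholder for values the KBase SDK runtime would normally provide.

import os

runner = QualiMapRunner(
    scratch_dir='/kb/module/work/tmp',
    callback_url=os.environ['SDK_CALLBACK_URL'],
    workspace_url='https://kbase.us/services/ws',            # placeholder endpoint
    srv_wiz_url='https://kbase.us/services/service_wizard')  # placeholder endpoint

result = runner.run_app({
    'input_ref': '12345/10/1',        # RNASeqAlignment or ReadsAlignmentSet ref
    'create_report': 1,
    'output_workspace': 'my_workspace',
})
print(result.get('report_ref'))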
Example #11
class ImportExpressionMatrixUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fv = KBaseFeatureValues(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_tsv_as_expression_matrix_from_staging(self, params):
        '''
        import_tsv_as_expression_matrix_from_staging: wrapper method for
                                    KBaseFeatureValues.tsv_file_to_matrix

        required params:
            staging_file_subdir_path: subdirectory file path
              e.g.
                for file: /data/bulk/user_name/file_name
                staging_file_subdir_path is file_name
                for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
                staging_file_subdir_path is subdir_1/subdir_2/file_name
            matrix_name: output Expression Matrix object name
            workspace_name: workspace name/ID of the object

        optional params:
            genome_ref: optional reference to a Genome object that will be
                  used for mapping feature IDs to the genome
            fill_missing_values: optional flag for filling in missing
                    values in matrix (default value is false)
            data_type: optional field; value is one of 'untransformed',
                    'log2_level', 'log10_level', 'log2_ratio', 'log10_ratio' or
                    'unknown' (last one is default value)
            data_scale: optional parameter (default value is '1.0')

        return:
            obj_ref: return object reference
        '''

        log('--->\nrunning ImportExpressionMatrixUtil.import_tsv_as_expression_matrix_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_tsv_as_expression_matrix_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        import_matrix_params = params
        import_matrix_params['input_file_path'] = scratch_file_path
        import_matrix_params['output_ws_name'] = params.get('workspace_name')
        import_matrix_params['output_obj_name'] = params.get('matrix_name')

        ref = self.fv.tsv_file_to_matrix(import_matrix_params)
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'),
                                                   ref.get('output_matrix_ref'))
        returnVal = {'obj_ref': ref.get('output_matrix_ref')}

        return returnVal

    def validate_import_tsv_as_expression_matrix_from_staging_params(self, params):
        """
        validate_import_tsv_as_expression_matrix_from_staging_params:
                    validates params passed to import_tsv_as_expression_matrix_from_staging method

        """

        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'matrix_name']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_tsv_as_expression_matrix_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Expression Matrix Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported TSV File: {}\n'.format(
                              params.get('staging_file_subdir_path'))

        report_params = {
              'message': upload_message,
              'workspace_name': params.get('workspace_name'),
              'report_object_name': 'kb_upload_mothods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
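A sketch of how ImportExpressionMatrixUtil might be invoked; the config keys shown are only the ones the snippet above reads directly (UploaderUtil may expect more), and the staging path and names are placeholders.

import os

config = {'SDK_CALLBACK_URL': os.environ['SDK_CALLBACK_URL'],
          'KB_AUTH_TOKEN': os.environ['KB_AUTH_TOKEN']}
importer = ImportExpressionMatrixUtil(config)

params = {'staging_file_subdir_path': 'subdir_1/expression_matrix.tsv',
          'matrix_name': 'MyExpressionMatrix',
          'workspace_name': 'my_workspace'}
result = importer.import_tsv_as_expression_matrix_from_staging(params)
report = importer.generate_report(result['obj_ref'], params)
print(report['report_ref'])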
Example #12
class Utils:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.kbse = KBaseSearchEngine(config['search-url'])
        self.gen_api = GenericsAPI(self.callback_url)
        self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
        self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
        self.DEFAULT_UNIT_ID = "Custom:Unit"

    @staticmethod
    def validate_params(params, expected, opt_param=set()):
        """Validates that required parameters are present. Warns if unexpected parameters appear"""
        expected = set(expected)
        opt_param = set(opt_param)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError(
                "Required keys {} not in supplied parameters".format(
                    ", ".join(expected - pkeys)))
        defined_param = expected | opt_param
        for param in params:
            if param not in defined_param:
                logging.warning(
                    "Unexpected parameter {} supplied".format(param))

    def get_conditions(self, params):
        data = self.dfu.get_objects(
            {'object_refs': [params['condition_set_ref']]})['data'][0]['data']
        conditions = {}
        keep_keys = params.get('conditions', data['conditions'].keys())
        for key in keep_keys:
            conditions[key] = defaultdict(list)
            for factor, val in zip(data['factors'], data['conditions'][key]):
                ont_abriv = factor['factor_ont_id'].split(":")[0]
                factor['value'] = val
                conditions[key][ont_abriv].append(copy.copy(factor))
        return {"conditions": conditions}

    def file_to_condition_set(self, params):
        """Convert a user supplied file to a compound set"""
        if 'input_file_path' in params:
            scratch_file_path = params['input_file_path']
        elif 'input_shock_id' in params:
            scratch_file_path = self.dfu.shock_to_file({
                'shock_id':
                params['input_shock_id'],
                'file_path':
                self.scratch
            }).get('file_path')
        else:
            raise ValueError(
                "Must supply either a input_shock_id or input_file_path")
        try:
            df = pd.read_excel(scratch_file_path, dtype='str')
        except XLRDError:
            df = pd.read_csv(scratch_file_path, sep="\t", dtype='str')
        comp_set = self._df_to_cs_obj(df)
        info = self.dfu.save_objects({
            "id":
            params['output_ws_id'],
            "objects": [{
                "type": "KBaseExperiments.ConditionSet",
                "data": comp_set,
                "name": params['output_obj_name']
            }]
        })[0]
        return {"condition_set_ref": "%s/%s/%s" % (info[6], info[0], info[4])}

    def _conditionset_data_to_df(self, data):
        """
        Converts a compound set object data to a dataframe
        """

        factors = pd.DataFrame(data['factors'])
        # rename() returns a new DataFrame, so keep the result
        factors = factors.rename(columns=lambda x: x.replace("ont", "ontology").
                                 capitalize().replace("_", " "))
        conditions = pd.DataFrame(data['conditions'])
        cs_df = factors.join(conditions)

        return cs_df

    def _clusterset_data_to_df(self, data):
        """
        Converts a cluster set object data to a dataframe
        """

        original_matrix_ref = data.get('original_data')
        data_matrix = self.gen_api.fetch_data({
            'obj_ref': original_matrix_ref
        }).get('data_matrix')

        data_df = pd.read_json(data_matrix)
        clusters = data.get('clusters')

        id_name_list = [
            cluster.get('id_to_data_position').keys() for cluster in clusters
        ]
        id_names = [item for sublist in id_name_list for item in sublist]

        if set(data_df.columns.tolist()) == set(
                id_names):  # cluster is based on condition
            data_df = data_df.T

        cluster_names = [None] * data_df.index.size

        cluster_id = 0
        for cluster in clusters:
            item_ids = cluster.get('id_to_data_position').keys()
            item_idx = [data_df.index.get_loc(item_id) for item_id in item_ids]

            for idx in item_idx:
                cluster_names[idx] = cluster_id

            cluster_id += 1

        data_df['cluster'] = cluster_names

        return data_df

    def _ws_obj_to_df(self, input_ref):
        """Converts workspace obj to a dataframe"""
        res = self.dfu.get_objects({'object_refs': [input_ref]})['data'][0]
        name = res['info'][1]

        obj_type = res['info'][2]

        if "KBaseExperiments.ConditionSet" in obj_type:
            cs_df = self._conditionset_data_to_df(res['data'])
        elif "KBaseExperiments.ClusterSet" in obj_type:
            cs_df = self._clusterset_data_to_df(res['data'])
        else:
            err_msg = 'Ooops! [{}] is not supported.\n'.format(obj_type)
            err_msg += 'Please supply KBaseExperiments.ConditionSet or KBaseExperiments.ClusterSet'
            raise ValueError("err_msg")

        return name, cs_df, obj_type

    def _df_to_cs_obj(self, cs_df):
        """Converts a dataframe from a user file to a compound set object"""
        condition_set = {'ontology_mapping_method': "User Curation"}
        cs_df.fillna('', inplace=True)
        if not len(cs_df):
            raise ValueError("No factors in supplied files")
        factor_df = cs_df.filter(regex="[Uu]nit|[Ff]actor")
        condition_df = cs_df.drop(factor_df.columns, axis=1)
        if not len(condition_df.columns):
            raise ValueError(
                "Unable to find any condition columns in supplied file")

        factor_df.rename(
            columns=lambda x: x.lower().replace(" ontology ", "_ont_").strip(),
            inplace=True)
        if "factor" not in factor_df.columns:
            raise ValueError(
                "Unable to find a 'Factor' column in supplied file")
        factor_fields = ('factor', 'unit', 'factor_ont_id', 'unit_ont_id')
        factors = factor_df.filter(items=factor_fields).to_dict('records')

        condition_set['factors'] = [
            self._add_ontology_info(f) for f in factors
        ]
        condition_set['conditions'] = condition_df.to_dict('list')
        return condition_set

    def _search_ontologies(self, term, closest=False):
        """
        Match to an existing KBase ontology term
        :param term: Test to match
        :param closest: if false, term must exactly match an ontology ID
        :return: dict(ontology_ref, id)
        """
        params = {
            "object_types": ["OntologyTerm"],
            "match_filter": {
                "lookup_in_keys": {
                    "id": {
                        "value": term
                    }
                }
            },
            "access_filter": {
                "with_private": 0,
                "with_public": 1
            },
            "pagination": {
                "count": 1
            },
            "post_processing": {
                "skip_data": 1
            }
        }
        if closest:
            params['match_filter'] = {"full_text_in_all": term}
        res = self.kbse.search_objects(params)
        if not res['objects']:
            return None
        term = res['objects'][0]
        return {
            "ontology_ref": term['guid'].split(":")[1],
            "id": term['key_props']['id']
        }

    def _add_ontology_info(self, factor):
        """Searches KBASE ontologies for terms matching the user supplied factors and units.
        Add the references if found"""
        optionals = {
            "unit",
            "unit_ont_id",
            "unit_ont_ref",
        }
        factor = {
            k: v
            for k, v in factor.items() if k not in optionals or v != ""
        }
        ont_info = self._search_ontologies(
            factor.get('factor_ont_id', "").replace("_", ":"))
        if ont_info:
            factor['factor_ont_ref'] = ont_info['ontology_ref']
            factor['factor_ont_id'] = ont_info['id']
        else:
            factor['factor_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
            factor['factor_ont_id'] = self.DEFAULT_ONTOLOGY_ID

        if factor.get('unit'):
            ont_info = self._search_ontologies(
                factor.get('unit_ont_id', '').replace("_", ":"))
            if ont_info:
                factor['unit_ont_ref'] = ont_info['ontology_ref']
                factor['unit_ont_id'] = ont_info['id']
            else:
                factor['unit_ont_ref'] = self.DEFAULT_ONTOLOGY_REF
                factor['unit_ont_id'] = self.DEFAULT_UNIT_ID
        return factor

    def to_tsv(self, params):
        """Convert an compound set to TSV file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".tsv")
        df.to_csv(files['file_path'], sep="\t", index=False)

        return _id, files

    def to_excel(self, params):
        """Convert an compound set to Excel file"""
        files = {}

        _id, df, obj_type = self._ws_obj_to_df(params['input_ref'])
        files['file_path'] = os.path.join(params['destination_dir'],
                                          _id + ".xlsx")

        writer = pd.ExcelWriter(files['file_path'])

        if "KBaseExperiments.ConditionSet" in obj_type:
            df.to_excel(writer, "Conditions", index=False)
        elif "KBaseExperiments.ClusterSet" in obj_type:
            df.to_excel(writer, "ClusterSet", index=True)
        # else is checked in `_ws_obj_to_df`

        writer.save()

        return _id, files

    def export(self, file, name, input_ref):
        """Saves a set of files to SHOCK for export"""
        export_package_dir = os.path.join(self.scratch,
                                          name + str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        shutil.move(file,
                    os.path.join(export_package_dir, os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [input_ref]
        })

        return {'shock_id': package_details['shock_id']}
Example #13
    def generate_report(self, obj_refs, params):
        """
        generate_report: generate summary report

        obj_refs: generated workspace object references (return of upload_fastq_file)
        params:
        fwd_staging_file_name: single-end fastq file name or forward/left paired-end
            fastq file name from the user's staging area
        rev_staging_file_name: reverse/right paired-end fastq file name from the
            user's staging area
        workspace_name: workspace name/ID that reads will be stored to
        """

        uuid_string = str(uuid.uuid4())
        obj_refs_list = obj_refs.split(',')

        dfu = DataFileUtil(self.callback_url)

        upload_message = 'Import Finished\nImported Reads:\n'

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }

            object_data = dfu.get_objects(get_objects_params)

            upload_message += "Reads Name: " + str(
                object_data.get('data')[0].get('info')[1]) + '\n'
            upload_message += "Reads Type: " + str(
                object_data.get('data')[0].get('info')[2]) + '\n'
            if params.get('fwd_staging_file_name'):
                upload_message += "Imported Reads File: %s" % params.get(
                    'fwd_staging_file_name')
                if params.get('rev_staging_file_name'):
                    upload_message += ' and %s\n' % params.get(
                        'rev_staging_file_name')
                else:
                    upload_message += '\n'
            else:
                reads_info = object_data.get('data')[0].get('info')[-1]
                if isinstance(reads_info, dict):
                    upload_message += "Reads Info: " + json.dumps(
                        reads_info, indent=1)[1:-1] + '\n'

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_mothods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
Example #14
    def generate_report(self, obj_refs, params):
        """
        generate_report: generate summary report


        obj_refs: generated workspace object references. (return of upload_fastq_file)
        params:
        fwd_staging_file_name:
            single-end fastq file name or forward/left paired-end fastq file name
            from user's staging area
        rev_staging_file_name: reverse/right paired-end fastq file name from the user's staging area
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        obj_refs_list = obj_refs.split(',')

        dfu = DataFileUtil(self.callback_url)

        reads_number = 1 if 'urls_to_add' not in params else len(
            params['urls_to_add'])

        upload_message = 'Import Finished\nImported Reads: {}\n'.format(
            reads_number)

        for obj_ref in obj_refs_list:
            get_objects_params = {
                'object_refs': [obj_ref],
                'ignore_errors': False
            }

            object_data = dfu.get_objects(get_objects_params)
            number_of_reads = object_data.get('data')[0].get('data').get(
                'read_count')

            upload_message += "Reads Name: "
            upload_message += str(
                object_data.get('data')[0].get('info')[1]) + '\n'
            if params.get('fwd_staging_file_name'):
                if params.get('rev_staging_file_name'):
                    upload_message += 'Imported Reads Files:\n'
                    upload_message += 'Forward: {}\n'.format(
                        params.get('fwd_staging_file_name'))
                    upload_message += 'Reverse: {}\n'.format(
                        params.get('rev_staging_file_name'))
                else:
                    upload_message += 'Imported Reads File: {}\n'.format(
                        params.get('fwd_staging_file_name'))
                if isinstance(number_of_reads, int):  # 'long' exists only in Python 2
                    upload_message += 'Number of Reads: {:,}\n'.format(
                        number_of_reads)
            else:
                reads_info = object_data.get('data')[0].get('info')[-1]
                if isinstance(reads_info, dict):
                    upload_message += "Reads Info: "
                    upload_message += json.dumps(reads_info,
                                                 indent=1)[1:-1] + '\n'

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_mothods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
Example #15
class ImportGenbankUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = os.path.join(config['scratch'], 'import_GenBank_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
        self.uploader_utils = UploaderUtil(config)

    def import_genbank_from_staging(self, params):
        '''
          import_genbank_from_staging: wrapper method for GenomeFileUtil.genbank_to_genome

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          genome_name - becomes the name of the object
          workspace_name - the name of the workspace it gets saved to.
          source - Source of the file typically something like RefSeq or Ensembl

          optional params:
          release - Release or version number of the data
              per example Ensembl has numbered releases of all their data: Release 31
          generate_ids_if_needed - If field used for feature id is not there,
              generate ids (default behavior is raising an exception)
          genetic_code - Genetic code of organism. Overwrites determined GC from
              taxon object
          type - Reference, Representative or User upload

          return:
          genome_ref: return object reference
        '''

        log('--->\nrunning ImportGenbankUtil.import_genbank_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_genbank_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                                 download_staging_file_params).get('copy_file_path')
        file = {
            'path': scratch_file_path
        }
        import_genbank_params = params
        import_genbank_params['file'] = file
        del import_genbank_params['staging_file_subdir_path']

        returnVal = self.gfu.genbank_to_genome(import_genbank_params)

        """
        Update the workspace-object-related metadata for the staged file
        """
        #self.uploader_utils.update_staging_service(
        #    download_staging_file_params.get('staging_file_subdir_path'),
        #    returnVal['genome_ref'])
        return returnVal

    def validate_import_genbank_from_staging_params(self, params):
        """
        validate_import_genbank_from_staging_params:
                    validates params passed to import_genbank_from_staging method
        """
        # check for required parameters
        for p in ['staging_file_subdir_path', 'genome_name', 'workspace_name', 'source']:
            if p not in params:
                raise ValueError('"' + p + '" parameter is required, but missing')

    def generate_html_report(self, genome_ref, params):
        """
        generate_html_report: generate HTML summary report
        """
        log('start generating html report')
        genome_obj = self.dfu.get_objects({'object_refs': [genome_ref]})
        html_report = list()
        result_file_path = os.path.join(self.scratch, 'report.html')

        genome_name = str(genome_obj.get('data')[0].get('info')[1])
        genome_file = params.get('staging_file_subdir_path')

        genome_data = genome_obj.get('data')[0].get('data')
        genome_info = genome_obj.get('data')[0].get('info')
        source = genome_info[10].get('Source')
        num_contigs = genome_info[10].get('Number contigs')
        size = genome_info[10].get('Size')
        gc_content = genome_info[10].get('GC content')
        warnings = genome_data.get('warnings', [])
        feature_counts = sorted(list(genome_data.get('feature_counts', {})
                                     .items()))

        genome_overview_data = collections.OrderedDict()

        genome_overview_data['Name'] = '{} ({})'.format(genome_name, genome_ref)
        #genome_overview_data['Uploaded File'] = genome_file
        genome_overview_data['Date Uploaded'] = time.strftime("%c")
        genome_overview_data['Source'] = source
        genome_overview_data['Number of Contigs'] = num_contigs
        genome_overview_data['Size'] = size
        genome_overview_data['GC Content'] = gc_content
        genome_overview_data['Warnings'] = "\n".join(warnings)
        genome_overview_data.update(feature_counts)

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in genome_overview_data.iteritems():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        feature_content = str([[str(k), v] for k, v in
                               genome_data.get('feature_counts', {}).items()
                               if k != 'gene'])
        contig_content = str([[str(c), l] for c, l in
                              zip(genome_data.get('contig_ids', []),
                                  genome_data.get('contig_lengths', []))])
        with open(result_file_path, 'w') as result_file:
            with open(os.path.join(os.path.dirname(__file__), 'report_template_genome.html'),
                      'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace('<p>Overview_Content</p>',
                                                          overview_content)
                report_template = report_template.replace('*FEATURE_DATA*',
                                                          feature_content)
                report_template = report_template.replace('*CONTIG_DATA*',
                                                          contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({'file_path': self.scratch,
                                                  'pack': 'zip'})['shock_id']

        html_report.append({'shock_id': report_shock_id,
                            'name': os.path.basename(result_file_path),
                            'label': os.path.basename(result_file_path),
                            'description': 'HTML summary report for imported Genome'})
        return html_report

    def generate_report(self, genome_ref, params):
        """
        :param genome_ref: returned reference from GenomeFileUtil for the uploaded
                           genome; used to get report warnings and messages
        :return: report_output dict with 'report_name' and 'report_ref'
        """
        uuid_string = str(uuid.uuid4())

        objects_created = [{'ref': genome_ref,
                            'description': 'Imported Genome'}]

        output_html_files = self.generate_html_report(genome_ref, params)
        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 300,
            'report_object_name': 'kb_genome_upload_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
Beispiel #16
0
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = os.path.join(config['scratch'],
                                    'import_assembly_' + str(uuid.uuid4()))
        handler_utils._mkdir_p(self.scratch)
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''
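        # A minimal sketch of the expected params (hypothetical values only):
        # params = {
        #     'staging_file_subdir_path': 'subdir_1/subdir_2/my_assembly.fa',
        #     'assembly_name': 'my_assembly',
        #     'workspace_name': 'my_workspace'
        # }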
        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')
        file = {'path': scratch_file_path}
        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)
        """
        Update the workspace-object-related metadata for the staged file
        """
        self.uploader_utils.update_staging_service(
            params.get('staging_file_subdir_path'), ref)

        returnVal = {'obj_ref': ref}
        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method
        """
        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_html_report(self, assembly_ref, assembly_object, params):
        """
        generate_html_report: generate HTML summary report
        """
        log('start generating html report')
        html_report = list()

        assembly_data = assembly_object.get('data')[0].get('data')
        assembly_info = assembly_object.get('data')[0].get('info')

        result_file_path = os.path.join(self.scratch, 'report.html')

        assembly_name = str(assembly_info[1])
        assembly_file = params.get('staging_file_subdir_path')

        dna_size = assembly_data.get('dna_size')
        num_contigs = assembly_data.get('num_contigs')

        assembly_overview_data = collections.OrderedDict()

        assembly_overview_data['Name'] = '{} ({})'.format(
            assembly_name, assembly_ref)
        assembly_overview_data['Uploaded File'] = assembly_file
        assembly_overview_data['Date Uploaded'] = time.strftime("%c")
        assembly_overview_data['DNA Size'] = dna_size
        assembly_overview_data['Number of Contigs'] = num_contigs

        overview_content = ''
        overview_content += '<br/><table>\n'
        for key, val in assembly_overview_data.iteritems():
            overview_content += '<tr><td><b>{}</b></td>'.format(key)
            overview_content += '<td>{}</td>'.format(val)
            overview_content += '</tr>\n'
        overview_content += '</table>'

        contig_data = assembly_data.get('contigs').values()
        contig_content = str([[str(e['contig_id']), e['length']]
                              for e in contig_data])

        with open(result_file_path, 'w') as result_file:
            with open(
                    os.path.join(os.path.dirname(__file__),
                                 'report_template_assembly.html'),
                    'r') as report_template_file:
                report_template = report_template_file.read()
                report_template = report_template.replace(
                    '<p>*Overview_Content*</p>', overview_content)
                report_template = report_template.replace(
                    '*CONTIG_DATA*', contig_content)
                result_file.write(report_template)

        report_shock_id = self.dfu.file_to_shock({
            'file_path': self.scratch,
            'pack': 'zip'
        })['shock_id']

        html_report.append({
            'shock_id':
            report_shock_id,
            'name':
            os.path.basename(result_file_path),
            'label':
            os.path.basename(result_file_path),
            'description':
            'HTML summary report for Imported Assembly'
        })
        return html_report

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to
        
        """
        uuid_string = str(uuid.uuid4())

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}
        object_data = self.dfu.get_objects(get_objects_params)
        objects_created = [{
            'ref': obj_ref,
            'description': 'Imported Assembly'
        }]

        output_html_files = self.generate_html_report(obj_ref, object_data,
                                                      params)

        report_params = {
            'message': '',
            'workspace_name': params.get('workspace_name'),
            'objects_created': objects_created,
            'html_links': output_html_files,
            'direct_html_link_index': 0,
            'html_window_height': 270,
            'report_object_name': 'kb_upload_assembly_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
Beispiel #17
0
class kb_fastqc:
    '''
    Module Name:
    kb_fastqc

    Module Description:
    A KBase module: kb_fastqc
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "1.0.4"
    GIT_URL = "https://github.com/Tianhao-Gu/kb_fastqc.git"
    GIT_COMMIT_HASH = "3f552db07e04f4b01eec0b38ec49546a2335d87e"

    #BEGIN_CLASS_HEADER

    def _get_input_file_ref_from_params(self, params):
        if 'input_file_ref' in params:
            return params['input_file_ref']
        else:
            if 'input_ws' not in params and 'input_file' not in params:
                raise ValueError('Either the "input_file_ref" field or the ' +
                                 '"input_ws" with "input_file" fields ' +
                                 'must be set.')
            return str(params['input_ws']) + '/' + str(params['input_file'])

    def create_report(self, token, ws, uuid_string, read_file_path):
        output_html_files = list()
        output_zip_files = list()
        first_file = ""
        html_string = ""
        html_count = 0
        with open('/kb/data/index_start.txt', 'r') as start_file:
            html_string = start_file.read()

        # Make HTML folder
        html_folder = os.path.join(read_file_path, 'html')
        os.mkdir(html_folder)
        for file in os.listdir(read_file_path):
            label = ".".join(file.split(".")[1:])
            if (file.endswith(".zip")):
                desc = 'Zip file generated by fastqc that contains ' + \
                       'original images seen in the report'
                output_zip_files.append({
                    'path':
                    os.path.join(read_file_path, file),
                    'name':
                    file,
                    'label':
                    label,
                    'description':
                    desc
                })
            if (file.endswith(".html")):
                # Move html into html folder
                shutil.move(os.path.join(read_file_path, file),
                            os.path.join(html_folder, file))

                if (first_file == ""):
                    first_file = file

                html_string+="            <button data-button=\"page "+str(html_count) + \
                             "\" data-page=\""+file+"\">Page "+str(html_count+1)+"</button>\n"
                html_count += 1

        html_string += "        </div>    </div>    <div id=\"body\">\n"
        html_string += "        <iframe id=\"content\" "
        html_string += "style=\"width: 100%; border: none; \" src=\"" + first_file + "\"></iframe>\n    </div>"

        with open('/kb/data/index_end.txt', 'r') as end_file:
            html_string += end_file.read()

        with open(os.path.join(html_folder, "index.html"), 'w') as index_file:
            index_file.write(html_string)

        shock = self.dfu.file_to_shock({
            'file_path': html_folder,
            'make_handle': 0,
            'pack': 'zip'
        })
        desc = 'HTML files generated by fastqc that contains report on ' + \
               'quality of reads'
        output_html_files.append({
            'shock_id': shock['shock_id'],
            'name': 'index.html',
            'label': 'html files',
            'description': desc
        })

        report_params = {
            'direct_html_link_index': 0,
            'file_links': output_zip_files,
            'html_links': output_html_files,
            'workspace_name': ws,
            'report_object_name': 'kb_fastqc_report_' + uuid_string
        }
        kbase_report_client = KBaseReport(self.callback_url, token=token)
        output = kbase_report_client.create_extended_report(report_params)
        return output

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.workspaceURL = config['workspace-url']
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def runFastQC(self, ctx, input_params):
        """
        :param input_params: instance of type "FastQCParams" -> structure:
           parameter "input_ws" of String, parameter "input_file" of String,
           parameter "input_file_ref" of String
        :returns: instance of type "FastQCOutput" -> structure: parameter
           "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: reported_output
        #BEGIN runFastQC

        token = ctx['token']
        wsClient = workspaceService(self.workspaceURL, token=token)
        uuid_string = str(uuid.uuid4())
        read_file_path = self.scratch + "/" + uuid_string
        os.mkdir(read_file_path)

        input_file_ref = self._get_input_file_ref_from_params(input_params)

        library = None
        try:
            library = wsClient.get_objects2(
                {'objects': [{
                    'ref': input_file_ref
                }]})['data'][0]
        except Exception as e:
            raise ValueError(
                'Unable to get read library object from workspace: (' +
                input_file_ref + ')' + str(e))

        download_read_params = {'read_libraries': [], 'interleaved': "false"}
        if ("SingleEnd" in library['info'][2]
                or "PairedEnd" in library['info'][2]):
            download_read_params['read_libraries'].append(library['info'][7] +
                                                          "/" +
                                                          library['info'][1])
        elif ("SampleSet" in library['info'][2]):
            for sample_id in library['data']['sample_ids']:
                if ("/" in sample_id):
                    download_read_params['read_libraries'].append(sample_id)
                else:
                    if (sample_id.isdigit()):
                        download_read_params['read_libraries'].append(
                            library['info'][6] + "/" + sample_id)
                    else:
                        download_read_params['read_libraries'].append(
                            library['info'][7] + "/" + sample_id)

        ru = ReadsUtils(os.environ['SDK_CALLBACK_URL'])
        ret = ru.download_reads(download_read_params)

        read_file_list = list()
        for file in ret['files']:

            obj_info = self.dfu.get_objects({'object_refs':
                                             [file]})['data'][0]['info']
            obj_name = obj_info[1]
            obj_ref_suffix = '_' + str(obj_info[6]) + '_' + str(
                obj_info[0]) + '_' + str(obj_info[4])

            files = ret['files'][file]['files']

            fwd_name = files['fwd'].split('/')[-1]
            fwd_name = fwd_name.replace('.gz', '')
            # using object_name + ref_suffix + suffix as file name
            fwd_name = obj_name + obj_ref_suffix + '.' + fwd_name.split(
                '.', 1)[-1]
            shutil.move(files['fwd'], os.path.join(read_file_path, fwd_name))
            read_file_list.append(os.path.join(read_file_path, fwd_name))

            if (files['rev'] is not None):
                rev_name = files['rev'].split('/')[-1]
                rev_name = rev_name.replace('.gz', '')
                rev_name = obj_name + obj_ref_suffix + '.' + rev_name.split(
                    '.', 1)[-1]
                shutil.move(files['rev'], os.path.join(read_file_path,
                                                       rev_name))
                read_file_list.append(os.path.join(read_file_path, rev_name))

        subprocess.check_output(["fastqc"] + read_file_list)
        # report = "Command run: "+" ".join(["fastqc"]+read_file_list)

        output = self.create_report(token, input_params['input_ws'],
                                    uuid_string, read_file_path)
        reported_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        # Remove temp reads directory
        shutil.rmtree(read_file_path, ignore_errors=True)

        #END runFastQC

        # At some point might do deeper type checking...
        if not isinstance(reported_output, dict):
            raise ValueError('Method runFastQC return value ' +
                             'reported_output is not type dict as required.')
        # return the results
        return [reported_output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class ImportAssemblyUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)

    def import_fasta_as_assembly_from_staging(self, params):
        '''
          import_fasta_as_assembly_from_staging: wrapper method for
                                    AssemblyUtil.save_assembly_from_fasta

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          assembly_name - output Assembly file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportAssemblyUtil.import_fasta_as_assembly_from_staging\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_fasta_as_assembly_from_staging(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
            download_staging_file_params).get('copy_file_path')

        file = {'path': scratch_file_path}

        import_assembly_params = params
        import_assembly_params['file'] = file

        ref = self.au.save_assembly_from_fasta(import_assembly_params)

        returnVal = {'obj_ref': ref}

        return returnVal

    def validate_import_fasta_as_assembly_from_staging(self, params):
        """
        validate_import_fasta_as_assembly_from_staging:
                    validates params passed to import_fasta_as_assembly_from_staging method

        """

        # check for required parameters
        for p in [
                'staging_file_subdir_path', 'workspace_name', 'assembly_name'
        ]:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                         import_fasta_as_assembly_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that reads will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {'object_refs': [obj_ref], 'ignore_errors': False}

        object_data = self.dfu.get_objects(get_objects_params)
        base_count = object_data.get('data')[0].get('data').get('base_counts')
        dna_size = object_data.get('data')[0].get('data').get('dna_size')

        upload_message += "Assembly Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported Fasta File: {}\n'.format(
            params.get('staging_file_subdir_path'))

        if isinstance(dna_size, (int, long)):
            upload_message += 'DNA Size: {:,}\n'.format(dna_size)

        if isinstance(base_count, dict):
            upload_message += 'Base Count:\n{}\n'.format(
                json.dumps(base_count, indent=1)[2:-2])

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'kb_upload_methods_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output
Beispiel #19
0
    def find_motifs(self, ctx, params):
        """
        :param params: instance of type "find_motifs_params" (Genome is a
           KBase genome Featureset is a KBase featureset Promoter_length is
           the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "fastapath" of
           String, parameter "motif_min_length" of Long, parameter
           "motif_max_length" of Long
        :returns: instance of type "extract_output_params" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN find_motifs

        #TODO: Things to fix in here...
        #      Use MotifUtils to parse output and create object
        #      create new function for report ?

        if 'motif_min_length' not in params:
            params['motif_min_length'] = 8
        if 'motif_max_length' not in params:
            params['motif_max_length'] = 16
        motMin = params['motif_min_length']
        motMax = params['motif_max_length']

        #promoterFastaFilePath = self.get_promoter_for_gene(ctx,params)[0]
        promoterFastaFilePath = params['fastapath']

        MEMEMotifCommand = MEU.build_meme_command(promoterFastaFilePath)
        MEU.run_meme_command(MEMEMotifCommand)
        meme_out_path = '/kb/module/work/tmp/meme_out/meme.txt'
        meme_params = {
            'ws_name': params['workspace_name'],
            'path': meme_out_path,
            'obj_name': params['obj_name']
        }
        MOU = MotifUtils(self.callback_url)
        dfu = DataFileUtil(self.callback_url)
        locDict = {}
        if 'SS_ref' in params:
            get_ss_params = {'object_refs': [params['SS_ref']]}
            SS = dfu.get_objects(get_ss_params)['data'][0]['data']
            for s in SS['sequences']:
                if s['source'] is not None:
                    locDict['sequence_id'] = {
                        'contig': s['source']['location'][0][0],
                        'start': str(s['source']['location'][0][1])
                    }
        if len(locDict.keys()) > 0:
            meme_params['absolute_locations'] = locDict
        meme_params['min_len'] = motMin
        meme_params['max_len'] = motMax
        obj_ref = MOU.UploadFromMEME(meme_params)['obj_ref']
        #memeMotifList = MEU.parse_meme_output()

        #HERE:
        #we've got object ref
        #we've got html building functions
        #build report, setup return,
        #make report and return it

        #buildReportFromMotifSet()

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        timestamp = str(timestamp)
        htmlDir = self.shared_folder + '/html' + timestamp
        os.mkdir(htmlDir)
        #lineCount = 0
        #with open(promoterFastaFilePath,'r') as pFile:
        #    for line in pFile:
        #        lineCount += 1
        #numFeat = lineCount/2
        #with open(promoterFastaFilePath,'r') as pFile:
        #    fileStr = pFile.read()
        #promHtmlStr = '<html><body> '  + fileStr + ' </body></html>'
        #with open(htmlDir + '/promoters.html','w') as promHTML:
        #    promHTML.write(promHtmlStr)
        #JsonPath = '/kb/module/work/tmp'

        dfu = DataFileUtil(self.callback_url)
        get_obj_params = {'object_refs': [obj_ref]}
        memeMotifSet = dfu.get_objects(get_obj_params)['data'][0]['data']
        MakeReport(htmlDir, memeMotifSet)
        #buildReportFromMotifSet(memeMotifSet,htmlDir,'meme')

        #TODO: Here replace the makereport with a call to motifset utils
        #subprocess.call(['python','/kb/module/lib/identify_promoter/Utils/makeReport.py',JsonPath + '/meme_out/meme.json',htmlDir + '/meme.html',str(numFeat)])
        #fullMotifList = []
        #for m in memeMotifList:
        #    fullMotifList.append(m)

        #What needs to happen here:
        #call makeLogo for each of the json outputs(capture these from somewhere)

        #plt.rcParams['figure.dpi'] = 300

        #htmlFiles = ['index.html','gibbs.html','homer.html']
        #shockParamsList = []
        #for f in htmlFiles:
        #    shockParamsList.append({'file_path': htmlDir + f ,'make_handle': 0, 'pack': 'zip'})

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': htmlDir,
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as e:
            raise ValueError('error uploading HTML file to shock: ' + str(e))

        #Create motif set object from MotifList
        #TODO set parameters correctly
        #add narrative support to set
        #MSO = {}
        #MSO['Condition'] = 'Temp'
        #MSO['FeatureSet_ref'] = '123'
        #MSO['Motifs'] = []
        #MSO['Alphabet'] = ['A','C','G','T']
        #MSO['Background'] = {}
        #for letter in MSO['Alphabet']:
        #    MSO['Background'][letter] = 0.0

        #MSU.parseMotifList(fullMotifList,MSO)
        #objname = 'MotifSet' + str(int((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()*1000))

        #Pass motif set into this
        #save_objects_params = {}
        #save_objects_params['id'] = self.ws_info[0]
        #save_objects_params['id'] = long(params['workspace_name'].split('_')[1])
        #save_objects_params['id'] = dfu.ws_name_to_id(params['workspace_name'])
        #save_objects_params['objects'] = [{'type': 'KBaseGwasData.MotifSet' , 'data' : MSO , 'name' : objname}]

        #info = dfu.save_objects(save_objects_params)[0]
        #motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        #object_upload_ret = dfu.file_to_shock()

        reportName = 'MEMEMotifFinder_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [{
                'ref': obj_ref,
                'description': 'Motif Set generated by MEME'
            }],
            'message':
            '',
            'direct_html':
            None,
            'direct_html_link_index':
            0,
            'file_links': [],
            'html_links': [],
            'html_window_height':
            220,
            'workspace_name':
            params['workspace_name'],
            'report_object_name':
            reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            #'name': 'promoter_download.zip',
            'name': 'index.html',
            'label': 'Save promoter_download.zip'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }

        #END find_motifs

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method find_motifs return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
class ProkkaAnnotationTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        config_file = environ.get("KB_DEPLOYMENT_CONFIG", None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items("kb_prokka"):
            cls.cfg[nameval[0]] = nameval[1]
        # Token validation
        token = environ.get("KB_AUTH_TOKEN", None)
        authServiceUrl = cls.cfg.get(
            "auth-service-url",
            "https://kbase.us/services/authorization/Sessions/Login")
        auth_client = _KBaseAuth(authServiceUrl)
        user_id = auth_client.get_user(token)
        # WARNING: don"t call any logging methods on the context object,
        # it"ll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({
            "token":
            token,
            "user_id":
            user_id,
            "provenance": [{
                "service": "ProkkaAnnotation",
                "method": "please_never_use_it_in_production",
                "method_params": []
            }],
            "authenticated":
            1
        })
        cls.wsURL = cls.cfg["workspace-url"]
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_prokka(cls.cfg)

    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, "wsName"):
            cls.wsClient.delete_workspace({"workspace": cls.wsName})
            print("Test workspace was deleted")

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        if hasattr(self.__class__, "wsName"):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_ProkkaAnnotation_" + str(suffix)
        ret = self.getWsClient().create_workspace({"workspace":
                                                   wsName})  # noqa
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx

#  def testGenomeOntologyEventsField(self):
#      test with ontology events
#      test without

    def Xtest_modify_old_genome(self):
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.gfu = GenomeFileUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        old_genome = "30045/15/1"

        new_genome = "30045/14/1"
        genome_name = 'OldRhodo'
        genome_data_old = self.dfu.get_objects({"object_refs":
                                                [old_genome]})["data"][0]
        genome_data_new = self.dfu.get_objects({"object_refs":
                                                [new_genome]})["data"][0]

        sso_1 = {
            "id": "1",
            "evidence": [],
            "term_name": "1",
            "ontology_ref": "1",
            "term_lineage": []
        }

        sso_2 = {
            "id": "2",
            "evidence": [],
            "term_name": "2",
            "ontology_ref": "2",
            "term_lineage": []
        }

        sso_terms = {'SSO1': sso_1, 'SSO2': sso_2}

        print("ABOUT TO MODIFY OLD GENOME")
        for i, item in enumerate(genome_data_old['data']['features']):
            genome_data_old['data']['features'][i]['ontology_terms'] = {
                "SSO": sso_terms
            }

        print("ABOUT TO MODIFY NEW GENOME")
        for i, item in enumerate(genome_data_new['data']['features']):
            genome_data_new['data']['features'][i]['ontology_terms'] = {
                "SSO": sso_terms
            }

        print("ABOUT TO SAVE OLD GENOME")
        info = self.gfu.save_one_genome({
            "workspace": self.getWsName(),
            "name": genome_name,
            "data": genome_data_old["data"],
            "provenance": self.ctx.provenance()
        })["info"]

        print("ABOUT TO SAVE NEW GENOME")
        info = self.gfu.save_one_genome({
            "workspace": self.getWsName(),
            "name": genome_name,
            "data": genome_data_new["data"],
            "provenance": self.ctx.provenance()
        })["info"]

    def test_reannotate_RICKETS(self):
        genome_ref = '31932/5/1'
        genome_ref = '32038/3/2'
        genome_ref = '32132/5/1'
        genome_name = 'Aceti'
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)

        result = self.getImpl().annotate(
            self.getContext(), {
                "object_ref": genome_ref,
                "output_workspace": self.getWsName(),
                "output_genome_name": genome_name,
                "evalue": None,
                "fast": 0,
                "gcode": 0,
                "genus": "genus",
                "kingdom": "Bacteria",
                "metagenome": 0,
                "mincontiglen": 1,
                "norrna": 0,
                "notrna": 0,
                "rawproduct": 0,
                "rfam": 1,
                "scientific_name": "RhodoBacter"
            })[0]

        genome_data = self.dfu.get_objects(
            {"object_refs": [result['output_genome_ref']]})["data"][0]['data']
        scratch = "/kb/module/work/tmp/"

        with open(scratch + 'OUTPUT_GENOME.txt', 'w+') as outfile:
            json.dump(genome_data, outfile)

    def Xtest_reannotate_new_genome(self):
        genome_ref = '30045/14/1'
        genome_name = 'NewRhodo'

        result = self.getImpl().annotate(
            self.getContext(), {
                "object_ref": genome_ref,
                "output_workspace": self.getWsName(),
                "output_genome_name": genome_name,
                "evalue": None,
                "fast": 0,
                "gcode": 0,
                "genus": "genus",
                "kingdom": "Bacteria",
                "metagenome": 0,
                "mincontiglen": 1,
                "norrna": 0,
                "notrna": 0,
                "rawproduct": 0,
                "rfam": 1,
                "scientific_name": "RhodoBacter"
            })[0]

    def Xtest_reannotate_old_genome(self):
        genome_ref = '30045/15/1'
        genome_name = 'OldRhodo'

        result = self.getImpl().annotate(
            self.getContext(), {
                "object_ref": genome_ref,
                "output_workspace": self.getWsName(),
                "output_genome_name": genome_name,
                "evalue": None,
                "fast": 0,
                "gcode": 0,
                "genus": "genus",
                "kingdom": "Bacteria",
                "metagenome": 0,
                "mincontiglen": 1,
                "norrna": 0,
                "notrna": 0,
                "rawproduct": 0,
                "rfam": 1,
                "scientific_name": "RhodoBacter"
            })[0]
Beispiel #21
0
class kb_plant_rast:
    '''
    Module Name:
    kb_plant_rast

    Module Description:
    A KBase module: kb_plant_rast
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/kb_plant_rast"
    GIT_COMMIT_HASH = "a652c0120abf90e97d0f0214f8ed4174f27b9a09"
    
    #BEGIN_CLASS_HEADER
    KMER_THRESHOLD = 1
    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.workspaceURL = config['workspace-url']
        self.token = os.environ['KB_AUTH_TOKEN']
        self.scratch = os.path.abspath(config['scratch'])
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        #END_CONSTRUCTOR
        pass

    def annotate_plant_transcripts(self, ctx, input):
        """
        :param input: instance of type "AnnotatePlantTranscriptsParams" ->
           structure: parameter "input_ws" of String, parameter
           "input_genome" of String, parameter "output_genome" of String
        :returns: instance of type "AnnotatePlantTranscriptsResults" ->
           structure: parameter "report_name" of String, parameter
           "report_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN annotate_plant_transcripts
        
        # Retrieve plant genome
        plant_genome = self.dfu.get_objects({'object_refs': [input['input_ws']+'/'+input['input_genome']]})['data'][0]

        # Force upgrade
        if("feature_counts" in plant_genome['data']):
            del(plant_genome['data']['feature_counts'])

        use_cds=1
        features = plant_genome['data']['cdss']
        if(len(features)==0):
            features = plant_genome['data']['features']
            use_cds=0
            if(len(features)==0):
                raise Exception("The genome does not contain any CDSs or features!")

        output = {'ftrs': len(features)}

        # Retrieve kmers
        Functions = set()
        Kmers_Functions = dict()
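        # Each line of functions_kmers.txt is assumed to hold a function name and
        # its signature kmers, tab-separated, with the kmers joined by ', '.
        # A hypothetical line:
        # "Pyruvate kinase (EC 2.7.1.40)\tAAAAAAAC, AAAAAAGT, AAAAACGT"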
        for line in open('/data/functions_kmers.txt'):
            line=line.strip()
            (function_string,kmers_string)=line.split('\t')
            Functions.add(function_string)
            for kmer in kmers_string.split(', '):
                Kmers_Functions[kmer]=function_string
        output['fns']=len(Functions)
        output['kmers']=len(Kmers_Functions)

        Kmer_Length=8
        Hit_Proteins=dict()
        Hit_Kmers=set()
        output['short']=0
        for ftr in features:
            if('protein_translation' not in ftr):
                output['short']+=1
                continue

            Seq = ftr['protein_translation']
            SeqLen = len(Seq)
            if(SeqLen < 10):
                output['short']+=1
                continue
            seq_kmers = [Seq[i:i + Kmer_Length] for i in range(SeqLen-Kmer_Length+1)]
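            # e.g. for a hypothetical Seq = "MKTAYIAKQR" (length 10) and
            # Kmer_Length = 8, seq_kmers == ["MKTAYIAK", "KTAYIAKQ", "TAYIAKQR"]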
            for kmer in seq_kmers:
                if(kmer in Kmers_Functions):
                    if(ftr['id'] not in Hit_Proteins):
                        Hit_Proteins[ftr['id']]=dict()
                    if(Kmers_Functions[kmer] not in Hit_Proteins[ftr['id']]):
                        Hit_Proteins[ftr['id']][Kmers_Functions[kmer]]=0
                    Hit_Proteins[ftr['id']][Kmers_Functions[kmer]]+=1
                    Hit_Kmers.add(kmer)
        output['hit_kmers']=len(Hit_Kmers)

        #Eliminate hits that have a small number of kmers
        #Each function must have more than 1 kmer in order to be assigned
        Deleted_Proteins = set()
        output['few']=0
        for ftr in Hit_Proteins.keys():
            Deleted_Functions = set()
            for function in Hit_Proteins[ftr].keys():
                N_Kmers = Hit_Proteins[ftr][function]
                if(N_Kmers <= self.KMER_THRESHOLD):
                    Deleted_Functions.add(function)
                
            for function in Deleted_Functions:
                del(Hit_Proteins[ftr][function])
                
            if(len(Hit_Proteins[ftr])==0):
                output['few']+=1
                Deleted_Proteins.add(ftr)

        #Scan for multi-functional hits
        #If a function has more hits than the others, it takes precedence
        #If more than one function has an equal number of hits, the feature is removed
        output['ambiguous']=0
        for ftr in Hit_Proteins:
            if(len(Hit_Proteins[ftr])==1):
                continue

            if(ftr in Deleted_Proteins):
                continue

            Top_Hit_Functions=dict()
            for function in Hit_Proteins[ftr].keys():
                if(Hit_Proteins[ftr][function] not in Top_Hit_Functions):
                    Top_Hit_Functions[Hit_Proteins[ftr][function]]=dict()
                Top_Hit_Functions[Hit_Proteins[ftr][function]][function]=1

            Top_Number = (sorted(Top_Hit_Functions.keys(),reverse=True))[0]
            if(len(Top_Hit_Functions[Top_Number].keys())>1):
                output['ambiguous']+=1
                Deleted_Proteins.add(ftr)
            else:
                Top_Function = Top_Hit_Functions[Top_Number].keys()[0]
                Hit_Proteins[ftr]={Top_Function:Top_Number}
            
        #remove the egregious proteins
        for ftr in Deleted_Proteins:
            del(Hit_Proteins[ftr])

        #count functions
        Hit_Functions=set()
        for ftr in Hit_Proteins.keys():
            Hit_Functions.add(Hit_Proteins[ftr].keys()[0])

        output['hit_ftrs']=len(Hit_Proteins)
        output['hit_fns']=len(Hit_Functions)

        #Now, re-populate feature functions, and save genome object
        #But, if annotating CDS, need to be able to retrieve parent feature
        parent_feature_index = dict()
        if(use_cds==1):
            parent_feature_index = dict([(f['id'], i) for i, f in enumerate(plant_genome['data']['features'])])

        for ftr in features:
            if(ftr['id'] in Hit_Proteins):
                new_function = Hit_Proteins[ftr['id']].keys()[0]
                ftr['function'] = new_function
                if(use_cds==1):
                    plant_genome['data']['features'][parent_feature_index[ftr['parent_gene']]]['function']=new_function
        
        if('output_genome' not in input):
            input['output_genome']=input['input_genome']

        save_result = self.gfu.save_one_genome({'workspace' : input['input_ws'],
                                                'name' : input['output_genome'],
                                                'data' : plant_genome['data'],
                                                'upgrade' : 1})

        html_string="<html><head><title>KBase Plant Rast Report</title></head><body>"
        html_string+="<p>The Plant Rast app has finished running. "
        html_string+=str(output['ftrs'])+" protein sequences were scanned for "+str(output['kmers'])+" signature kmers.</p>"
        html_string+="<p>The app found "+str(output['hit_kmers'])+" signature kmers and was able to predict "
        html_string+=str(output['hit_fns'])+" enzymatic functions for "+str(output['hit_ftrs'])+" protein sequences.</p>"
#        html_string+="<p>During the annotation process, "+str(output['short'])+" features "
#        html_string+="were ignored because they were too short (<10 AA in length). "
#        html_string+=str(output['few'])+" features were ignored because they were hit by fewer than 2 kmers, and "
#        html_string+=str(output['ambiguous'])+" features were ignored because they were too ambiguous "
#        html_string+="(connected to multiple distinct metabolic functions).</p>"
        fraction_plantseed = float( (float(output['hit_fns']) / float(output['fns'])) * 100.0 )
        html_string+="<p>This result indicates that, for this set of protein sequences, the app detected {0:.0f}%".format(fraction_plantseed)
        html_string+=" of the enzymatic functions of plant primary metabolism that were curated as part of the PlantSEED project.</p></body></html>"

        saved_genome = "{}/{}/{}".format(save_result['info'][6],save_result['info'][0],save_result['info'][4])
        description = "Plant genome "+plant_genome['data']['id']+" annotated with metabolic functions"
        uuid_string = str(uuid.uuid4())
        report_params = { 'objects_created' : \
                          [{"ref":saved_genome,"description":description}],
                          'direct_html' : html_string,
                          'workspace_name' : input['input_ws'],
                          'report_object_name' : 'kb_plant_rast_report_' + uuid_string }
        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        report_client_output = kbase_report_client.create_extended_report(report_params)
        output['report_name']=report_client_output['name']
        output['report_ref']=report_client_output['ref']

        #END annotate_plant_transcripts

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method annotate_plant_transcripts return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {'state': "OK",
                     'message': "",
                     'version': self.VERSION,
                     'git_url': self.GIT_URL,
                     'git_commit_hash': self.GIT_COMMIT_HASH}
        #END_STATUS
        return [returnVal]
    def assembly_metadata_report(self, ctx, params):
        """
        :param params: instance of type "AssemblyMetadataReportParams" ->
           structure: parameter "assembly_input_ref" of type "assembly_ref",
           parameter "workspace_name" of String, parameter "showContigs" of
           type "boolean" (A boolean. 0 = false, other = true.)
        :returns: instance of type "AssemblyMetadataResults" -> structure:
           parameter "report_name" of String, parameter "report_ref" of String
        """

        # ctx is the context object
        # return variables are: output
        #BEGIN assembly_metadata_report

        token = ctx['token']
        uuid_string = str(uuid.uuid4())
        write_file_path = self.scratch + "/" + uuid_string
        # create the per-run output directory before any files are written into it
        os.mkdir(write_file_path)

        # Print statements to stdout/stderr are captured and available as the App log
        print('Starting Assembly MetaData Report Function. Params=')
        pprint(params)

        # Step 1 - Parse/examine the parameters and catch any errors
        # It is important to check that parameters exist and are defined, and that nice error
        # messages are returned to users.  Parameter values go through basic validation when
        # defined in a Narrative App, but advanced users or other SDK developers can call
        # this function directly, so validation is still important.
        print('Validating parameters.')
        if 'workspace_name' not in params:
            raise ValueError(
                'Parameter workspace_name is not set in input arguments')
        workspace_name = params['workspace_name']
        if 'assembly_input_ref' not in params:
            raise ValueError(
                'Parameter assembly_input_ref is not set in input arguments')
        assembly_input_ref = params['assembly_input_ref']
        if 'showContigs' not in params:
            raise ValueError(
                'Parameter showContigs is not set in input arguments')
        showContigs_orig = params['showContigs']
        showContigs = None
        try:
            showContigs = int(showContigs_orig)
        except ValueError:
            raise ValueError(
                'Cannot parse integer from showContigs parameter (' +
                str(showContigs_orig) + ')')
        if showContigs < 0:
            raise ValueError('showContigs parameter cannot be negative (' +
                             str(showContigs) + ')')
        if showContigs > 1:
            raise ValueError(
                'showContigs parameter cannot be greater than one (' +
                str(showContigs) + ')')

        # Step 2 - Download the input data as a Fasta and
        # We can use the AssemblyUtils module to download a FASTA file from our Assembly data object.
        # The return object gives us the path to the file that was created.
        print('Downloading Assembly data as a Fasta file.')
        #        assemblyUtil = AssemblyUtil(self.callback_url)
        #        fasta_file = assemblyUtil.get_assembly_as_fasta({'ref': assembly_input_ref})

        # Step 3 - Actually perform the filter operation, saving the good contigs to a new fasta file.
        # We can use BioPython to parse the Fasta file and build and save the output to a file.

        data_file_cli = DataFileUtil(self.callback_url)
        #        assembly_metadata = data_file_cli.get_objects({'object_refs': ['assembly_input_ref']})['data'][0]['data']
        assembly = data_file_cli.get_objects(
            {'object_refs': [assembly_input_ref]})
        assembly_metadata = assembly['data'][0]['data']

        string = "\nAssembly Metadata\n"
        list = [
            'assembly_id', 'dna_size', 'gc_content', 'num_contigs',
            'fasta_handle_ref', 'md5', 'type', 'taxon_ref'
        ]
        for item in list:
            if item in assembly_metadata:
                string += "\t{:20} = {}".format(item,
                                                assembly_metadata[item]) + "\n"

        if 'fasta_handle_info' in assembly_metadata and 'node_file_name' in assembly_metadata[
                'fasta_handle_info']:
            string += "\tfilename             = " + assembly_metadata[
                'fasta_handle_info']['node_file_name'] + "\n"
        string += "BASE counts\n"
        for base in assembly_metadata['base_counts']:
            #            string += "\t" + base + str(assembly_metadata['base_counts'][base]) + "\n"
            string += "\t{:5} = {}".format(
                base, str(assembly_metadata['base_counts'][base])) + "\n"
        string += "\nName\tLength\tGC content\tContigID\tDescription\n"
        if 'contigs' in assembly_metadata:
            myContig = assembly_metadata['contigs']
            for ctg in myContig:
                list = ['length', 'gc_content', 'contig_id', 'description']
                string += ctg
                #                describeDict(myContig[ctg])
                for item in list:
                    if item in myContig[ctg]:
                        string += "\t{}".format(myContig[ctg][item])
                    else:
                        string += "\t"
                string += "\n"

        report_path = os.path.join(write_file_path,
                                   'assembly_metadata_report.txt')
        report_txt = open(report_path, "w")
        report_txt.write(string)
        report_txt.close()
        #        with open('assembly_metadata_report.txt',"w") as report_txt:
        #            report_txt.write(string)
        #        with open('assembly_metadata_report.html',"w") as report_txt:
        #            report_txt.write(string)
        #        output_file = []
        #        output_file.append({'path' : os.path.join(self.shared_folder, 'assembly_metadata_report.txt'),
        #                            'name' : 'assembly_metadata_report.txt',
        #                            'label' : 'AssemblyMetadata.label',
        #                            'description' : 'Text output for the assembly metadata'})
        #        html_file = []
        #        html_file.append({'path' : os.path.join(self.shared_folder, 'assembly_metadata_report.html'),
        #                           'name' : 'assembly_metadata_report.html',
        #                           'label' : 'AssemblyMetadata.label.html',
        #                           'description' : 'Text output for the assembly metadata'})

        print(string)

        # Step 5 - Build a Report and return
        #        report_params = {'message': string,
        #                         'direct_html_link_index': 0,
        #                         'html_links': [html_file],
        #                         'file_links': [output_file],
        #                         'report_object_name': 'assembly_metadata_report_' + str(uuid.uuid4()),
        #                         'workspace_name': params['workspace_name']
        #                        }
        #        reportObj = {
        #            'objects_created': [{'ref': 'assembly_metadata_report_' + str(uuid.uuid4()), 'description': 'AssemblyMetadata'}],
        #            'report_object_name' : 'assembly_metadata_report',
        #            'text_message':  "\n" + string
        #        }
        #        report = KBaseReport(self.callback_url)
        #        report_info = report.create_extended_report({'report': reportObj, 'workspace_name': params['workspace_name']})
        #        report_info = report.create_extended_report(report_params)

        # STEP 6: construct the output to send back
        #        output = {'report_name': 'My_report',
        #        'report_ref': report_info['ref']
        #                   }

        output = self.create_report(token, params['workspace_name'],
                                    uuid_string, write_file_path)

        reported_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        print('returning: ' + pformat(output))
        #END assembly_metadata_report

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method assembly_metadata_report return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
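A minimal, self-contained sketch of the object lookup that the report method above relies on: fetch an Assembly object with DataFileUtil.get_objects and collect a few metadata fields into a report string. The helper name summarize_assembly, the object reference in the commented call, and the installed_clients import path are illustrative assumptions and may differ between SDK versions.
import os

from installed_clients.DataFileUtilClient import DataFileUtil  # import path is an assumption


def summarize_assembly(assembly_ref):
    # Fetch the Assembly object data dict via the SDK callback service.
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
    assembly_metadata = dfu.get_objects(
        {'object_refs': [assembly_ref]})['data'][0]['data']
    lines = ["Assembly Metadata"]
    for key in ('assembly_id', 'dna_size', 'gc_content', 'num_contigs'):
        if key in assembly_metadata:
            lines.append("\t{:12} = {}".format(key, assembly_metadata[key]))
    return "\n".join(lines)


# print(summarize_assembly('12345/6/7'))  # hypothetical object reference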
Beispiel #23
0
class staging_downloader:

    # staging file prefix
    STAGING_GLOBAL_FILE_PREFIX = '/data/bulk/'
    STAGING_USER_FILE_PREFIX = '/staging/'

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _get_staging_file_prefix(self, token_user):
        """
        _get_staging_file_prefix: return the staging area file path prefix

        directory pattern:
            preferred: the user-specific path /staging/
            if that path is not visible to the user, fall back to the global bulk path /data/bulk/user_name/
        """

        if os.path.exists(self.STAGING_USER_FILE_PREFIX):
            return self.STAGING_USER_FILE_PREFIX
        else:
            return os.path.join(self.STAGING_GLOBAL_FILE_PREFIX, token_user)
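
    # Usage sketch for the prefix logic above (values are assumptions, not taken
    # from a real deployment): for token_user 'alice', the method returns
    # '/staging/' when that mount is visible inside the container, otherwise
    # '/data/bulk/alice'; export_to_staging later joins the prefix with the
    # requested destination_dir, e.g. os.path.join(prefix, 'my_exports').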

    def _validate_export_params(self, params):
        """
        validates params passed to export_to_staging
        """

        log('start validating export_to_staging params')

        # check for required parameters
        for p in ['input_ref', 'workspace_name', 'destination_dir']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _generate_export_report(self, file_names, obj_name, workspace_name):
        log('start creating report')

        msg = 'Successfully exported object [{}] to staging area\n\n'.format(
            obj_name)
        msg += 'Exported files:\n' + '\n'.join(file_names)

        report_params = {
            'message': msg,
            'workspace_name': workspace_name,
            'report_object_name': 'staging_exporter_' + str(uuid.uuid4())
        }

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _download_reads(self, reads_ref, reads_name):
        """
        download Reads as FASTQ
        """

        log('start downloading Reads file')

        download_params = {'read_libraries': [reads_ref]}

        download_ret = self.ru.download_reads(download_params)
        files = download_ret['files'][reads_ref]['files']

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        fwd = files['fwd']
        rev = files.get('rev')

        result_zip_name = reads_name + '_' + reads_ref.replace(
            '/', '_') + '.FASTQ.zip'
        result_zip = os.path.join(result_dir, result_zip_name)

        with ZipFile(result_zip, 'w', ZIP_DEFLATED) as zipObj2:
            zipObj2.write(fwd, os.path.basename(fwd))
            if rev:
                zipObj2.write(rev, os.path.basename(rev))

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir

    def _download_assembly(self, assembly_ref, assembly_name):
        """
        download Assembly as FASTA
        """

        log('start downloading Assembly file')

        file_name = assembly_name + '_' + assembly_ref.replace('/',
                                                               '_') + '.fa'

        download_params = {'ref': assembly_ref, 'filename': file_name}
        download_ret = self.au.get_assembly_as_fasta(download_params)

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        shutil.move(download_ret.get('path'), result_dir)

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir

    def _download_alignment(self, alignment_ref, alignment_name,
                            export_alignment):
        """
        download Alignment as BAM or SAM
        """
        log('start downloading Alignment file')

        if not export_alignment:
            log('start downloading BAM as default')
            export_alignment = {'export_alignment_bam': 1}

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        if export_alignment.get('export_alignment_bam'):
            download_params = {
                'source_ref': alignment_ref,
                'downloadBAI': True
            }
            download_ret = self.rau.download_alignment(download_params)

            destination_dir = download_ret.get('destination_dir')

            file_names = os.listdir(destination_dir)
            for filename in file_names:
                new_file_name = alignment_name + '_' + alignment_ref.replace('/', '_') + \
                                '.' + filename.split('.', 1)[1]
                os.rename(os.path.join(destination_dir, filename),
                          os.path.join(destination_dir, new_file_name))

                shutil.copy2(os.path.join(destination_dir, new_file_name),
                             result_dir)

        if export_alignment.get('export_alignment_sam'):
            download_params = {
                'source_ref': alignment_ref,
                'downloadBAI': True,
                'downloadSAM': True
            }
            download_ret = self.rau.download_alignment(download_params)

            destination_dir = download_ret.get('destination_dir')

            file_names = os.listdir(destination_dir)
            for filename in file_names:
                new_file_name = alignment_name + '_' + alignment_ref.replace('/', '_') + \
                                '.' + filename.split('.', 1)[1]
                os.rename(os.path.join(destination_dir, filename),
                          os.path.join(destination_dir, new_file_name))

                shutil.copy2(os.path.join(destination_dir, new_file_name),
                             result_dir)

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir
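
    # Note on the export_alignment argument above (shape inferred from the keys
    # read in this method; treat as an assumption): a dict like
    # {'export_alignment_bam': 1, 'export_alignment_sam': 0}. When it is empty
    # or missing, the method falls back to downloading BAM only.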

    def _download_metagenome(self, metagenome_ref, metagenome_name):
        """
        """
        log("start downloading Annotated Metagenome Assembly files")
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)
        download_ret = self.gfu.metagenome_to_gff(
            {'metagenome_ref': metagenome_ref})
        gff_file = download_ret.get('file_path')
        gff_file_name = os.path.basename(gff_file)
        shutil.move(gff_file, result_dir)

        new_file_name = metagenome_name + '_' + metagenome_ref.replace('/', '_') + \
            '.' + gff_file_name.split('.', 1)[1]

        os.rename(os.path.join(result_dir, gff_file_name),
                  os.path.join(result_dir, new_file_name))

        return result_dir

    def _download_genome(self, genome_ref, genome_name, export_genome):
        """
        download Genome as GENBANK or GFF
        """

        log('start downloading Genome file')

        if not export_genome:
            log('start downloading GENBANK as default')
            export_genome = {'export_genome_genbank': 1}

        # create the output directory and move the file there
        result_dir = os.path.join(self.scratch, str(uuid.uuid4()))
        self._mkdir_p(result_dir)

        if export_genome.get('export_genome_genbank'):
            download_params = {'genome_ref': genome_ref}
            download_ret = self.gfu.genome_to_genbank(download_params)

            genbank_file = download_ret.get('genbank_file').get('file_path')
            genbank_file_name = os.path.basename(genbank_file)
            shutil.move(genbank_file, result_dir)

            new_file_name = genome_name + '_' + genome_ref.replace('/', '_') + \
                '.' + genbank_file_name.split('.', 1)[1]

            os.rename(os.path.join(result_dir, genbank_file_name),
                      os.path.join(result_dir, new_file_name))

        if export_genome.get('export_genome_gff'):
            download_params = {'genome_ref': genome_ref}
            download_ret = self.gfu.genome_to_gff(download_params)

            gff_file = download_ret.get('file_path')
            gff_file_name = os.path.basename(gff_file)
            shutil.move(gff_file, result_dir)

            new_file_name = genome_name + '_' + genome_ref.replace('/', '_') + \
                '.' + gff_file_name.split('.', 1)[1]

            os.rename(os.path.join(result_dir, gff_file_name),
                      os.path.join(result_dir, new_file_name))

        log('downloaded files:\n' + str(os.listdir(result_dir)))

        return result_dir
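
    # Note on the export_genome argument above (shape inferred from the keys read
    # in this method; treat as an assumption): a dict such as
    # {'export_genome_genbank': 1, 'export_genome_gff': 1}. With neither key set,
    # only the GenBank file is downloaded by default.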

    def __init__(self, config):
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)

    def export_to_staging(self, ctx, params):
        """
        export large file associated with workspace object to staging area

        params:
        input_ref: workspace object reference
        workspace_name: workspace name objects to be saved to
        destination_dir: destination directory for downloaded files

        optional:
        generate_report: indicator for generating workspace report. (default False)
        """

        self._validate_export_params(params)

        input_ref = params.get('input_ref')
        workspace_name = params.get('workspace_name')
        destination_dir = params.get('destination_dir')
        generate_report = params.get('generate_report', False)

        obj_source = self.dfu.get_objects({"object_refs":
                                           [input_ref]})['data'][0]

        obj_info = obj_source.get('info')
        obj_type = obj_info[2].split('-')[0]
        obj_name = obj_info[1]

        if obj_type in [
                'KBaseFile.PairedEndLibrary', 'KBaseFile.SingleEndLibrary'
        ]:
            result_dir = self._download_reads(input_ref, obj_name)
        elif obj_type in ['KBaseGenomeAnnotations.Assembly']:
            result_dir = self._download_assembly(input_ref, obj_name)
        elif obj_type in ['KBaseRNASeq.RNASeqAlignment']:
            result_dir = self._download_alignment(
                input_ref, obj_name, params.get('export_alignment'))
        elif obj_type in ['KBaseGenomes.Genome']:
            result_dir = self._download_genome(input_ref, obj_name,
                                               params.get('export_genome'))
        elif obj_type in ['KBaseMetagenomes.AnnotatedMetagenomeAssembly']:
            result_dir = self._download_metagenome(input_ref, obj_name)
        else:
            raise ValueError('Unexpected data type')

        staging_dir_prefix = self._get_staging_file_prefix(ctx['user_id'])
        staging_dir = os.path.join(staging_dir_prefix, destination_dir)
        self._mkdir_p(staging_dir)
        files = os.listdir(result_dir)
        for file in files:
            shutil.copy2(os.path.join(result_dir, file), staging_dir)

        if not (set(os.listdir(staging_dir)) >= set(files)):
            raise ValueError('Unexpected error occurred during copying files')

        returnVal = dict()
        returnVal['result_dir'] = result_dir

        if generate_report:
            report_output = self._generate_export_report(
                files, obj_name, workspace_name)
            returnVal.update(report_output)

        return returnVal
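A hedged usage sketch for the exporter above. The config endpoints, the object reference, the workspace and directory names, and the user id are placeholders; the ctx mapping only needs 'user_id', which export_to_staging uses for the staging-path lookup.
import os

config = {
    'workspace-url': 'https://kbase.us/services/ws',    # placeholder endpoint
    'SDK_CALLBACK_URL': os.environ.get('SDK_CALLBACK_URL', ''),
    'KB_AUTH_TOKEN': os.environ.get('KB_AUTH_TOKEN', ''),
    'scratch': '/kb/module/work/tmp',                    # placeholder scratch dir
}
downloader = staging_downloader(config)
result = downloader.export_to_staging(
    {'user_id': 'some_user'},                            # ctx; only user_id is read
    {'input_ref': '12345/6/7',                           # placeholder object reference
     'workspace_name': 'some_workspace',
     'destination_dir': 'exports',
     'generate_report': True})
print(result.get('result_dir'), result.get('report_name'))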
Beispiel #24
0
class ReadsAlignmentUtils:
    '''
    Module Name:
    ReadsAlignmentUtils

    Module Description:
    A KBase module: ReadsAlignmentUtils

This module is intended for use by Aligners and Assemblers to upload and download alignment files.
The alignment may be uploaded as a sam or bam file. If a sam file is given, it is converted to
the sorted bam format and saved. Upon downloading, optional parameters may be provided to get files
in sam and bai formats from the downloaded bam file. This utility also generates stats from the
stored alignment.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.0.1"
    GIT_URL = "https://github.com/kbaseapps/ReadsAlignmentUtils.git"
    GIT_COMMIT_HASH = "a807d122b097a4c6713a81d5a82eef335835f77a"

    #BEGIN_CLASS_HEADER

    PARAM_IN_FILE = 'file_path'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_CONDITION = 'condition'
    PARAM_IN_READ_LIB_REF = 'read_library_ref'
    PARAM_IN_ASM_GEN_REF = 'assembly_or_genome_ref'

    PARAM_IN_ALIGNED_USING = 'aligned_using'
    PARAM_IN_ALIGNER_VER = 'aligner_version'
    PARAM_IN_ALIGNER_OPTS = 'aligner_opts'
    PARAM_IN_REPLICATE_ID = 'replicate_id'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_BOWTIE2_INDEX = 'bowtie2_index'
    PARAM_IN_SAMPLESET_REF = 'sampleset_ref'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'

    PARAM_IN_DOWNLOAD_SAM = 'downloadSAM'
    PARAM_IN_DOWNLOAD_BAI = 'downloadBAI'
    PARAM_IN_VALIDATE = 'validate'

    INVALID_WS_OBJ_NAME_RE = re.compile('[^\\w\\|._-]')
    INVALID_WS_NAME_RE = re.compile('[^\\w:._-]')

    def _get_file_path_info(self, file_path):
        """
        Given a file path, returns the directory, file name, file base and file extension
        """
        dir, file_name = os.path.split(file_path)
        file_base, file_ext = os.path.splitext(file_name)

        return dir, file_name, file_base, file_ext

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _check_required_param(self, in_params, param_list):
        """
        Checks if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Checks the validity of workspace and object params and returns them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = self.dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _proc_upload_alignment_params(self, ctx, params):
        """
        Checks the presence and validity of upload alignment params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_FILE, self.PARAM_IN_CONDITION,
            self.PARAM_IN_READ_LIB_REF, self.PARAM_IN_ASM_GEN_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        file_path = params.get(self.PARAM_IN_FILE)

        if not (os.path.isfile(file_path)):
            raise ValueError('File does not exist: ' + file_path)

        lib_type = self._get_ws_info(params.get(self.PARAM_IN_READ_LIB_REF))[2]
        if lib_type.startswith('KBaseFile.SingleEndLibrary') or \
           lib_type.startswith('KBaseFile.PairedEndLibrary') or \
           lib_type.startswith('KBaseAssembly.SingleEndLibrary') or \
           lib_type.startswith('KBaseAssembly.PairedEndLibrary'):
            pass
        else:
            raise ValueError(self.PARAM_IN_READ_LIB_REF +
                             ' parameter should be of type' +
                             ' KBaseFile.SingleEndLibrary or' +
                             ' KBaseFile.PairedEndLibrary or' +
                             ' KBaseAssembly.SingleEndLibrary or' +
                             ' KBaseAssembly.PairedEndLibrary')

        obj_type = self._get_ws_info(params.get(self.PARAM_IN_ASM_GEN_REF))[2]
        if obj_type.startswith('KBaseGenomes.Genome') or \
           obj_type.startswith('KBaseGenomeAnnotations.Assembly') or \
           obj_type.startswith('KBaseGenomes.ContigSet'):
            pass
        else:
            raise ValueError(self.PARAM_IN_ASM_GEN_REF +
                             ' parameter should be of type' +
                             ' KBaseGenomes.Genome or' +
                             ' KBaseGenomeAnnotations.Assembly or' +
                             ' KBaseGenomes.ContigSet')
        return ws_name_id, obj_name_id, file_path, lib_type

    def _get_aligner_stats(self, bam_file):
        """
        Gets the aligner stats from BAM file
        """
        path, file = os.path.split(bam_file)
        return self.samtools.get_stats(file, path)

    def _validate(self, params):
        samt = SamTools(self.config, self.__LOGGER)
        if 'ignore' in params:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file,
                                 ipath=path,
                                 ignore=params['ignore'])
        else:
            path, file = os.path.split(params['file_path'])
            rval = samt.validate(ifile=file, ipath=path)

        return rval

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - "
            "%(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass

    def validate_alignment(self, ctx, params):
        """
        :param params: instance of type "ValidateAlignmentParams" (* Input
           parameters for validating a reads alignment. For validation errors
           to ignore, see
           http://broadinstitute.github.io/picard/command-line-overview.html#V
           alidateSamFile) -> structure: parameter "file_path" of String,
           parameter "ignore" of list of String
        :returns: instance of type "ValidateAlignmentOutput" (* Results from
           validate alignment *) -> structure: parameter "validated" of type
           "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN validate_alignment

        rval = self._validate(params)

        if rval == 0:
            returnVal = {'validated': True}
        else:
            returnVal = {'validated': False}

        #END validate_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method validate_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
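
    # Params sketch for validate_alignment above (the file path and error code
    # are illustrative assumptions):
    #   {'file_path': '/kb/module/work/tmp/sample.bam',
    #    'ignore': ['MATE_NOT_FOUND']}   # Picard ValidateSamFile codes to ignore
    # It returns [{'validated': True}] when validation reports no errors.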

    def upload_alignment(self, ctx, params):
        """
        Validates and uploads the reads alignment  *
        :param params: instance of type "UploadAlignmentParams" (* Required
           input parameters for uploading a reads alignment string
           destination_ref -  object reference of alignment destination. The
           object ref is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id
           is the workspace name or id and obj_name_or_id is the object name
           or id file_path              -  File with the path of the sam or
           bam file to be uploaded. If a sam file is provided, it will be
           converted to the sorted bam format before being saved
           read_library_ref       -  workspace object ref of the read sample
           used to make the alignment file condition              -
           assembly_or_genome_ref -  workspace object ref of genome assembly
           or genome object that was used to build the alignment *) ->
           structure: parameter "destination_ref" of String, parameter
           "file_path" of String, parameter "read_library_ref" of String,
           parameter "condition" of String, parameter
           "assembly_or_genome_ref" of String, parameter "aligned_using" of
           String, parameter "aligner_version" of String, parameter
           "aligner_opts" of mapping from String to String, parameter
           "replicate_id" of String, parameter "platform" of String,
           parameter "bowtie2_index" of type "ws_bowtieIndex_id", parameter
           "sampleset_ref" of type "ws_Sampleset_ref", parameter
           "mapped_sample_id" of mapping from String to mapping from String
           to String, parameter "validate" of type "boolean" (A boolean - 0
           for false, 1 for true. @range (0, 1)), parameter "ignore" of list
           of String
        :returns: instance of type "UploadAlignmentOutput" (*  Output from
           uploading a reads alignment  *) -> structure: parameter "obj_ref"
           of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_alignment

        self.__LOGGER.info(
            'Starting upload Reads Alignment, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, file_path, lib_type = self._proc_upload_alignment_params(
            ctx, params)

        dir, file_name, file_base, file_ext = self._get_file_path_info(
            file_path)

        if self.PARAM_IN_VALIDATE in params and params[
                self.PARAM_IN_VALIDATE] is True:
            if self._validate(params) == 1:
                raise Exception('{0} failed validation'.format(file_path))

        bam_file = file_path
        if file_ext.lower() == '.sam':
            bam_file = os.path.join(dir, file_base + '.bam')
            self.samtools.convert_sam_to_sorted_bam(ifile=file_name,
                                                    ipath=dir,
                                                    ofile=bam_file)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': bam_file,
            'make_handle': 1
        })
        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        aligner_stats = self._get_aligner_stats(file_path)
        aligner_data = {
            'file': file_handle,
            'size': file_size,
            'condition': params.get(self.PARAM_IN_CONDITION),
            'read_sample_id': params.get(self.PARAM_IN_READ_LIB_REF),
            'library_type': lib_type,
            'genome_id': params.get(self.PARAM_IN_ASM_GEN_REF),
            'alignment_stats': aligner_stats
        }
        optional_params = [
            self.PARAM_IN_ALIGNED_USING, self.PARAM_IN_ALIGNER_VER,
            self.PARAM_IN_ALIGNER_OPTS, self.PARAM_IN_REPLICATE_ID,
            self.PARAM_IN_PLATFORM, self.PARAM_IN_BOWTIE2_INDEX,
            self.PARAM_IN_SAMPLESET_REF, self.PARAM_IN_MAPPED_SAMPLE_ID
        ]
        for opt_param in optional_params:
            if opt_param in params and params[opt_param] is not None:
                aligner_data[opt_param] = params[opt_param]

        self.__LOGGER.info('=========  Adding extra_provenance_refs')
        self.__LOGGER.info(params.get(self.PARAM_IN_READ_LIB_REF))
        self.__LOGGER.info(params.get(self.PARAM_IN_ASM_GEN_REF))
        self.__LOGGER.info('=======================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqAlignment",
                "data":
                aligner_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs": [
                    params.get(self.PARAM_IN_READ_LIB_REF),
                    params.get(self.PARAM_IN_ASM_GEN_REF)
                ]
            }]
        })[0]
        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        self.__LOGGER.info(returnVal)

        #END upload_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
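
    # Params sketch for upload_alignment above (all refs, names and the file path
    # are placeholder assumptions):
    #   {'destination_ref': 'my_workspace/my_alignment',
    #    'file_path': '/kb/module/work/tmp/accepted_hits.sam',  # SAM is converted to sorted BAM
    #    'read_library_ref': '12345/7/1',
    #    'assembly_or_genome_ref': '12345/8/1',
    #    'condition': 'wild_type',
    #    'validate': 1}
    # The method returns [{'obj_ref': 'ws_id/obj_id/version'}] for the saved
    # KBaseRNASeq.RNASeqAlignment object.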

    def download_alignment(self, ctx, params):
        """
        Downloads alignment files in .bam, .sam and .bai formats. Also downloads alignment stats *
        :param params: instance of type "DownloadAlignmentParams" (* Required
           input parameters for downloading a reads alignment string
           source_ref -  object reference of alignment source. The object ref
           is 'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "downloadSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "downloadBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "DownloadAlignmentOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String, parameter "stats" of type
           "AlignmentStats" -> structure: parameter "properly_paired" of
           Long, parameter "multiple_alignments" of Long, parameter
           "singletons" of Long, parameter "alignment_rate" of Double,
           parameter "unmapped_reads" of Long, parameter "mapped_reads" of
           Long, parameter "total_reads" of Long
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_alignment

        self.__LOGGER.info('Running download_alignment with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        try:
            alignment = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        uuid_str = str(uuid.uuid4())
        output_dir = os.path.join(self.scratch, 'download_' + uuid_str)
        self._mkdir_p(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            alignment[0]['data']['file']['id'],
            'file_path':
            output_dir
        })
        if zipfile.is_zipfile(file_ret.get('file_path')):
            with zipfile.ZipFile(file_ret.get('file_path')) as z:
                z.extractall(output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        bam_files = glob.glob(output_dir + '/*.bam')

        uuid_prefix = uuid_str[:8]
        if len(bam_files) == 0:
            raise ValueError("Alignment object does not contain a bam file")

        for bam_file_path in bam_files:
            dir, file_name, file_base, file_ext = self._get_file_path_info(
                bam_file_path)
            if params.get(self.PARAM_IN_VALIDATE, False):
                validate_params = {'file_path': bam_file_path}
                if self._validate(validate_params) == 1:
                    raise Exception(
                        '{0} failed validation'.format(bam_file_path))

            if params.get('downloadBAI', False):
                bai_file = uuid_prefix + '_' + file_base + '.bai'
                bai_file_path = os.path.join(output_dir, bai_file)
                self.samtools.create_bai_from_bam(ifile=file_name,
                                                  ipath=output_dir,
                                                  ofile=bai_file)
                if not os.path.isfile(bai_file_path):
                    raise ValueError('Error creating {}'.format(bai_file_path))

            if params.get('downloadSAM', False):
                sam_file = uuid_prefix + '_' + file_base + '.sam'
                sam_file_path = os.path.join(output_dir, sam_file)
                self.samtools.convert_bam_to_sam(ifile=file_name,
                                                 ipath=output_dir,
                                                 ofile=sam_file)
                if not os.path.isfile(sam_file_path):
                    raise ValueError('Error creating {}'.format(sam_file_path))

        returnVal = {
            'destination_dir': output_dir,
            'stats': alignment[0]['data']['alignment_stats']
        }

        #END download_alignment

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_alignment return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]
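
    # Params sketch for download_alignment above (the source_ref is a placeholder
    # assumption):
    #   {'source_ref': '12345/9/1', 'downloadBAI': 1, 'downloadSAM': 1}
    # The result points at a scratch directory containing the BAM plus the
    # requested BAI/SAM files, together with the stored alignment_stats.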

    def export_alignment(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download alignments from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting a reads alignment string source_ref - 
           object reference of alignment source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String, parameter
           "exportSAM" of type "boolean" (A boolean - 0 for false, 1 for
           true. @range (0, 1)), parameter "exportBAI" of type "boolean" (A
           boolean - 0 for false, 1 for true. @range (0, 1)), parameter
           "validate" of type "boolean" (A boolean - 0 for false, 1 for true.
           @range (0, 1)), parameter "ignore" of list of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_alignment

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError('{} parameter is required'.format(
                self.PARAM_IN_SRC_REF))

        if params.get(self.PARAM_IN_VALIDATE, False) or \
           params.get('exportBAI', False) or \
           params.get('exportSAM', False):
            """
            Need to validate or convert files. Use download_alignment
            """
            download_params = {}
            for key, val in params.items():
                download_params[key.replace('export', 'download')] = val

            download_retVal = self.download_alignment(ctx, download_params)[0]

            export_dir = download_retVal['destination_dir']

            # package and load to shock
            ret = self.dfu.package_for_download({
                'file_path': export_dir,
                'ws_refs': [inref]
            })
            output = {'shock_id': ret['shock_id']}
        else:
            """
            return shock id from the object
            """
            try:
                alignment = self.dfu.get_objects({'object_refs':
                                                  [inref]})['data']
            except DFUError as e:
                self.__LOGGER.error(
                    'Logging stacktrace from workspace exception:\n' + e.data)
                raise
            output = {'shock_id': alignment[0]['data']['file']['id']}

        #END export_alignment

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_alignment return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
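
    # Params sketch for export_alignment above (source_ref is a placeholder
    # assumption): {'source_ref': '12345/9/1', 'exportSAM': 1}. With any of the
    # validate/exportSAM/exportBAI flags set, it re-packages the files via
    # download_alignment; otherwise it simply returns the stored shock node id.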

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
class ImportMediaUtil:
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url)
        self.fba = fba_tools(self.callback_url)
        self.uploader_utils = UploaderUtil(config)

    def import_media_from_staging(self, params):
        '''
          import_media_from_staging: wrapper method for
                                    FBAFileUtil.tsv_file_to_media
                                    and
                                    FBAFileUtil.excel_file_to_media

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          media_name - output Media file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''
        log('--->\nrunning ImportMediaUtil.import_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')
        file = {
            'path': scratch_file_path
        }
        import_media_params = params
        import_media_params['media_file'] = file

        try:
            ref = self.fba.tsv_file_to_media(import_media_params)
        except:
            try:
                ref = self.fba.excel_file_to_media(import_media_params)
            except:
                raise ValueError('"{}" is not a valid EXCEL nor TSV file'.format(
                                                params.get('staging_file_subdir_path')))
        """
        Update the workspace object related meta-data for staged file
        """
        self.uploader_utils.update_staging_service(params.get('staging_file_subdir_path'), ref.get('ref'))

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal
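
    # Params sketch for import_media_from_staging above (file, media and
    # workspace names are placeholder assumptions):
    #   {'staging_file_subdir_path': 'subdir_1/my_media.tsv',
    #    'media_name': 'my_media',
    #    'workspace_name': 'some_workspace'}
    # TSV import is attempted first and Excel import is used as the fallback.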

    def import_tsv_as_media_from_staging(self, params):
        '''
          import_tsv_as_media_from_staging: wrapper method for
                                    FBAFileUtil.tsv_file_to_media

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          media_name - output Media file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportMediaUtil.import_tsv_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        file = {
            'path': scratch_file_path
        }

        import_media_params = params
        import_media_params['media_file'] = file

        ref = self.fba.tsv_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def import_excel_as_media_from_staging(self, params):
        '''
          import_excel_as_media_from_staging: wrapper method for
                                    FBAFileUtil.excel_file_to_media

          required params:
          staging_file_subdir_path - subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
          media_name - output Media file name
          workspace_name - the name of the workspace it gets saved to.

          return:
          obj_ref: return object reference
        '''

        log('--->\nrunning ImportMediaUtil.import_excel_as_media_from_staging\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self.validate_import_media_from_staging_params(params)

        download_staging_file_params = {
            'staging_file_subdir_path': params.get('staging_file_subdir_path')
        }
        scratch_file_path = self.dfu.download_staging_file(
                        download_staging_file_params).get('copy_file_path')

        file = {
            'path': scratch_file_path
        }

        import_media_params = params
        import_media_params['media_file'] = file

        ref = self.fba.excel_file_to_media(import_media_params)

        returnVal = {'obj_ref': ref.get('ref')}

        return returnVal

    def validate_import_media_from_staging_params(self, params):
        """
        validate_import_media_from_staging_params:
                    validates params passed to import_excel(tsv)_as_media_from_staging method

        """

        # check for required parameters
        for p in ['staging_file_subdir_path', 'workspace_name', 'media_name']:
            if p not in params:
                raise ValueError('"{}" parameter is required, but missing'.format(p))

    def generate_report(self, obj_ref, params):
        """
        generate_report: generate summary report

        obj_ref: generated workspace object references. (return of
                                                        import_excel(tsv)_as_media_from_staging)
        params:
        staging_file_subdir_path: subdirectory file path
          e.g.
            for file: /data/bulk/user_name/file_name
            staging_file_subdir_path is file_name
            for file: /data/bulk/user_name/subdir_1/subdir_2/file_name
            staging_file_subdir_path is subdir_1/subdir_2/file_name
        workspace_name: workspace name/ID that the Media object will be stored to

        """

        uuid_string = str(uuid.uuid4())
        upload_message = 'Import Finished\n'

        get_objects_params = {
            'object_refs': [obj_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        upload_message += "Media Object Name: "
        upload_message += str(object_data.get('data')[0].get('info')[1]) + '\n'
        upload_message += 'Imported File: {}\n'.format(
                              params.get('staging_file_subdir_path'))

        report_params = {
              'message': upload_message,
              'workspace_name': params.get('workspace_name'),
              'report_object_name': 'kb_upload_methods_report_' + uuid_string}

        kbase_report_client = KBaseReport(self.callback_url, token=self.token)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {'report_name': output['name'], 'report_ref': output['ref']}

        return report_output
class RNASeqDownloaderUtils:
    def __init__(self, config):
        log('--->\nInitializing RNASeqDownloaderUtils instance:\n config: %s' %
            config)
        self.scratch = config['scratch']
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.dfu = DataFileUtil(self.callback_url, token=self.token)
        self.rau = ReadsAlignmentUtils(self.callback_url, token=self.token)

    def download_RNASeq(self, params):
        """
        download_RNASeq: download RNASeq Alignment/Expression/DifferentialExpression zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: one of ['RNASeqAlignment', 
                              'RNASeqExpression', 
                              'RNASeqDifferentialExpression']

        return:
        shock_id: Shock ID of stored zip file
    
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq:\nparams: %s'
            % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        # Download RNASeq zip file
        # RNASeq Alignment, Expression and DifferentialExpression
        # has same object_data/handle_data structure
        returnVal = self._download_rna_seq_zip(params.get('input_ref'))

        return returnVal

    def download_RNASeq_Alignment(self, params):
        """
        download_RNASeq: download RNASeq Alignment/Expression/DifferentialExpression zip file

        params:
        input_ref: RNASeq object reference ID
        rna_seq_type: 'RNASeqAlignment'
        download_file_type: one of 'bam', 'sam' or 'bai'

        return:
        shock_id: Shock ID of stored zip file
    
        """
        log('--->\nrunning RNASeqDownloaderUtils.download_RNASeq_Alignment:\nparams: %s'
            % params)

        # Validate params
        self.validate_download_rna_seq_alignment_parameters(params)

        input_ref = params.get('input_ref')
        returnVal = dict()

        download_file_type = params.get('download_file_type')
        if download_file_type == 'bam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadBAI': True
            })['destination_dir']
            shock_id = self._upload_dir_to_shock(destination_dir)
        elif download_file_type == 'sam':
            destination_dir = self.rau.download_alignment({
                'source_ref': input_ref,
                'downloadSAM': True,
                'downloadBAI': True
            })['destination_dir']
            files = os.listdir(destination_dir)
            bam_files = [x for x in files if re.match(r'.*\.bam', x)]
            for bam_file in bam_files:
                log('removing file: {}'.format(bam_file))
                os.remove(os.path.join(destination_dir, bam_file))
            shock_id = self._upload_dir_to_shock(destination_dir)

        returnVal['shock_id'] = shock_id

        return returnVal
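
    # Params sketch for download_RNASeq_Alignment above (the input_ref is a
    # placeholder assumption):
    #   {'input_ref': '12345/10/1',
    #    'rna_seq_type': 'RNASeqAlignment',
    #    'download_file_type': 'sam'}   # only 'bam' and 'sam' are handled above
    # The returned shock_id points at a zip of the requested alignment files.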

    def validate_download_rna_seq_alignment_parameters(self, params):
        """
        validate_download_rna_seq_alignment_parameters:
                        validates params passed to the download_RNASeq and download_RNASeq_Alignment methods
    
        """

        # check required parameters
        for p in ['input_ref', 'rna_seq_type']:
            if p not in params:
                raise ValueError('"' + p +
                                 '" parameter is required, but missing')

        # check supportive RNASeq types
        valid_rnaseq_types = [
            'RNASeqAlignment', 'RNASeqExpression',
            'RNASeqDifferentialExpression'
        ]
        if params['rna_seq_type'] not in valid_rnaseq_types:
            raise ValueError('Unexpected RNASeq type: %s' %
                             params['rna_seq_type'])

    def _download_rna_seq_zip(self, input_ref):
        """
        _download_rna_seq_zip: download RNASeq's archive zip file

        returns:
        shock_id: Shock ID of stored zip file

        """

        # get object data
        object_data = self._get_object_data(input_ref)
        log('---> getting object data\n object_data: %s' %
            json.dumps(object_data, indent=1))

        # get handle data
        handle = self._get_handle_data(object_data)
        log('---> getting handle data\n handle data: %s' %
            json.dumps(handle, indent=1))

        # make tmp directory for downloading
        dstdir = os.path.join(self.scratch, 'tmp')
        if not os.path.exists(dstdir):
            os.makedirs(dstdir)

        # download original zip file and save to tmp directory
        handle_id = handle.get('hid')
        original_zip_file_path = self._download_original_zip_file(
            handle_id, dstdir)

        log('---> loading %s to shock' % original_zip_file_path)
        shock_id = self._upload_to_shock(original_zip_file_path)

        log('---> removing folder: %s' % dstdir)
        shutil.rmtree(dstdir)

        returnVal = {"shock_id": shock_id}

        return returnVal

    def _get_object_data(self, input_ref):
        """
        _get_object_data: get object_data using DataFileUtil

        """

        get_objects_params = {
            'object_refs': [input_ref],
            'ignore_errors': False
        }

        object_data = self.dfu.get_objects(get_objects_params)

        return object_data

    def _get_handle_data(self, object_data):
        """
        _get_handle_data: get Handle from object_data

        """

        try:
            handle = object_data.get('data')[0].get('data').get('file')
        except:
            error_msg = "Unexpected object format. Refer to DataFileUtil.get_objects definition\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)

        if handle is None:
            error_msg = "object_data does NOT have Handle(file key)\n"
            error_msg += "object_data:\n%s" % json.dumps(object_data, indent=1)
            raise ValueError(error_msg)
        elif handle.get('hid') is None:
            error_msg = "Handle does have NOT HandleId(hid key)\n"
            error_msg += "handle_data:\n%s" % json.dumps(handle, indent=1)
            raise ValueError(error_msg)
        else:
            return handle

    def _download_original_zip_file(self, handle_id, dstdir):
        """
        _download_original_zip_file: download original archive .zip file using DataFileUtil
        
        """

        shock_to_file_params = {'handle_id': handle_id, 'file_path': dstdir}
        original_zip_file = self.dfu.shock_to_file(shock_to_file_params)

        original_zip_file_path = original_zip_file.get('file_path')

        return original_zip_file_path

    def _upload_to_shock(self, file_path):
        """
        _upload_to_shock: upload target file to shock using DataFileUtil
    
        """

        file_to_shock_params = {'file_path': file_path}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id

    def _upload_dir_to_shock(self, directory):
        """
        _upload_dir_to_shock: upload target directory to shock (zipped) using DataFileUtil
    
        """

        file_to_shock_params = {'file_path': directory, 'pack': 'zip'}
        shock_file = self.dfu.file_to_shock(file_to_shock_params)

        shock_id = shock_file.get('shock_id')

        return shock_id
class ExpressionUtils:
    '''
    Module Name:
    ExpressionUtils

    Module Description:
    A KBase module: ExpressionUtils

This module is intended for use by Assemblers to upload RNASeq Expression files
(gtf, fpkm and ctab). This module generates the ctab files and tpm data if they are absent.
The expression files are uploaded as a single compressed file. This module also generates
expression levels and tpm expression levels from the input files and saves them in the
workspace object. Once uploaded, the expression files can be downloaded onto an output directory.
    '''

    ######## WARNING FOR GEVENT USERS ####### noqa
    # Since asynchronous IO can lead to methods - even the same method -
    # interrupting each other, you must be *very* careful when using global
    # state. A method could easily clobber the state set by another while
    # the latter method is running.
    ######################################### noqa
    VERSION = "0.1.1"
    GIT_URL = "https://github.com/JamesJeffryes/ExpressionUtils.git"
    GIT_COMMIT_HASH = "62ce653aa5c5b39a597486613bc140b173a35c99"

    #BEGIN_CLASS_HEADER

    PARAM_IN_SRC_DIR = 'source_dir'
    PARAM_IN_SRC_REF = 'source_ref'
    PARAM_IN_DST_REF = 'destination_ref'
    PARAM_IN_ALIGNMENT_REF = 'alignment_ref'

    PARAM_IN_GENOME_REF = 'genome_ref'
    PARAM_IN_ANNOTATION_ID = 'annotation_id'
    PARAM_IN_BAM_FILE_PATH = 'bam_file_path'
    PARAM_IN_DESCRIPTION = 'description'
    PARAM_IN_DATA_QUAL_LEVEL = 'data_quality_level'
    PARAM_IN_PROC_COMMENTS = 'processing_comments'
    PARAM_IN_PLATFORM = 'platform'
    PARAM_IN_MAPPED_SAMPLE_ID = 'mapped_sample_id'
    PARAM_IN_ORIG_MEDIAN = 'original_median'
    PARAM_IN_EXT_SRC_DATE = 'external_source_date'
    PARAM_IN_TRANSCRIPTS = 'transcripts'
    PARAM_IN_SRC = 'source'

    def _check_required_param(self, in_params, param_list):
        """
        Check if each of the params in the list are in the input params
        """
        for param in param_list:
            if (param not in in_params or not in_params[param]):
                raise ValueError('{} parameter is required'.format(param))

    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = se.message.split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id

    def _proc_upload_expression_params(self, ctx, params):
        """
        Check the presence and validity of upload expression params
        """
        self._check_required_param(params, [
            self.PARAM_IN_DST_REF, self.PARAM_IN_SRC_DIR,
            self.PARAM_IN_ALIGNMENT_REF
        ])

        ws_name_id, obj_name_id = self._proc_ws_obj_params(ctx, params)

        source_dir = params.get(self.PARAM_IN_SRC_DIR)

        if not (os.path.isdir(source_dir)):
            raise ValueError('Source directory does not exist: ' + source_dir)

        if not os.listdir(source_dir):
            raise ValueError('Source directory is empty: ' + source_dir)

        return ws_name_id, obj_name_id, source_dir

    def _get_ws_info(self, obj_ref):

        ws = Workspace(self.ws_url)
        try:
            info = ws.get_object_info_new({'objects': [{'ref': obj_ref}]})[0]
        except WorkspaceError as wse:
            self.__LOGGER.error('Logging workspace exception')
            self.__LOGGER.error(str(wse))
            raise
        return info

    def _get_genome_ref(self, assembly_or_genome_ref, params):
        if self.PARAM_IN_GENOME_REF in params and params[
                self.PARAM_IN_GENOME_REF] is not None:
            return params[self.PARAM_IN_GENOME_REF]

        obj_type = self._get_ws_info(assembly_or_genome_ref)[2]
        if obj_type.startswith('KBaseGenomes.Genome'):
            return assembly_or_genome_ref

        raise ValueError('Alignment object does not contain genome_ref; '
                         '"{}" parameter is required'.format(
                             self.PARAM_IN_GENOME_REF))

    def _get_expression_levels(self,
                               source_dir,
                               genome_ref,
                               transcripts=False):

        fpkm_file_path = os.path.join(source_dir, 'genes.fpkm_tracking')
        if transcripts:
            fpkm_file_path = os.path.join(source_dir, 't_data.ctab')

        if not os.path.isfile(fpkm_file_path):
            raise ValueError('{} file is required'.format(fpkm_file_path))

        id_col = 5 if transcripts else 0
        self.__LOGGER.info(
            'Generating expression levels from {}'.format(fpkm_file_path))
        return self.expression_utils.get_expression_levels(
            fpkm_file_path, genome_ref, id_col)

    def _gen_ctab_files(self, params, alignment_ref):

        source_dir = params.get(self.PARAM_IN_SRC_DIR)
        if len(glob.glob(source_dir + '/*.ctab')) < 5:

            self.__LOGGER.info(' =======  Generating ctab files ==========')
            gtf_file = os.path.join(source_dir, 'transcripts.gtf')
            if not os.path.isfile(gtf_file):
                raise ValueError(
                    "{} file is required to generate ctab files but was not found"
                    .format(gtf_file))

            if self.PARAM_IN_BAM_FILE_PATH in params and \
               params[self.PARAM_IN_BAM_FILE_PATH] is not None:
                bam_file_path = params[self.PARAM_IN_BAM_FILE_PATH]
            else:
                self.__LOGGER.info(
                    'Downloading bam file from alignment object')
                rau = ReadsAlignmentUtils(self.callback_url)
                alignment_retVal = rau.download_alignment(
                    {'source_ref': alignment_ref})
                alignment_dir = alignment_retVal.get('destination_dir')

                allbamfiles = glob.glob(alignment_dir + '/*.bam')
                if len(allbamfiles) == 0:
                    raise ValueError(
                        'bam file does not exist in {}'.format(alignment_dir))
                elif len(allbamfiles) == 1:
                    bam_file_path = allbamfiles[0]
                elif len(allbamfiles) > 1:
                    tmp_file_path = os.path.join(alignment_dir,
                                                 'accepted_hits.bam')
                    if os.path.isfile(tmp_file_path):
                        bam_file_path = tmp_file_path
                    else:
                        tmp_file_path = os.path.join(
                            alignment_dir, 'accepted_hits_sorted.bam')
                        if os.path.isfile(tmp_file_path):
                            bam_file_path = tmp_file_path
                        else:
                            raise ValueError(
                                'Multiple bam files found, but neither accepted_hits.bam '
                                'nor accepted_hits_sorted.bam exists in {}'
                                .format(alignment_dir))

            result = self.table_maker.build_ctab_files(
                ref_genome_path=gtf_file,
                alignment_path=bam_file_path,
                output_dir=source_dir)
            if result != 0:
                raise ValueError('Tablemaker failed')

    #END_CLASS_HEADER

    # config contains contents of config file in a hash or None if it couldn't
    # be found
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.__LOGGER = logging.getLogger('ExpressionUtils')
        self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s"
        )
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        self.config = config
        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.config['SDK_CALLBACK_URL'] = self.callback_url
        self.expression_utils = Expression_Utils(self.config)
        self.dfu = DataFileUtil(self.callback_url)
        self.table_maker = TableMaker(config, self.__LOGGER)
        self.expr_matrix_utils = ExprMatrixUtils(config, self.__LOGGER)
        #END_CONSTRUCTOR
        pass

    def upload_expression(self, ctx, params):
        """
        Uploads the expression  *
        :param params: instance of type "UploadExpressionParams" (*   
           Required input parameters for uploading a reads expression data
           string   destination_ref        -   object reference of expression
           data. The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id string   source_dir             -  
           directory with the files to be uploaded string   alignment_ref    
           -   alignment workspace object reference *) -> structure:
           parameter "destination_ref" of String, parameter "source_dir" of
           String, parameter "alignment_ref" of String, parameter
           "genome_ref" of String, parameter "annotation_id" of String,
           parameter "bam_file_path" of String, parameter "transcripts" of
           type "boolean" (A boolean - 0 for false, 1 for true. @range (0,
           1)), parameter "data_quality_level" of Long, parameter
           "original_median" of Double, parameter "description" of String,
           parameter "platform" of String, parameter "source" of String,
           parameter "external_source_date" of String, parameter
           "processing_comments" of String
        :returns: instance of type "UploadExpressionOutput" (*     Output
           from upload expression    *) -> structure: parameter "obj_ref" of
           String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN upload_expression

        self.__LOGGER.info('Starting upload expression, parsing parameters ')
        pprint(params)

        ws_name_id, obj_name_id, source_dir = self._proc_upload_expression_params(
            ctx, params)

        alignment_ref = params.get(self.PARAM_IN_ALIGNMENT_REF)
        try:
            alignment_obj = self.dfu.get_objects(
                {'object_refs': [alignment_ref]})['data'][0]
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        alignment = alignment_obj['data']
        assembly_or_genome_ref = alignment['genome_id']

        genome_ref = self._get_genome_ref(assembly_or_genome_ref, params)

        expression_levels, tpm_expression_levels = self._get_expression_levels(
            source_dir, genome_ref, params.get(self.PARAM_IN_TRANSCRIPTS))

        self._gen_ctab_files(params, alignment_ref)

        uploaded_file = self.dfu.file_to_shock({
            'file_path': source_dir,
            'make_handle': 1,
            'pack': 'zip'
        })
        """
        move the zipfile created in the source directory one level up
        """
        path, dir = os.path.split(source_dir)
        zipfile = dir + '.zip'
        if os.path.isfile(os.path.join(source_dir, zipfile)):
            shutil.move(os.path.join(source_dir, zipfile),
                        os.path.join(path, zipfile))

        file_handle = uploaded_file['handle']
        file_size = uploaded_file['size']

        expression_data = {
            'numerical_interpretation': 'FPKM',
            'genome_id': genome_ref,
            'mapped_rnaseq_alignment': {
                alignment['read_sample_id']: alignment_ref
            },
            'condition': alignment['condition'],
            'file': file_handle,
            'expression_levels': expression_levels,
            'tpm_expression_levels': tpm_expression_levels
        }
        additional_params = [
            self.PARAM_IN_ANNOTATION_ID, self.PARAM_IN_DESCRIPTION,
            self.PARAM_IN_DATA_QUAL_LEVEL, self.PARAM_IN_PLATFORM,
            self.PARAM_IN_PROC_COMMENTS, self.PARAM_IN_MAPPED_SAMPLE_ID,
            self.PARAM_IN_ORIG_MEDIAN, self.PARAM_IN_EXT_SRC_DATE,
            self.PARAM_IN_SRC
        ]

        for opt_param in additional_params:
            if opt_param in params and params[opt_param] is not None:
                expression_data[opt_param] = params[opt_param]

        extra_provenance_input_refs = list()
        extra_provenance_input_refs.append(
            params.get(self.PARAM_IN_ALIGNMENT_REF))
        if self.PARAM_IN_GENOME_REF in params and params.get(
                self.PARAM_IN_GENOME_REF) is not None:
            extra_provenance_input_refs.append(
                params.get(self.PARAM_IN_GENOME_REF))

        self.__LOGGER.info('===========   Adding extra_provenance_refs')
        self.__LOGGER.info(str(extra_provenance_input_refs))
        self.__LOGGER.info('==========================================')

        res = self.dfu.save_objects({
            "id":
            ws_name_id,
            "objects": [{
                "type":
                "KBaseRNASeq.RNASeqExpression",
                "data":
                expression_data,
                "name":
                obj_name_id,
                "extra_provenance_input_refs":
                extra_provenance_input_refs
            }]
        })[0]

        self.__LOGGER.info('save complete')

        returnVal = {
            'obj_ref': str(res[6]) + '/' + str(res[0]) + '/' + str(res[4])
        }

        self.__LOGGER.info('Uploaded object: ')
        print(returnVal)
        #END upload_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method upload_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def download_expression(self, ctx, params):
        """
        Downloads expression *
        :param params: instance of type "DownloadExpressionParams" (*
           Required input parameters for downloading expression string
           source_ref         -       object reference of expression source.
           The object ref is 'ws_name_or_id/obj_name_or_id' where
           ws_name_or_id is the workspace name or id and obj_name_or_id is
           the object name or id *) -> structure: parameter "source_ref" of
           String
        :returns: instance of type "DownloadExpressionOutput" (*  The output
           of the download method.  *) -> structure: parameter
           "destination_dir" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN download_expression

        self.__LOGGER.info('Running download_expression with params:\n' +
                           pformat(params))

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        # set the output dir
        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        output_dir = os.path.join(self.scratch, 'download_' + str(timestamp))
        os.mkdir(output_dir)

        file_ret = self.dfu.shock_to_file({
            'shock_id':
            expression[0]['data']['file']['id'],
            'file_path':
            output_dir,
            'unpack':
            'unpack'
        })

        if not os.listdir(output_dir):
            raise ValueError('No files were downloaded: ' + output_dir)

        for f in glob.glob(output_dir + '/*.zip'):
            os.remove(f)

        returnVal = {'destination_dir': output_dir}

        #END download_expression

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method download_expression return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def export_expression(self, ctx, params):
        """
        Wrapper function for use by in-narrative downloaders to download expressions from shock *
        :param params: instance of type "ExportParams" (* Required input
           parameters for exporting expression string   source_ref         - 
           object reference of expression source. The object ref is
           'ws_name_or_id/obj_name_or_id' where ws_name_or_id is the
           workspace name or id and obj_name_or_id is the object name or id
           *) -> structure: parameter "source_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_expression

        inref = params.get(self.PARAM_IN_SRC_REF)
        if not inref:
            raise ValueError(self.PARAM_IN_SRC_REF + ' parameter is required')

        try:
            expression = self.dfu.get_objects({'object_refs': [inref]})['data']
        except DFUError as e:
            self.__LOGGER.error(
                'Logging stacktrace from workspace exception:\n' + e.data)
            raise

        output = {'shock_id': expression[0]['data']['file']['id']}

        #END export_expression

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_expression return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]

    def get_expressionMatrix(self, ctx, params):
        """
        :param params: instance of type "getExprMatrixParams" (* Following
           are the required input parameters to get Expression Matrix *) ->
           structure: parameter "workspace_name" of String, parameter
           "output_obj_name" of String, parameter "expressionset_ref" of
           String
        :returns: instance of type "getExprMatrixOutput" -> structure:
           parameter "exprMatrix_FPKM_ref" of String, parameter
           "exprMatrix_TPM_ref" of String
        """
        # ctx is the context object
        # return variables are: returnVal
        #BEGIN get_expressionMatrix
        fpkm_ref, tpm_ref = self.expr_matrix_utils.get_expression_matrix(
            params)

        returnVal = {
            'exprMatrix_FPKM_ref': fpkm_ref,
            'exprMatrix_TPM_ref': tpm_ref
        }
        #END get_expressionMatrix

        # At some point might do deeper type checking...
        if not isinstance(returnVal, dict):
            raise ValueError('Method get_expressionMatrix return value ' +
                             'returnVal is not type dict as required.')
        # return the results
        return [returnVal]

    def status(self, ctx):
        #BEGIN_STATUS
        returnVal = {
            'state': "OK",
            'message': "",
            'version': self.VERSION,
            'git_url': self.GIT_URL,
            'git_commit_hash': self.GIT_COMMIT_HASH
        }
        #END_STATUS
        return [returnVal]
Beispiel #28
0
class AssemblyToFasta:
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

    def export_as_fasta(self, ctx, params):
        ''' Used almost exclusively for download '''
        # validate parameters
        if 'input_ref' not in params:
            raise ValueError(
                'Cannot export Assembly - no input_ref field defined.')

        # export to a file
        file = self.assembly_as_fasta(ctx, {'ref': params['input_ref']})

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.scratch, file['assembly_name'])
        os.makedirs(export_package_dir)
        shutil.move(
            file['path'],
            os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path':
            export_package_dir,
            'ws_refs': [params['input_ref']]
        })

        return {'shock_id': package_details['shock_id']}

    def assembly_as_fasta(self, ctx, params):
        ''' main function that accepts a ref to an object and writes a file '''
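        # Hypothetical params, for illustration only; 'filename' is optional and
        # defaults to '<object name>.fa':
        #   {'ref': '123/4/5', 'filename': 'my_assembly.fa'}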

        self.validate_params(params)

        print('downloading ws object data (' + params['ref'] + ')')
        assembly_object = self.dfu.get_objects(
            {'object_refs': [params['ref']]})['data'][0]
        ws_type = assembly_object['info'][2]
        obj_name = assembly_object['info'][1]

        if 'filename' in params:
            output_filename = params['filename']
        else:
            output_filename = obj_name + '.fa'

        output_fasta_file_path = os.path.join(self.scratch, output_filename)

        if 'KBaseGenomes.ContigSet' in ws_type:
            self.process_legacy_contigset(output_fasta_file_path,
                                          assembly_object['data'])
        elif 'KBaseGenomeAnnotations.Assembly' in ws_type:
            self.process_assembly(output_fasta_file_path,
                                  assembly_object['data'])

        else:
            raise ValueError(
                'Cannot write data to fasta; invalid WS type (' + ws_type +
                ').  Supported types are KBaseGenomes.ContigSet and ' +
                'KBaseGenomeAnnotations.Assembly')

        return {'path': output_fasta_file_path, 'assembly_name': obj_name}

    def fasta_rows_generator_from_contigset(self, contig_list):
        ''' generates SeqRecords iterator for writing from a legacy contigset object '''
        for contig in contig_list:
            description = ''
            if 'description' in contig and contig['description']:
                description = contig['description']
            yield SeqRecord(Seq(contig['sequence'], SingleLetterAlphabet),
                            id=contig['id'],
                            description=description)

    def process_legacy_contigset(self, output_fasta_path, data):
        ''' write the contigs of a legacy ContigSet object to a fasta file '''
        SeqIO.write(self.fasta_rows_generator_from_contigset(data['contigs']),
                    output_fasta_path, "fasta")

    def process_assembly(self, output_fasta_path, data):
        ''' fetch the Assembly fasta from shock and uncompress it to the given path '''
        self.dfu.shock_to_file({
            'handle_id': data['fasta_handle_ref'],
            'file_path': output_fasta_path,
            'unpack': 'uncompress'
        })

    def validate_params(self, params):
        for key in ['ref']:
            if key not in params:
                raise ValueError('required "' + key +
                                 '" field was not defined')
Beispiel #29
0
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        writer = csv.DictWriter(open(files['file_path'], 'w'), header, delimiter='\t',
                                lineterminator='\n')
        writer.writeheader()
        for feat in fs_dicts:
            writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(feat['aliases'].keys()),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
Beispiel #30
0
class MetagenomeFileUtils:
    def _validate_merge_bins_from_binned_contig_params(self, params):
        """
        _validate_merge_bins_from_binned_contig_params:
                validates params passed to merge_bins_from_binned_contig method

        """
        log('Start validating merge_bins_from_binned_contig params')

        # check for required parameters
        for p in [
                'old_binned_contig_ref', 'bin_merges',
                'output_binned_contig_name', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        bin_merges = params.get('bin_merges')

        if not isinstance(bin_merges, list):
            error_msg = 'expecting a list for bin_merges param, '
            error_msg += 'but getting a [{}]'.format(type(bin_merges))
            raise ValueError(error_msg)

        for bin_merge in bin_merges:
            for p in ['new_bin_id', 'bin_to_merge']:
                if p not in bin_merge:
                    raise ValueError(
                        '"{}" key is required in bin_merges, but missing'.
                        format(p))

            bin_to_merge = bin_merge.get('bin_to_merge')

            if not isinstance(bin_to_merge, list):
                error_msg = 'expecting a list for bin_to_merge, '
                error_msg += 'but getting a [{}]'.format(type(bin_to_merge))
                raise ValueError(error_msg)

    def _validate_remove_bins_from_binned_contig_params(self, params):
        """
        _validate_remove_bins_from_binned_contig_params:
                validates params passed to remove_bins_from_binned_contig method

        """
        log('Start validating remove_bins_from_binned_contig params')

        # check for required parameters
        for p in [
                'old_binned_contig_ref', 'bins_to_remove',
                'output_binned_contig_name', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        bins_to_remove = params.get('bins_to_remove')

        if not isinstance(bins_to_remove, list):
            error_msg = 'expecting a list for bins_to_remove param, '
            error_msg += 'but getting a [{}]'.format(type(bins_to_remove))
            raise ValueError(error_msg)

    def _validate_file_to_binned_contigs_params(self, params):
        """
        _validate_file_to_binned_contigs_params:
                validates params passed to file_to_binned_contigs method

        """
        log('Start validating file_to_binned_contigs params')

        # check for required parameters
        for p in [
                'assembly_ref', 'file_directory', 'binned_contig_name',
                'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_binned_contigs_to_file_params(self, params):
        """
        _validate_binned_contigs_to_file_params:
                validates params passed to binned_contigs_to_file method

        """

        log('Start validating binned_contigs_to_file params')

        # check for required parameters
        for p in ['input_ref']:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

    def _validate_extract_binned_contigs_as_assembly_params(self, params):
        """
        _validate_extract_binned_contigs_as_assembly_params:
                validates params passed to extract_binned_contigs_as_assembly method

        """

        log('Start validating extract_binned_contigs_as_assembly params')

        # check for required parameters
        for p in [
                'binned_contig_obj_ref', 'extracted_assemblies',
                'assembly_suffix', 'workspace_name'
        ]:
            if p not in params:
                raise ValueError(
                    '"{}" parameter is required, but missing'.format(p))

        # convert comma-separated list of bins into a list of individual ids (the python
        # comprehension construction deals with the fact that split(',') returns a list of
        # length one, [''], for an empty string input

        extracted_assemblies = [
            x for x in params.get('extracted_assemblies').split(',') if x
        ]

        # parameter assembly_set_name is required if extracted_assemblies list has more
        # than one element

        if len(extracted_assemblies) > 1 and 'assembly_set_name' not in params:
            raise ValueError(
                '"assembly_set_name" parameter is required for more than one extracted assembly'
            )

    def _mkdir_p(self, path):
        """
        _mkdir_p: make directory for given path
        """
        if not path:
            return
        try:
            os.makedirs(path)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(path):
                pass
            else:
                raise

    def _get_bin_ids(self, file_directory):
        """
        _get_bin_ids: getting bin contig ids from files

        NOTE: This method is very specific to MaxBin2 app result.
              Bin contig files generated by MaxBin2 follow 'header.0xx.fasta' name pattern
        """

        bin_ids = []

        result_files = os.listdir(file_directory)

        for file in result_files:
            if re.match(r'.*\.\d{3}\.fasta', file):
                bin_ids.append(file)

        log('generated bin ids:\n{}'.format('\n'.join(bin_ids)))

        return bin_ids

    def _process_summary_file(self, bin_id, lines):
        """
        _process_summary_file: process header.summary file content
                               getting GC content (gc), Genome size (sum_contig_len)
                               and Completeness (cov) from header.summary file

        NOTE: This method is very specific to MaxBin2 app result.

        The header.summary file can be in one of the formats below:
        Bin name                  Abundance  Completeness    Genome size     GC content
        maxbin_output.001.fasta   0.00       97.2%           2690533         52.9

        Bin name                  Completeness    Genome size     GC content
        maxbin_output.001.fasta   97.2%           2690533         52.9
        """

        for line in lines:
            line_list = line.split('\t')
            if line_list[0] == bin_id:
                if len(line_list) == 5:
                    gc = round(float(line_list[4]) / 100, 5)
                    sum_contig_len = int(line_list[3])
                    cov = round(float(line_list[2].partition('%')[0]) / 100, 5)
                elif len(line_list) == 4:
                    gc = round(float(line_list[3]) / 100, 5)
                    sum_contig_len = int(line_list[2])
                    cov = round(float(line_list[1].partition('%')[0]) / 100, 5)

        return gc, sum_contig_len, cov

    def _get_total_contig_len(self, file_directory):
        """
        _get_total_contig_len: process header.summary file content
                               getting total contig length from header.summary file

        NOTE: This method is very specific to MaxBin2 app result.
        """

        log('generating total contig length')
        total_contig_len = 0

        file_list = os.listdir(file_directory)
        for file in file_list:
            if file.endswith('.summary'):
                with open(os.path.join(file_directory, file),
                          'r') as summary_file:
                    lines = summary_file.readlines()
                    for line in lines[1:]:
                        line_list = line.split('\t')
                        if len(line_list) == 5:
                            total_contig_len += int(line_list[3])
                        elif len(line_list) == 4:
                            total_contig_len += int(line_list[2])

        log('generated total contig length: {}'.format(total_contig_len))
        return total_contig_len

    def _generate_contig_bin_summary(self, bin_id, file_directory):
        """
        _generate_contig_bin_summary: getting ContigBin summary from header.summary file

        NOTE: This method is very specific to MaxBin2 app result.
        """
        log('generating summary for bin_id: {}'.format(bin_id))

        file_list = os.listdir(file_directory)

        for file in file_list:
            if file.endswith('.summary'):
                with open(os.path.join(file_directory, file),
                          'r') as summary_file:
                    lines = summary_file.readlines()
                    gc, sum_contig_len, cov = self._process_summary_file(
                        bin_id, lines)

        log('generated GC content: {}, Genome size: {} '.format(
            gc, sum_contig_len))
        log('and Completeness: {} for bin_id: {}'.format(cov, bin_id))
        return gc, sum_contig_len, cov

    def _generate_contigs(self, file_name, file_directory, assembly_ref):
        """
        _generate_contigs: generate contigs from assembly object

        file_name: file name of fasta file
        file_directory: fasta file directory
        assembly_ref: associated assembly object reference
        """

        log('start generating contig objects for file: {}'.format(file_name))

        assembly = self.dfu.get_objects({'object_refs':
                                         [assembly_ref]})['data'][0]
        assembly_contigs = assembly.get('data').get('contigs')

        contigs = {}
        for record in SeqIO.parse(os.path.join(file_directory, file_name),
                                  "fasta"):

            contig_id = record.id
            contig = assembly_contigs.get(contig_id)

            if contig:
                # using assembly object data
                contig_gc = contig.get('gc_content')
                sequence_length = contig.get('length')
            else:
                log('cannot find contig [{}] from assembly.'.format(contig_id))
                log('computing contig info')

                sequence = str(record.seq).upper()
                sequence_length = len(sequence)

                contig_gc_len = 0
                contig_gc_len += sequence.count('G')
                contig_gc_len += sequence.count('C')

                contig_gc = round(
                    float(contig_gc_len) / float(sequence_length), 5)

            contig = {'gc': contig_gc, 'len': sequence_length}
            contigs[contig_id] = contig

        log('complete generating contig objects for file: {}'.format(
            file_name))

        return contigs

    def _generate_contig_bin(self, bin_id, file_directory, assembly_ref):
        """
        _generate_contig_bin: generate ContigBin structure
        """
        log('start generating BinnedContig info for bin: {}'.format(bin_id))

        # generate ContigBin summary info
        gc, sum_contig_len, cov = self._generate_contig_bin_summary(
            bin_id, file_directory)

        # generate Contig info
        contigs = self._generate_contigs(bin_id, file_directory, assembly_ref)

        contig_bin = {
            'bid': bin_id,
            'contigs': contigs,
            'n_contigs': len(contigs),
            'gc': gc,
            'sum_contig_len': sum_contig_len,
            'cov': cov
        }

        log('complete generating BinnedContig info for bin: {}'.format(bin_id))

        return contig_bin

    def _get_contig_file(self, assembly_ref):
        """
        _get_contig_file: get contig file from the Assembly object
        """

        log('retrieving contig file from assembly: {}'.format(assembly_ref))
        contig_file = self.au.get_assembly_as_fasta({
            'ref': assembly_ref
        }).get('path')

        sys.stdout.flush()
        contig_file = self.dfu.unpack_file({'file_path':
                                            contig_file})['file_path']

        log('saved contig file to: {}'.format(contig_file))

        return contig_file

    def _get_contig_string(self, contig_id, assembly_contig_file,
                           parsed_assembly):
        """
        _get_contig_string: find and return contig string from assembly contig file
        """

        # parsed_assembly = SeqIO.to_dict(SeqIO.parse(assembly_contig_file, "fasta"))

        contig_record = parsed_assembly.get(contig_id)

        if contig_record:
            string_contig = ''
            string_contig += '>{}\n'.format(contig_id)
            string_contig += str(contig_record.seq).upper()
            string_contig += '\n'
        else:
            error_msg = 'Cannot find contig [{}] from file [{}].'.format(
                contig_id, assembly_contig_file)
            raise ValueError(error_msg)

        return string_contig

    def _pack_file_to_shock(self, result_files):
        """
        _pack_file_to_shock: pack files in result_files list and save in shock
        """

        log('start packing and uploading files:\n{}'.format(
            '\n'.join(result_files)))

        output_directory = os.path.join(
            self.scratch, 'packed_binned_contig_' + str(uuid.uuid4()))
        self._mkdir_p(output_directory)
        result_file = os.path.join(
            output_directory,
            'packed_binned_contig_' + str(uuid.uuid4()) + '.zip')

        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for file in result_files:
                zip_file.write(file, os.path.basename(file))

        shock_id = self.dfu.file_to_shock({
            'file_path': result_file
        }).get('shock_id')

        log('saved file to shock: {}'.format(shock_id))

        return shock_id

    def _generate_report(self, report_message, params):
        """
        _generate_report: generate summary report

        """
        log('Generating report')

        uuid_string = str(uuid.uuid4())
        upload_message = 'Job Finished\n\n'
        upload_message += report_message

        log('Report message:\n{}'.format(upload_message))

        report_params = {
            'message': upload_message,
            'workspace_name': params.get('workspace_name'),
            'report_object_name': 'MetagenomeUtils_report_' + uuid_string
        }

        kbase_report_client = KBaseReport(self.callback_url)
        output = kbase_report_client.create_extended_report(report_params)

        report_output = {
            'report_name': output['name'],
            'report_ref': output['ref']
        }

        return report_output

    def _generate_report_message(self, new_binned_contig_ref):
        """
        _generate_report_message: generate a report message for BinnedContig object
        """

        report_message = ''

        binned_contig = self.dfu.get_objects(
            {'object_refs': [new_binned_contig_ref]})['data'][0]
        binned_contig_info = binned_contig.get('info')
        binned_contig_name = binned_contig_info[1]
        report_message += 'Generated BinnedContig: {} [{}]\n'.format(
            binned_contig_name, new_binned_contig_ref)

        binned_contig_count = 0
        total_bins = binned_contig.get('data').get('bins')
        total_bins_count = len(total_bins)
        bin_ids = []
        for bin in total_bins:
            binned_contig_count += len(bin.get('contigs'))
            bin_ids.append(bin.get('bid'))

        report_message += '--------------------------\nSummary:\n\n'
        report_message += 'Binned contigs: {}\n'.format(binned_contig_count)
        report_message += 'Total number of bins: {}\n'.format(total_bins_count)
        report_message += 'Bin IDs:\n{}\n'.format('\n'.join(bin_ids))

        return report_message

    def _merge_bins(self, new_bin_id, bin_objects_to_merge):
        """
        _merge_bins: merge a list of bins into new_bin_id

        """
        total_contigs = {}
        total_gc_count = 0
        total_sum_contig_len = 0
        total_cov_len = 0

        for bin in bin_objects_to_merge:
            total_contigs.update(bin.get('contigs'))
            sum_contig_len = bin.get('sum_contig_len')
            total_sum_contig_len += sum_contig_len
            total_gc_count += sum_contig_len * bin.get('gc')
            total_cov_len += sum_contig_len * bin.get('cov')

        contig_bin = {
            'bid': new_bin_id,
            'contigs': total_contigs,
            'n_contigs': len(total_contigs),
            'gc': round(float(total_gc_count) / total_sum_contig_len, 5),
            'sum_contig_len': total_sum_contig_len,
            'cov': round(float(total_cov_len) / total_sum_contig_len, 5)
        }

        return contig_bin

    def _save_binned_contig(self, binned_contigs, workspace_name,
                            binned_contig_name):
        """
        _save_binned_contig: save BinnedContig object
        """

        if isinstance(workspace_name, int) or workspace_name.isdigit():
            workspace_id = workspace_name
        else:
            workspace_id = self.dfu.ws_name_to_id(workspace_name)

        object_type = 'KBaseMetagenomes.BinnedContigs'
        save_object_params = {
            'id':
            workspace_id,
            'objects': [{
                'type': object_type,
                'data': binned_contigs,
                'name': binned_contig_name
            }]
        }

        dfu_oi = self.dfu.save_objects(save_object_params)[0]
        new_binned_contig_ref = str(dfu_oi[6]) + '/' + str(
            dfu_oi[0]) + '/' + str(dfu_oi[4])

        return new_binned_contig_ref

    def _check_bin_merges(self, bin_merges):
        """
        _check_bin_merges: checking bin_merges
        """
        bin_id_lists = [item.get('bin_to_merge') for item in bin_merges]
        all_bin_ids = [bin_id for id_list in bin_id_lists for bin_id in id_list]

        for id_list in bin_id_lists:
            if len(id_list) <= 1:
                raise ValueError(
                    "Please provide at least two bin_ids to merge")
            for bin_id in id_list:
                if all_bin_ids.count(bin_id) > 1:
                    raise ValueError(
                        "Same bin [{}] appears in multiple merges".format(
                            bin_id))

        new_bin_ids = [item.get('new_bin_id') for item in bin_merges]
        for new_bin_id in new_bin_ids:
            if new_bin_ids.count(new_bin_id) > 1:
                raise ValueError(
                    "Same new Bin ID [{}] appears in multiple merges".format(
                        new_bin_id))

    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.scratch = config['scratch']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.setapi = SetAPI(self.callback_url)
        self.wss = workspaceService(config['workspace-url'])

    def file_to_binned_contigs(self, params):
        """
        file_to_binned_contigs: Generating BinnedContigs object from files

        input params:
        file_directory: file directory containing compressed/unpacked contig file(s) to
                        build BinnedContig object
        assembly_ref: metagenome assembly object reference
        binned_contig_name: BinnedContig object name
        workspace_name: the name/id of the workspace it gets saved to

        return params:
        binned_contig_obj_ref: generated result BinnedContig object reference
        """

        log('--->\nrunning MetagenomeFileUtils.file_to_binned_contigs\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_file_to_binned_contigs_params(params)

        file_directory = params.get('file_directory')
        assembly_ref = params.get('assembly_ref')

        log('starting generating BinnedContig object')
        bin_ids = self._get_bin_ids(file_directory)

        bins = []
        for bin_id in bin_ids:
            contig_bin = self._generate_contig_bin(bin_id, file_directory,
                                                   assembly_ref)
            bins.append(contig_bin)
        log('finished generating BinnedContig object')

        total_contig_len = self._get_total_contig_len(file_directory)

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': bins,
            'total_contig_len': total_contig_len
        }

        binned_contig_obj_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('binned_contig_name'))

        returnVal = {'binned_contig_obj_ref': binned_contig_obj_ref}
        log('successfully saved BinnedContig object')

        return returnVal

    def binned_contigs_to_file(self, params):
        """
        binned_contigs_to_file: Convert BinnedContig object to fasta files and pack them to shock

        input params:
        input_ref: BinnedContig object reference

        optional params:
        save_to_shock: save result bin files to shock. Defaults to True
        bin_id_list: if provided, only the bins in this list are extracted

        return params:
        shock_id: saved packed file shock id
        bin_file_directory: directory that contains all bin files
        """

        log('--->\nrunning MetagenomeFileUtils.binned_contigs_to_file\n' +
            'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_binned_contigs_to_file_params(params)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('input_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        assembly_contig_file = self._get_contig_file(assembly_ref)
        log('parsing assembly file [{}] to dictionary'.format(
            assembly_contig_file))
        parsed_assembly = SeqIO.to_dict(
            SeqIO.parse(assembly_contig_file, "fasta"))

        bins = binned_contig_object.get('data').get('bins')

        result_directory = os.path.join(
            self.scratch, 'binned_contig_files_' + str(uuid.uuid4()))
        self._mkdir_p(result_directory)

        result_files = []
        bin_id_list = params.get('bin_id_list')
        for bin in bins:
            bin_id = bin.get('bid')
            if bin_id_list:
                if bin_id in bin_id_list:
                    log('processing bin: {}'.format(bin_id))
                    with open(os.path.join(result_directory, bin_id),
                              'w') as file:
                        contigs = bin.get('contigs')
                        for contig_id in contigs.keys():
                            contig_string = self._get_contig_string(
                                contig_id, assembly_contig_file,
                                parsed_assembly)
                            file.write(contig_string)
                    result_files.append(os.path.join(result_directory, bin_id))
                    log('saved contig file to: {}'.format(result_files[-1]))
            else:
                log('processing bin: {}'.format(bin_id))
                with open(os.path.join(result_directory, bin_id), 'w') as file:
                    contigs = bin.get('contigs')
                    for contig_id in contigs.keys():
                        contig_string = self._get_contig_string(
                            contig_id, assembly_contig_file, parsed_assembly)
                        file.write(contig_string)
                result_files.append(os.path.join(result_directory, bin_id))
                log('saved contig file to: {}'.format(result_files[-1]))

        if params.get('save_to_shock') or params.get('save_to_shock') is None:
            shock_id = self._pack_file_to_shock(result_files)
        else:
            shock_id = None

        returnVal = {
            'shock_id': shock_id,
            'bin_file_directory': result_directory
        }

        return returnVal

    def _get_object_name_from_ref(self, obj_ref):
        """given the object reference, return the object_name as a string"""
        return (self.wss.get_object_info_new({"objects": [{
            'ref': obj_ref
        }]})[0][1])

    def extract_binned_contigs_as_assembly(self, params):
        """
        extract_binned_contigs_as_assembly: extract one/multiple Bins from BinnedContigs as
                                            Assembly

        input params:
        binned_contig_obj_ref: BinnedContig object reference
        extracted_assemblies: a string, a comma-separated list of bin_ids to be extracted
        workspace_name: the name of the workspace it gets saved to

        return params:
        assembly_ref_list: a list of generated result Assembly object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        log('--->\nrunning MetagenomeFileUtils.extract_binned_contigs_as_assembly\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_extract_binned_contigs_as_assembly_params(params)

        # convert comma-separated list of bins into a list of individual ids (the python
        # comprehension construction deals with the fact that split(',') returns a list of
        # length one, [''], for an empty string input

        extracted_assemblies = [
            x for x in params.get('extracted_assemblies').split(',') if x
        ]

        binned_contig_obj_ref = params.get('binned_contig_obj_ref')
        contigs_to_file_ret = self.binned_contigs_to_file({
            'input_ref':
            binned_contig_obj_ref,
            'save_to_shock':
            False,
            'bin_id_list':
            extracted_assemblies
        })

        bin_file_directory = contigs_to_file_ret.get('bin_file_directory')
        # bin_files will be either a list of the bin contig files corresponding to the
        # target bin ids, or a list of all bin contig files if extracted_assemblies is empty
        bin_files = os.listdir(bin_file_directory)

        # if extracted_assemblies is empty list, create a full one here
        if not extracted_assemblies:
            extracted_assemblies = bin_files
            log("extracted_assemblies was empty, is now " +
                pformat(extracted_assemblies))

        generated_assembly_ref_list = []
        assembly_suffix = params.get('assembly_suffix').strip()
        for bin_id in extracted_assemblies:
            if bin_id not in map(os.path.basename, bin_files):
                error_msg = 'bin_id [{}] cannot be found in BinnedContig '.format(
                    bin_id)
                error_msg += '[{}]'.format(binned_contig_obj_ref)
                raise ValueError(error_msg)
            else:
                output_assembly_name = bin_id + assembly_suffix
                log('saving assembly: {}'.format(output_assembly_name))
                for bin_file in bin_files:
                    if os.path.basename(bin_file) == bin_id:
                        log('starting generating assembly from {}'.format(
                            bin_id))
                        assembly_params = {
                            'file': {
                                'path': os.path.join(bin_file_directory,
                                                     bin_file)
                            },
                            'workspace_name': params.get('workspace_name'),
                            'assembly_name': output_assembly_name
                        }
                        assembly_ref = self.au.save_assembly_from_fasta(
                            assembly_params)
                        log('finished generating assembly from {}'.format(
                            bin_id))
                        generated_assembly_ref_list.append(assembly_ref)
        setref = None
        if (len(generated_assembly_ref_list) > 1):
            binned_contig_object_name = self._get_object_name_from_ref(
                binned_contig_obj_ref)
            assembly_set_name = params.get('assembly_set_name')
            log("saving assembly set {0}".format(assembly_set_name))
            setref = self.setapi.save_assembly_set_v1({
                'workspace':
                params.get('workspace_name'),
                'output_object_name':
                assembly_set_name,
                'data': {
                    'description':
                    'binned assemblies from {0}'.format(
                        binned_contig_object_name),
                    'items': [{
                        'ref': r
                    } for r in generated_assembly_ref_list]
                }
            })
            log("save assembly set_ref is {0}".format(setref.get('set_ref')))

        report_message = 'Generated Assembly Reference: {}'.format(
            ', '.join(generated_assembly_ref_list))

        reportVal = self._generate_report(report_message, params)

        returnVal = {'assembly_ref_list': generated_assembly_ref_list}
        returnVal.update(reportVal)

        if setref:
            returnVal.update({'assembly_set_ref': setref})

        return returnVal

    def remove_bins_from_binned_contig(self, params):
        """
        remove_bins_from_binned_contig: remove a list of bins from BinnedContig object

        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bins_to_remove: a list of bin ids to be removed
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        """

        log('--->\nrunning MetagenomeFileUtils.remove_bins_from_binned_contig\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_remove_bins_from_binned_contig_params(params)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        total_contig_len = int(
            binned_contig_object.get('data').get('total_contig_len'))

        old_bins = binned_contig_object.get('data').get('bins')
        bins_to_remove = params.get('bins_to_remove')

        for bin in list(old_bins):
            bin_id = bin.get('bid')
            if bin_id in bins_to_remove:
                log('removing bin_id: {}'.format(bin_id))
                old_bins.remove(bin)
                total_contig_len -= int(bin.get('sum_contig_len'))
                log('removed bin_id: {} from BinnedContig object'.format(
                    bin_id))

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': old_bins,
            'total_contig_len': total_contig_len
        }

        new_binned_contig_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('output_binned_contig_name'))

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}
        log('successfully saved BinnedContig object')

        return returnVal

    def merge_bins_from_binned_contig(self, params):
        """
        merge_bins_from_binned_contig: merge a list of bins from BinnedContig object

        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bin_merges: a list of bin merges dicts
            new_bin_id: newly created bin id
            bin_to_merge: list of bins to merge
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace new object gets saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        """

        log('--->\nrunning MetagenomeFileUtils.merge_bins_from_binned_contig\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        self._validate_merge_bins_from_binned_contig_params(params)

        bin_merges = params.get('bin_merges')
        self._check_bin_merges(bin_merges)

        binned_contig_object = self.dfu.get_objects(
            {'object_refs': [params.get('old_binned_contig_ref')]})['data'][0]

        assembly_ref = binned_contig_object.get('data').get('assembly_ref')
        total_contig_len = int(
            binned_contig_object.get('data').get('total_contig_len'))

        bins = binned_contig_object.get('data').get('bins')
        old_bin_ids = [item.get('bid') for item in bins]

        for bin_merge in bin_merges:
            new_bin_id = bin_merge.get('new_bin_id')
            bin_id_to_merge = bin_merge.get('bin_to_merge')
            if set(bin_id_to_merge) <= set(old_bin_ids):
                bin_objects_to_merge = []
                for bin in list(bins):
                    bin_id = bin.get('bid')
                    if bin_id in bin_id_to_merge:
                        bin_objects_to_merge.append(bin)
                        log('removing bin_id: {}'.format(bin_id))
                        bins.remove(bin)
                        total_contig_len -= int(bin.get('sum_contig_len'))
                        log('removed bin_id: {} from BinnedContig object'.
                            format(bin_id))
                new_bin = self._merge_bins(new_bin_id, bin_objects_to_merge)
                log('appending bin_id: {}'.format(new_bin_id))
                bins.append(new_bin)
                total_contig_len += int(new_bin.get('sum_contig_len'))
                log('appended bin_id: {} to BinnedContig object'.format(
                    new_bin_id))
            else:
                bad_bin_ids = list(set(bin_id_to_merge) - set(old_bin_ids))
                error_msg = 'bin_id(s) [{}] '.format(', '.join(bad_bin_ids))
                error_msg += 'not found in BinnedContig object'
                raise ValueError(error_msg)

        binned_contigs = {
            'assembly_ref': assembly_ref,
            'bins': bins,
            'total_contig_len': total_contig_len
        }

        new_binned_contig_ref = self._save_binned_contig(
            binned_contigs, params.get('workspace_name'),
            params.get('output_binned_contig_name'))

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}
        log('successfully saved BinnedContig object')

        return returnVal
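
    # A hedged usage sketch (not part of the original module): assuming an
    # instance `mfu` and a BinnedContig at a hypothetical ref '1/2/3' that
    # contains bins 'bin.001' and 'bin.002', merging them into a new bin might
    # look like:
    #
    #   result = mfu.merge_bins_from_binned_contig({
    #       'old_binned_contig_ref': '1/2/3',
    #       'bin_merges': [{
    #           'new_bin_id': 'bin.003',
    #           'bin_to_merge': ['bin.001', 'bin.002']
    #       }],
    #       'output_binned_contig_name': 'merged_binned_contigs',
    #       'workspace_name': 'my_workspace'
    #   })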

    def edit_bins_from_binned_contig(self, params):
        """
        edit_bins_from_binned_contig: merge/remove a list of bins from BinnedContig object
                                    a wrapper method of:
                                    merge_bins_from_binned_contig
                                    remove_bins_from_binned_contig


        input params:
        old_binned_contig_ref: Original BinnedContig object reference
        bins_to_remove: a list of bin ids to be removed
        bin_merges: a list of bin merges dicts
            new_bin_id: newly created bin id
            bin_to_merge: list of bins to merge
        output_binned_contig_name: Name for the output BinnedContigs object
        workspace_name: the name of the workspace the new object is saved to

        return params:
        new_binned_contig_ref: newly created BinnedContig object reference
        report_name: report name generated by KBaseReport
        report_ref: report reference generated by KBaseReport
        """

        log('--->\nrunning MetagenomeFileUtils.edit_bins_from_binned_contig\n'
            + 'params:\n{}'.format(json.dumps(params, indent=1)))

        if not (params.get('bins_to_remove') or params.get('bin_merges')):
            raise ValueError(
                'Please provide bins_to_remove and/or bin_merges parameter')

        input_params = params.copy()
        if params.get('bins_to_remove'):
            bins_to_remove = input_params.get('bins_to_remove')
            if isinstance(bins_to_remove, string_types):
                input_params['bins_to_remove'] = bins_to_remove.split(',')
            new_binned_contig_ref = self.remove_bins_from_binned_contig(
                input_params).get('new_binned_contig_ref')
            input_params['old_binned_contig_ref'] = new_binned_contig_ref

        if params.get('bin_merges'):
            new_binned_contig_ref = self.merge_bins_from_binned_contig(
                input_params).get('new_binned_contig_ref')

        returnVal = {'new_binned_contig_ref': new_binned_contig_ref}

        report_message = self._generate_report_message(new_binned_contig_ref)
        reportVal = self._generate_report(report_message, params)
        returnVal.update(reportVal)

        return returnVal
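
    # A hedged usage sketch (not part of the original module): the wrapper
    # accepts removal and merge instructions in one call; assuming an instance
    # `mfu` and a hypothetical ref '1/2/3':
    #
    #   result = mfu.edit_bins_from_binned_contig({
    #       'old_binned_contig_ref': '1/2/3',
    #       'bins_to_remove': 'bin.004,bin.005',  # a comma-separated string is split into a list
    #       'bin_merges': [{'new_bin_id': 'bin.006',
    #                       'bin_to_merge': ['bin.001', 'bin.002']}],
    #       'output_binned_contig_name': 'edited_binned_contigs',
    #       'workspace_name': 'my_workspace'
    #   })
    #   # result carries new_binned_contig_ref plus report_name and report_ref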
Beispiel #31
0
    def get_promoter_for_gene(self, ctx, params):
        """
        :param params: instance of type "get_promoter_for_gene_input" (Genome
           is a KBase genome Featureset is a KBase featureset Promoter_length
           is the length of promoter requested for all genes) -> structure:
           parameter "workspace_name" of String, parameter "genome_ref" of
           String, parameter "featureSet_ref" of String, parameter
           "promoter_length" of Long
        :returns: instance of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN get_promoter_for_gene
        #code goes here
        dfu = DataFileUtil(self.callback_url)
        #objectRefs = {'object_refs':[params['genome_ref'],params['featureSet_ref']]}
        objectRefs = {'object_refs': [params['featureSet_ref']]}
        ws = Workspace('https://appdev.kbase.us/services/ws')
        ws_name = params['workspace_name']
        subset = ws.get_object_subset([{
            'included':
            ['/features/[*]/location', '/features/[*]/id', '/assembly_ref'],
            'ref':
            params['genome_ref']
        }])
        features = subset[0]['data']['features']
        aref = subset[0]['data']['assembly_ref']
        objects = dfu.get_objects(objectRefs)
        #genome = objects['data'][0]['data']
        #featureSet = objects['data'][1]['data']
        featureSet = objects['data'][0]['data']
        assembly_ref = {'ref': aref}
        #print assembly_ref
        #with open(self.shared_folder + '/genome.json','w') as f:
        #    json.dump(genome,f)
        #with open(self.shared_folder + '/featureSet.json','w') as f:
        #    json.dump(featureSet,f)
        #with open('/kb/module/work/asssembly.json','w') as f:
        #    json.dump(assembly,f)
        print('Downloading Assembly data as a Fasta file.')
        assemblyUtil = AssemblyUtil(self.callback_url)
        fasta_file = assemblyUtil.get_assembly_as_fasta(assembly_ref)

        #pprint(fasta_file)
        #loop over featureSet
        #find matching feature in genome
        #get record, start, orientation, length
        #TODO: add some error checking logic to the bounds of the promoter
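        # Worked example (assuming KBase feature location tuples of the form
        # [contig_id, start, strand, length]): for a '+' strand feature at
        # ['Chr1', 5000, '+', 1200] with promoter_length 400, the promoter is
        # record.seq[4600:5000]; for a '-' strand feature the window
        # record.seq[start:start + promoter_length] is taken instead and then
        # reverse-complemented.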
        prom = ""
        featureFound = False
        for feature in featureSet['elements']:
            #print(feature)
            #print(featureSet['elements'][feature])
            featureFound = False
            for f in features:
                #print f['id']
                #print feature
                if f['id'] == feature:
                    attributes = f['location'][0]
                    featureFound = True
                    #print('found match ' + feature)
                    #print(f['location'])
                    break
            if featureFound:
                for record in SeqIO.parse(fasta_file['path'], 'fasta'):
                    #for record in SeqIO.parse('/kb/module/work/Gmax_189_genome_assembly.fa', 'fasta'):
                    #print(record.id)
                    #print(attributes[0])
                    if record.id == attributes[0]:
                        #print('adding to prom string')
                        #print(attributes[0])
                        if attributes[2] == '+':
                            #print('1')
                            #might need to offset by 1?
                            end = attributes[1]
                            start = end - params['promoter_length']
                            if start < 0:
                                start = 0
                            promoter = record.seq[start:end].upper()
                            #HERE: resolve ambiguous characters
                            prom += ">" + feature + "\n"
                            prom += promoter + "\n"

                        elif attributes[2] == '-':
                            #print('2')
                            start = attributes[1]
                            end = start + params['promoter_length']
                            if end > len(record.seq) - 1:
                                end = len(record.seq) - 1
                            promoter = record.seq[start:end].upper()
                            complement = {
                                'A': 'T',
                                'C': 'G',
                                'G': 'C',
                                'T': 'A',
                                'N': 'N'
                            }
                            promoter = ''.join(
                                [complement[base] for base in promoter[::-1]])
                            #HERE: resolve ambiguous characters
                            prom += ">" + feature + "\n"
                            prom += promoter + "\n"

                        else:
                            print('Error on orientation')
            else:
                print('Could not find feature ' + feature + ' in genome')
        promOutputPath = '/kb/module/work/tmp/promFile.fa'
        #print('prom string\n' + str(prom))
        with open(promOutputPath, 'w') as promFile:
            promFile.write(str(prom))

        timestamp = int(
            (datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds()
            * 1000)
        html_output_dir = os.path.join(self.shared_folder,
                                       'output_html.' + str(timestamp))
        if not os.path.exists(html_output_dir):
            os.makedirs(html_output_dir)
        html_file = 'promoter.html'
        output_html_file_path = os.path.join(html_output_dir, html_file)

        html_report_lines = '<html><body>'
        html_report_lines += '<pre>' + prom + '</pre>'
        html_report_lines += '</body></html>'

        with open(output_html_file_path, 'w', 0) as html_handle:
            html_handle.write(str(html_report_lines))

        try:
            html_upload_ret = dfu.file_to_shock({
                'file_path': html_output_dir,
                #html_upload_ret = dfu.file_to_shock({'file_path': output_html_file_path,
                #'make_handle': 0})
                'make_handle': 0,
                'pack': 'zip'
            })
        except Exception as e:
            raise ValueError('error uploading HTML file to shock: ' + str(e))

        reportName = 'identify_promoter_report_' + str(uuid.uuid4())

        reportObj = {
            'objects_created': [],
            'message': '',
            'direct_html': None,
            'direct_html_index': 0,
            'file_links': [],
            'html_links': [],
            'html_window_height': 220,
            'workspace_name': params['workspace_name'],
            'report_object_name': reportName
        }

        # attach to report obj
        #reportObj['direct_html'] = None
        reportObj['direct_html'] = ''
        reportObj['direct_html_link_index'] = 0
        reportObj['html_links'] = [{
            'shock_id': html_upload_ret['shock_id'],
            'name': html_file,
            'label': 'View'
        }]

        report = KBaseReport(self.callback_url, token=ctx['token'])
        #report_info = report.create({'report':reportObj, 'workspace_name':input_params['input_ws']})
        report_info = report.create_extended_report(reportObj)
        output = {
            'report_name': report_info['name'],
            'report_ref': report_info['ref']
        }
        #changing output to be path string
        #TODO: get rid of this html maybe and move into find_motifs
        output = promOutputPath

        #iterate over records in fasta
        #for record in SeqIO.parse(fasta_file['path'], 'fasta'):

        #objects list of Genome and featureSet

        #pprint(objects)
        #END get_promoter_for_gene

        # At some point might do deeper type checking...
        if not isinstance(output, basestring):
            raise ValueError('Method get_promoter_for_gene return value ' +
                             'output is not type basestring as required.')
        # return the results
        return [output]
Beispiel #32
0
class variation_importer_utils:
    def __init__(self, utility_params):
        self.params = utility_params
        # self.scratch = utility_params['scratch']
        self.scratch = os.path.join(utility_params['scratch'],
                                    'variation_importer_' + str(uuid.uuid4()))
        os.mkdir(self.scratch)
        self.service_wiz_url = utility_params['srv-wiz-url']
        self.callback_url = utility_params['callback_url']

        self.dfu = DataFileUtil(self.callback_url)
        self.kbr = KBaseReport(self.callback_url,
                               token=utility_params['token'])

    def _create_fake_location_data(self):
        location = {
            'lat':
            random.uniform(-90, 90),
            'lon':
            random.uniform(-180, 180),
            'elevation':
            random.uniform(0, 100),
            'description':
            "".join([random.choice(string.ascii_letters) for n in xrange(20)])
        }
        return location

    def _create_fake_straininfo(self, genotype_id):
        straininfo = {
            'source_id': genotype_id,
            'location_info': self._create_fake_location_data()
        }
        return straininfo

    def _create_fake_population(self, genotypes):
        population = {'description': 'Faker population data.', 'strains': []}
        for genome in genotypes:
            population['strains'].append(self._create_fake_straininfo(genome))
        return population

    def _create_fake_kinship_matrix(self):
        kinship = {
            'row_ids': ['one', 'two'],
            'col_ids': ['one', 'two'],
            'kinship_coefficients': [[0.1, 0.1], [0.1, 0.1]]
        }
        return kinship

    def _compare(self, s, t):
        return Counter(s) == Counter(t)

    def pretend_download_staging_file(self, vcf_filename, scratch):
        vcf_filepath = os.path.join(scratch, vcf_filename)
        shutil.copy('/kb/module/data/' + vcf_filename, vcf_filepath)
        return {'copy_file_path': vcf_filepath}

    def _generate_population(self,
                             location_filepath,
                             genotypes,
                             population_description="None Provided"):
        locations = pd.read_csv(location_filepath, delimiter='\t')

        # Drop any missing data from id, latitude, or longitude.
        locations.dropna(subset=['id', 'latitude', 'longitude'], inplace=True)

        # Compare the location IDs with the genotype IDs
        if not (self._compare(locations.iloc[:, 0].astype(str).tolist(),
                              genotypes)):
            log("Location IDs do not match Sample IDs in Variation file!")
            raise ValueError(
                "Location IDs do not match Sample IDs in Variation file!")

        col_names = [x.lower() for x in locations.columns.values]
        expected_columns = ['id', 'latitude', 'longitude']
        optional_columns = ['elevation', 'description']

        # Check that the first three columns match the expected columns.
        if not (self._compare(col_names[0:3], expected_columns)):
            raise ValueError("Missing or unexpected column names in {}".format(
                location_filepath))

        # If optional columns are not present, give default value for each.
        for col in optional_columns:
            if col not in col_names:
                if col == 'elevation':
                    locations[col] = 0.0
                else:
                    locations[col] = "None provided."

        population = {'description': population_description, 'strains': []}
        for idx, row in locations.iterrows():
            population['strains'].append({
                'source_id': str(row['id']),
                'location_info': {
                    'lat': row['latitude'],
                    'lon': row['longitude'],
                    'elevation': row['elevation'],
                    'description': row['description']
                }
            })

        return population
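
    # A hedged illustration (not from the original source) of the expected
    # tab-delimited location file; 'elevation' and 'description' are optional
    # and receive default values when absent:
    #
    #   id          latitude    longitude   elevation   description
    #   genotype_1  38.54       -121.74     16.0        Davis, CA
    #   genotype_2  42.03       -93.62      287.0       Ames, IA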

    def _validate_vcf(self, vcf_filepath, vcf_version):
        validation_output_dir = os.path.join(self.scratch,
                                             'validation_' + str(uuid.uuid4()))
        os.mkdir(validation_output_dir)

        if vcf_version >= 4.1:
            print("Using vcf_validator_linux...")
            validator_cmd = ["vcf_validator_linux"]
            validator_cmd.append("-i")
            validator_cmd.append(vcf_filepath)
            validator_cmd.append("-o")
            validator_cmd.append(validation_output_dir)
        else:
            print("Using vcftools to validate...")
            validator_cmd = ["vcf-validator"]
            validator_cmd.append(vcf_filepath)
            print("VCF version below 4.1.  No validation logging.")

        print("Validator command: {}".format(validator_cmd))
        p = subprocess.Popen(validator_cmd,
                             cwd=self.scratch,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        validator_output = []
        while True:
            line = p.stdout.readline()
            if not line:
                break
            validator_output.append(line)

        p.wait()

        validation_output_files = [
            f for f in os.listdir(validation_output_dir) if f.endswith('.txt')
        ]

        if not validation_output_files:
            print('Validator did not generate log file!')
            raise Exception("Validator did not generate a log file.")

        validation_output_filepath = os.path.join(validation_output_dir,
                                                  validation_output_files[0])

        log("Validator output filepath: {}".format(validation_output_filepath))

        log("Return code from validator {}".format(p.returncode))

        return validation_output_filepath, p.returncode
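
    # A hedged example (not from the original source) of the commands this
    # method assembles, assuming a file at /path/to/variation.vcf:
    #   VCF >= 4.1: ['vcf_validator_linux', '-i', '/path/to/variation.vcf',
    #                '-o', <validation_output_dir>]
    #   VCF <  4.1: ['vcf-validator', '/path/to/variation.vcf']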

    # Retrieve contigs from assembly file.
    def _get_contigs_from_assembly(self, assembly_ref, type='Assembly'):
        try:
            assembly_data = self.dfu.get_objects(
                {'object_refs': [assembly_ref]})['data'][0]['data']
        except Exception as e:
            print("Unable to retrieve Assembly reference: {}".format(
                assembly_ref))
            raise ValueError(e)
        raw_contigs = assembly_data['contigs']
        contigs = {}

        # Build a dict mapping each contig key to its contig_id
        for key, value in raw_contigs.iteritems():
            contigs[str(key)] = value['contig_id']
        return contigs

    def _get_version_contigs_genotypes(self, vcf_filepath):
        contigs = []
        genotypes = []
        version = ''
        with (gzip.open if vcf_filepath.endswith('.gz') else open)(
                vcf_filepath, 'rt') as vcf:
            line = vcf.readline()
            tokens = line.split('=')

            if not (tokens[0].startswith('##fileformat')):
                log("Invalid VCF.  ##fileformat line in meta is improperly formatted."
                    )
                raise ValueError(
                    "Invalid VCF.  ##fileformat line in meta is improperly formatted."
                )
            version = float(tokens[1][-4:].rstrip())
            log("VCF version: {}".format(version))
            for line in vcf:
                if line.startswith("#CHROM"):
                    log("#CHROM encountered, exiting loop.")
                    genotypes = line.split()[9:]
                    log("Number Genotypes in vcf: {}".format(len(genotypes)))
                    break
                tokens = line.split("=")

                if tokens[0].startswith('##contig'):
                    contigs.append(tokens[2][:-2])
        return version, contigs, genotypes
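
    # A hedged illustration (not from the original source) of the VCF header
    # lines this parser expects; note that it assumes simple contig entries of
    # the form '##contig=<ID=...>' with no additional attributes:
    #
    #   ##fileformat=VCFv4.2
    #   ##contig=<ID=Chr1>
    #   #CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO  FORMAT  genotype_1  genotype_2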

    # Arabidopsis ref: 18590/2/8
    def _get_assembly_ref_from_genome(self, genome_ref):
        ga = GenomeAnnotationAPI(self.service_wiz_url)
        inputs_get_assembly = {'ref': genome_ref}
        try:
            assembly_object_ref = ga.get_assembly(inputs_get_assembly)
        except Exception as e:
            print(
                "Unable to retrieve Assembly reference ID from Genome ref_id: {}"
                .format(genome_ref))
            raise Exception(e)

        return assembly_object_ref

    def _generate_output_file_list(self):
        log('Start packing result files')
        output_files = list()

        result_file = os.path.join(self.scratch,
                                   'variation_importer_results.zip')
        excluded_extensions = ['.zip', '.vcf', '.vcf.gz', '.html', '.DS_Store']
        with zipfile.ZipFile(result_file,
                             'w',
                             zipfile.ZIP_DEFLATED,
                             allowZip64=True) as zip_file:
            for root, dirs, files in os.walk(self.scratch):
                for file in files:
                    if not (file.endswith(tuple(excluded_extensions))
                            # file.endswith('.zip') or
                            # file.endswith('.vcf') or
                            # file.endswith('.vcf.gz') or
                            # file.endswith('.html') or
                            # file.endswith('.DS_Store')
                            ):
                        zip_file.write(os.path.join(root, file), file)

        output_files.append({
            'path':
            result_file,
            'name':
            os.path.basename(result_file),
            'label':
            os.path.basename(result_file),
            'description':
            'File(s) generated by Variation Importer'
        })
        log("Importer output generated: {}".format(output_files))

        return output_files

    def _generate_report(self, params, variation_results, variation_file_path):

        stats_results = self._generate_variation_stats(
            params['additional_output_type'], variation_file_path)

        html_report = self._generate_html_report(variation_results,
                                                 stats_results)

        file_links = self._generate_output_file_list()
        objects = []
        if (variation_results['valid_variation_file']):
            objects = [{
                'ref':
                variation_results['variation_obj_ref'],
                'description':
                'Variation Object created by VCF Importer'
            }]

        report_params = {
            'objects_created': objects,
            'message': '',
            'direct_html_link_index': 0,
            'file_links': file_links,
            'html_links': html_report,
            'html_window_height': 330,
            'workspace_name': params['workspace_name'],
            'report_object_name':
            'variation_importer_report_' + str(uuid.uuid4())
        }
        kbr_output = self.kbr.create_extended_report(report_params)
        report_output = {
            'report_name': kbr_output['name'],
            'report_ref': kbr_output['ref'],
            'variation_ref': variation_results['variation_obj_ref']
        }
        log("Returning from _generate_report!")
        return report_output

    def _generate_html_report(self, variation_results, stats_output=None):
        """
            _generate_html_report: generate html report from output files
        """
        html_report = list()
        print("Validation output filepath passed to html report: {}".format(
            variation_results['validation_output_filepath']))
        try:
            report_dir = os.path.join(self.scratch, 'html')
            os.mkdir(report_dir)

            with open(template_dir, 'r') as html, open(
                    variation_results['validation_output_filepath'],
                    'r') as validation:

                validation_content = '<p><h4>{} '.format(
                    variation_results['variation_filename'])
                if variation_results.get('valid_variation_file'):
                    validation_content += '<em><i>is</i> a valid </em> variation file.'
                else:
                    validation_content += '<em><i>is not</i> a valid </em>variation file. Details below.'
                validation_content += '</h4></p>'

                report = html.read()

                # Discard the first line of the validation file.  It is irrelevant.
                validation.readline()

                validation_content += '<p><h4>Errors and warning generated by VCF validator:</h4></p>'
                validation_content += '<ul>'
                for line in validation.readlines():
                    validation_content += '<li>{}</li>'.format(line)
                validation_content += '</ul>'

                if variation_results.get('invalid_contigs'):
                    validation_content += '<h4>The following contigs were not found in the reference genome {}.  The valid contigs have been written to the file {}.  Please see the associated links to download.</h4>'.format(
                        variation_results.get('genome_ref'),
                        'valid_contigs.txt')
                    validation_content += '<ul>'
                    for contig in variation_results.get('invalid_contigs'):
                        validation_content += '<li>{}</li>'.format(contig)
                    validation_content += '</ul>'

                # if not variation_results.get('contigs'):
                #     validation_content += '<h4>No contig information was included in the VCF file header!  Please recreate the VCF file with each contig described in the meta description </h4>'
                report = report.replace('Validation_Results',
                                        validation_content)

                if stats_output and stats_output.get('stats_file_dir'):
                    summary_results = '<p><h4>Summary Statistics</h4></p>'
                    summary_results += '''
                                        <table>
                                            <tr>
                                                <th>Number of SNPs</th>
                                                <th>Number of Genotypes </th>
                                            </tr>
                                        '''
                    summary_results += '<tr>'
                    summary_results += '<td>{}</td><td>{}</td>'.format(
                        'To be added later',
                        variation_results['num_genotypes'])
                    summary_results += '</tr></table>'
                    report = report.replace('Variation_Statistics',
                                            summary_results)

                # visualization
                image_content = ''
                if (stats_output.get('stats_img_dir')):
                    image_dir = stats_output.get('stats_img_dir')

                    for file in glob.glob(os.path.join(image_dir, '*.png')):
                        shutil.move(file, report_dir)

                    for image in glob.glob(report_dir + "/*.png"):
                        image = image.replace(report_dir + '/', '')
                        caption = image.replace(report_dir + '/',
                                                '').replace('.png', '')
                        image_content += '<p style="text-align:center"><img align="center" src="{}" ' \
                            '></a><a target="_blank"><br>' \
                            '<p align="center">{}</p></p>'.format(image, caption)

                else:
                    image_content += 'No visualizations generated.'

                report = report.replace("Visualization_Results", image_content)
        except Exception as e:
            print("Error generating HTML report.")
            raise

        report_file_path = os.path.join(report_dir, 'index.html')
        with open(report_file_path, 'w') as output:
            output.write(report)
        try:
            html_upload_ret = self.dfu.file_to_shock({
                'file_path': report_file_path,
                'make_handle': 0,
                'pack': 'zip'
            })
            log("Variation HTML report to shock ref: {}".format(
                html_upload_ret))
        except Exception as e:
            raise ValueError('Error uploading HTML to shock: ' + str(e))

        html_report.append({
            'shock_id': html_upload_ret['shock_id'],
            'name': os.path.basename(report_file_path),
            'label': os.path.basename(report_file_path),
            'description': 'HTML report for Variation Importer'
        })

        return html_report

    def _generate_variation_stats(self, additional_output_type,
                                  variation_filepath):
        """
            :param commments go here
        """
        file_output_directory = os.path.join(self.scratch,
                                             'stats_' + str(uuid.uuid4()))
        os.mkdir(file_output_directory)

        image_output_directory = os.path.join(
            self.scratch, 'stats_images_' + str(uuid.uuid4()))
        os.mkdir(image_output_directory)

        # TODO: Validate user supplied params and build PLINK command
        plink_cmd = ["plink"]
        plink_cmd.append('--vcf')
        plink_cmd.append(variation_filepath)

        # plink_cmd.append('--recode12')
        # plink_cmd.append('transpose')
        # plink_cmd.append('--output-missing-genotype')
        # plink_cmd.append("0")
        plink_cmd.append('--freq')
        plink_cmd.append('--hardy')
        # plink_cmd.append('gz')

        plink_cmd.append('--out')
        plink_cmd.append(variation_filepath)
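        # Hedged example (not from the original source) of the assembled
        # command for a hypothetical /scratch/variation.vcf:
        #   plink --vcf /scratch/variation.vcf --freq --hardy --out /scratch/variation.vcf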

        print("PLINK arguments: {}".format(plink_cmd))

        plink_output = {
            "errors": [],
            "warnings": []
            # "notes" : []
        }
        p = subprocess.Popen(plink_cmd,
                             cwd=file_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        while True:
            line = p.stdout.readline()
            if not line:
                break
            # log(line)
            tokens = line.split(':')
            if (tokens[0] == 'Error'):
                plink_output['errors'].append(line)
                raise ValueError('PLINK 1.9 error: ' + line)
            elif (tokens[0] == 'Warning'):
                plink_output['warnings'].append(line)
                print(line)
            # elif(tokens[0] == 'Note'):
            #     plink_output['notes'].append(line)
            #     print(line)

        p.stdout.close()
        p.wait()
        plink_output_filepath = os.path.join(file_output_directory,
                                             'plink_cli_output.txt')
        with open(plink_output_filepath, 'w') as plink:
            for data in plink_output:
                plink.write("{}: {}\n".format(data, plink_output[data]))

        plink_output_files = [
            f for f in os.listdir(self.scratch)
            if f.startswith(os.path.basename(variation_filepath) + '.')
        ]

        for file in plink_output_files:
            shutil.move(os.path.join(self.scratch, file),
                        file_output_directory)

        if p.returncode != 0:
            log("PLINK encountered an error during runtime.  Please see log file."
                )

        variation_filename = os.path.basename(variation_filepath)
        base_filepath = os.path.join(file_output_directory, variation_filename)
        freq_filepath = base_filepath + '.frq'

        maf_script_filepath = '/kb/module/lib/VariationImporter/Utils/MAF_check.R'
        hwe_script_filepath = '/kb/module/lib/VariationImporter/Utils/HWE.R'
        log("Frequency filepath: {}".format(freq_filepath))
        # TODO: make function to do Rscript calls.
        # generate visualizations and store in directory
        maf_command = ['Rscript']
        maf_command.append('--no-save')
        maf_command.append('--vanilla')
        maf_command.append(maf_script_filepath)
        maf_command.append(freq_filepath)
        maf_command.append("Minor Allele Frequencies.png")
        r = subprocess.Popen(maf_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        r.wait()
        if r.returncode != 0:
            log("Error creating MAF histogram in R")

        hwe_filepath = base_filepath + '.hwe'
        zoom_filepath = hwe_filepath + '.zoom'
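        # The awk filter below keeps rows of the PLINK .hwe output whose
        # p-value (column 9) is below 1e-5; these rows feed the zoomed-in
        # Hardy-Weinberg Equilibrium plot.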
        zoom_command = '''awk '{{ if ($9 < 0.00001) print $0 }}' {} > {}'''.format(
            hwe_filepath, zoom_filepath)
        try:
            z = subprocess.Popen(zoom_command,
                                 cwd=file_output_directory,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT,
                                 shell=True)
            z.wait()

            if z.returncode != 0:
                log("Error creating HWE zoom file.")

        except Exception as e:
            log("Error creating zoom HWE file: {}".format(e))

        hwe_command = ['Rscript']
        hwe_command.append('--no-save')
        hwe_command.append('--vanilla')
        hwe_command.append(hwe_script_filepath)
        hwe_command.append(hwe_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium.png")
        hwe_command.append(zoom_filepath)
        hwe_command.append("Hardy-Weinberg Equilibrium Zoom.png")
        print("MAF command: {}".format(hwe_command))
        h = subprocess.Popen(hwe_command,
                             cwd=image_output_directory,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=False)
        h.wait()

        if h.returncode != 0:
            log("Error generating HWE Zoom plot")

        return {
            'stats_file_dir': file_output_directory,
            'stats_img_dir': image_output_directory
        }

    def _save_variation_to_ws(self, workspace_name, variation_obj,
                              variation_filepath, kinship_matrix):
        ws_id = self.dfu.ws_name_to_id(workspace_name)
        try:
            vcf_shock_return = self.dfu.file_to_shock({
                'file_path': variation_filepath,
                'make_handle': 1,
                'pack': 'gzip'
            })
        except Exception as e:
            print("Error uploading file to shock!")
            raise ValueError(e)

        variation_obj['variation_file_reference'] = vcf_shock_return.get(
            'shock_id')

        info = self.dfu.save_objects({
            'id':
            ws_id,
            'objects': [{
                'type': 'KBaseGwasData.Variations',
                'data': variation_obj,
                'name': 'TestVariationImporterName'
            }]
        })[0]

        variation_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        log("Variation reference created: {}".format(variation_ref))
        return variation_ref

    def validate_vcf(self, params):
        """
            :param params: dict containing all input parameters.
        """

        returnVal = {}
        valid_vcf_file = True

        try:
            vcf_filepath = self.pretend_download_staging_file(
                params['staging_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area.".format(
                params['staging_file_subdir_path']))

        try:
            location_filepath = self.pretend_download_staging_file(
                params['location_file_subdir_path'],
                self.scratch).get('copy_file_path')

        except Exception as e:
            raise Exception("Unable to download {} from staging area.".format(
                params['location_file_subdir_path']))

        # Check file size
        log("{} file size: {}".format(vcf_filepath,
                                      os.path.getsize(vcf_filepath)))
        log('\nValidating {}...'.format(vcf_filepath))

        vcf_version, vcf_contigs, vcf_genotypes = self._get_version_contigs_genotypes(
            vcf_filepath)

        if not vcf_contigs:
            log("No contig data in {} header.".format(vcf_filepath))
            raise ValueError(
                "No contig data in {} header.".format(vcf_filepath))

        if (vcf_version < 4.1):
            log("VCF file is version {}.  Must be at least version 4.1".format(
                vcf_version))
            raise ValueError(
                "VCF file is version {}.  Must be at least version 4.1".format(
                    vcf_version))

        # Generate population object
        population = self._generate_population(location_filepath,
                                               vcf_genotypes)

        # Retrieve Assembly object reference associated with genome.
        try:
            assembly_ref = self._get_assembly_ref_from_genome(
                params['genome_ref'])
        except Exception as e:
            print("Unable to retrieve {}".format(params['genome_ref']))
            raise ValueError(e)

        # Retrieve contig list from Assembly object.
        try:
            assembly_contigs = self._get_contigs_from_assembly(assembly_ref)
        except Exception as e:
            print("Unable to retrieve contigs from Assembly ref: {}".format(
                assembly_ref))
            raise ValueError(e)

        log("Length of assembly contigs: {}".format(len(assembly_contigs)))
        # Compare contig IDs from VCF to those in the Assembly object
        invalid_contigs = []
        for contig in vcf_contigs:
            if contig not in assembly_contigs.keys():
                invalid_contigs.append(contig)

        if invalid_contigs:
            log("Invalid contig IDs found in {}".format(vcf_filepath))
            valid_contig_filepath = os.path.join(self.scratch,
                                                 'valid_contigs.txt')
            log("Writing valid contigs to file: {}".format(
                valid_contig_filepath))
            with open(valid_contig_filepath, 'w') as icf:
                for contig in assembly_contigs:
                    icf.write(contig + '\n')
            valid_vcf_file = False

        validation_output_filepath, returncode = self._validate_vcf(
            vcf_filepath, vcf_version)

        if returncode != 0:
            valid_vcf_file = False

        kinship_matrix = self._create_fake_kinship_matrix()

        variation_obj_ref = ''
        if valid_vcf_file:
            variation_object = {
                "genome": params['genome_ref'],
                "population": population,
                "contigs": vcf_contigs,
                "comment": "Comments go here",
                "assay": "Assay data goes gere.",
                "originator": "PI/Lab info goes here",
                "pubmed_id": "PubMed ID goes here",
                "kinship_info": kinship_matrix
            }

            variation_obj_ref = self._save_variation_to_ws(
                params['workspace_name'], variation_object, vcf_filepath,
                kinship_matrix)

        log("Variation object reference: {}".format(variation_obj_ref))
        variation_report_metadata = {
            'valid_variation_file': valid_vcf_file,
            'variation_obj_ref': variation_obj_ref,
            'variation_filename': os.path.basename(vcf_filepath),
            'validation_output_filepath': validation_output_filepath,
            'vcf_version': vcf_version,
            'num_genotypes': len(vcf_genotypes),
            'num_contigs': len(vcf_contigs),
            'invalid_contigs': invalid_contigs
        }

        returnVal = self._generate_report(params, variation_report_metadata,
                                          vcf_filepath)

        return returnVal
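
    # A hedged usage sketch (not part of the original module), assuming the
    # utility was constructed with the usual params dict; the staging file
    # names below are hypothetical, and '18590/2/8' is the Arabidopsis genome
    # ref mentioned above:
    #
    #   importer = variation_importer_utils(utility_params)
    #   result = importer.validate_vcf({
    #       'workspace_name': 'my_workspace',
    #       'genome_ref': '18590/2/8',
    #       'staging_file_subdir_path': 'variation.vcf.gz',
    #       'location_file_subdir_path': 'locations.tsv',
    #       'additional_output_type': 'None'
    #   })
    #   # result contains report_name, report_ref and variation_ref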