Example #1
 def load_test_genome_with_cache(self, filename, gbff_cache_filename):
     """ cache filename needs to in scratch space """
     with open(filename, 'r') as file:
         data_str = file.read()
     data = json.loads(data_str)
     # build save_objects params (the object is saved below, after the GenBank handle is attached)
     save_info = {
         'workspace':
         self.getWsName(),
         'objects': [{
             'type': 'KBaseGenomes.Genome',
             'data': data,
             'name': 'e_coli'
         }]
     }
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     shock_file = dfu.file_to_shock({
         'file_path': gbff_cache_filename,
         'make_handle': 1
     })
     data['genbank_handle_ref'] = shock_file['handle']['hid']
     # save to ws
     save_info['objects'][0]['name'] = 'e_coli_with_genbank'
     result = self.ws.save_objects(save_info)
     info = result[0]
     ref = str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
     print('created test genome with gbff cache: ' + ref + ' from file ' +
           filename)
     return ref
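# Usage sketch (not in the original snippet): how a test might call the helper above.
# The fixture path and the cache path are hypothetical; as the docstring notes, the
# GBFF cache file must live in scratch space.
def _example_load_cached_genome(test_case):
    genome_json = '/kb/module/test/data/e_coli_genome.json'   # hypothetical fixture
    gbff_cache = '/kb/module/work/tmp/e_coli_cache.gbff.gz'   # hypothetical cache in scratch
    return test_case.load_test_genome_with_cache(genome_json, gbff_cache)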
Example #2
    def __init__(self, config):
        self.callback_url = config['SDK_CALLBACK_URL']
        self.token = config['KB_AUTH_TOKEN']
        self.scratch = config['scratch']

        self.dfu = DataFileUtil(self.callback_url)
        self.ru = ReadsUtils(self.callback_url)
Example #3
    def UploadFromMEME(self, ctx, params):
        """
        :param params: instance of type "UploadGibbsInParams" -> structure:
           parameter "path" of String, parameter "ws_name" of String,
           parameter "obj_name" of String
        :returns: instance of type "UploadOutput" -> structure: parameter
           "obj_ref" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN UploadFromMEME
        print('Extracting motifs')
        motifList = MU.parse_meme_output(params['path'])
        print(motifList)

        MSO = {}
        MSO['Condition'] = 'Temp'
        MSO['SequenceSet_ref'] = '123'
        MSO['Motifs'] = []
        MSO['Alphabet'] = ['A', 'C', 'G', 'T']
        MSO['Background'] = {}
        for letter in MSO['Alphabet']:
            MSO['Background'][letter] = 0.0

        MSU.parseMotifList(motifList, MSO)
        MSU.CheckLength(MSO, params['min_len'], params['max_len'])
        if 'absolute_locations' in params:
            for motif in MSO['Motifs']:
                for loc in motif['Motif_Locations']:
                    if loc['sequence_id'] in params['absolute_locations']:
                        loc['sequence_id'] = params['contig']
                        absStart = int(params['start'])
                        loc['start'] = absStart
                        loc['end'] = absStart + loc['end']

        dfu = DataFileUtil(self.callback_url)
        save_objects_params = {}
        save_objects_params['id'] = dfu.ws_name_to_id(params['ws_name'])
        save_objects_params['objects'] = [{
            'type': 'KBaseGeneRegulation.MotifSet',
            'data': MSO,
            'name': params['obj_name']
        }]

        info = dfu.save_objects(save_objects_params)[0]
        print('SAVED OBJECT')
        print(info)
        motif_set_ref = "%s/%s/%s" % (info[6], info[0], info[4])
        print(motif_set_ref)
        output = {'obj_ref': motif_set_ref}
        print(output)

        #END UploadFromMEME

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method UploadFromMEME return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
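# Usage sketch (illustrative, not from the original module): the params a caller would
# pass to UploadFromMEME. Note that 'min_len' and 'max_len' are required by the
# CheckLength() call above even though the docstring omits them; all values are hypothetical.
def _example_upload_from_meme_params():
    return {
        'path': '/kb/module/work/tmp/meme_out/meme.txt',  # hypothetical MEME output file
        'ws_name': 'my_workspace',                        # hypothetical workspace name
        'obj_name': 'my_motif_set',
        'min_len': 8,
        'max_len': 24,
    }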
Example #4
def make_fake_expression(callback_url, dummy_file, name, genome_ref,
                         annotation_ref, alignment_ref, ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqExpression object and returns a ref to it.
    genome_ref: reference to a genome object
    annotation_ref: reference to a KBaseRNASeq.GFFAnnotation
    alignment_ref: reference to a KBaseRNASeq.RNASeqAlignment
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    exp = {
        "id": "fake",
        "type": "fake",
        "numerical_interpretation": "fake",
        "expression_levels": {
            "feature_1": 0,
            "feature_2": 1,
            "feature_3": 2
        },
        "genome_id": genome_ref,
        "annotation_id": annotation_ref,
        "mapped_rnaseq_alignment": {
            "id1": alignment_ref
        },
        "condition": "",
        "tool_used": "none",
        "tool_version": "0.0.0",
        "file": dummy_shock_info['handle']
    }
    return make_fake_object(exp, "KBaseRNASeq.RNASeqExpression", name, ws_name,
                            ws_client)
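# Usage sketch (not from the original test suite): all object references and the dummy
# file path below are hypothetical; ws_client is a Workspace client as in the other examples.
def _example_make_fake_expression(callback_url, ws_client):
    return make_fake_expression(callback_url,
                                '/kb/module/work/tmp/dummy_expression.txt',  # small dummy file
                                'fake_expression_1',
                                '12345/1/1',   # genome ref (hypothetical)
                                '12345/2/1',   # GFFAnnotation ref (hypothetical)
                                '12345/3/1',   # RNASeqAlignment ref (hypothetical)
                                'my_workspace',
                                ws_client)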
Example #5
def download_genome_to_json_files(token, genome_ref, target_dir):
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    file_name_to_data_map = {}
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                       token=token,
                       service_ver='dev')
    genome_data = dfu.get_objects({'object_refs': [genome_ref]})['data'][0]
    genome_obj = genome_data['data']
    genome_meta = genome_data['info'][10]
    file_name_to_data_map["genome.json"] = genome_obj
    file_name_to_data_map["genome.meta.json"] = genome_meta
    if 'genbank_handle_ref' in genome_obj:
        gbk_file_name = "genome.gbk"
        dfu.shock_to_file({
            'handle_id': genome_obj['genbank_handle_ref'],
            'file_path': os.path.join(target_dir, gbk_file_name)
        })
        genome_obj['genbank_handle_ref'] = gbk_file_name
    if 'contigset_ref' in genome_obj:
        contigset_data = dfu.get_objects(
            {'object_refs': [genome_obj['contigset_ref']]})['data'][0]
        contigset_obj = contigset_data['data']
        contigset_meta = contigset_data['info'][10]
        file_name_to_data_map["contigset.json"] = contigset_obj
        file_name_to_data_map["contigset.meta.json"] = contigset_meta
        genome_obj['contigset_ref'] = "contigset.json"
    elif 'assembly_ref' in genome_obj:
        assembly_data = dfu.get_objects(
            {'object_refs': [genome_obj['assembly_ref']]})['data'][0]
        assembly_obj = assembly_data['data']
        assembly_meta = assembly_data['info'][10]
        file_name_to_data_map["assembly.json"] = assembly_obj
        file_name_to_data_map["assembly.meta.json"] = assembly_meta
        genome_obj['assembly_ref'] = "assembly.json"
        fasta_handle_ref = assembly_obj['fasta_handle_ref']
        fasta_file_name = "assembly.fa"
        dfu.shock_to_file({
            'handle_id':
            fasta_handle_ref,
            'file_path':
            os.path.join(target_dir, fasta_file_name)
        })
        assembly_obj['fasta_handle_ref'] = fasta_file_name
        assembly_obj['external_source_id'] = fasta_file_name
        if 'taxon_ref' in assembly_obj:
            taxon_obj = dfu.get_objects(
                {'object_refs':
                 [assembly_obj['taxon_ref']]})['data'][0]['data']
            file_name_to_data_map["taxon.json"] = taxon_obj
            assembly_obj['taxon_ref'] = "taxon.json"
            if 'taxon_ref' in genome_obj:
                genome_obj['taxon_ref'] = "taxon.json"
            taxon_obj['parent_taxon_ref'] = ""
    for target_file_name in file_name_to_data_map:
        with open(os.path.join(target_dir, target_file_name), 'w') as f:
            json.dump(file_name_to_data_map[target_file_name],
                      f,
                      sort_keys=True,
                      indent=4)
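# Usage sketch (not part of the original source): dumps a genome and its linked objects
# as JSON files into a target directory; the ref and directory below are hypothetical.
def _example_download_genome_json(token):
    download_genome_to_json_files(token, '12345/6/1', '/kb/module/work/tmp/genome_json_dump')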
Example #6
 def test_shock_handle_ws(self):
     test_phrase = "Hi there!"
     path_to_temp_file = "/kb/module/work/tmp/temp_" + str(
         time.time()) + ".fq"
     self.textToFile(test_phrase, path_to_temp_file)
     dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'],
                        token=self.ctx['token'])
     uploaded = dfu.file_to_shock({
         'file_path': path_to_temp_file,
         'make_handle': 1
     })
     fhandle = uploaded['handle']
     self.assertTrue('hid' in fhandle, "Handle: " + str(fhandle))
     data = {'hid': fhandle['hid']}
     obj_name = 'TestObject.1'
     info = self.getWsClient().save_objects({
         'workspace':
         self.getWsName(),
         'objects': [{
             'type': 'Empty.AHandle',
             'data': data,
             'name': obj_name
         }]
     })[0]
     self.assertEqual(info[1], obj_name)
     ref = self.getWsName() + '/' + obj_name
     handle_data = self.getWsClient().get_objects([{'ref': ref}])[0]['data']
     self.assertTrue('hid' in handle_data, "Data: " + str(handle_data))
     hid = handle_data['hid']
     path_to_temp_file2 = "/kb/module/work/tmp/temp2_" + str(
         time.time()) + ".fq"
     dfu.shock_to_file({'handle_id': hid, 'file_path': path_to_temp_file2})
     self.assertEqual(test_phrase, self.fileToText(path_to_temp_file2))
Example #7
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.__LOGGER = logging.getLogger('KBaseRNASeq')
        if 'log_level' in config:
            self.__LOGGER.setLevel(config['log_level'])
        else:
            self.__LOGGER.setLevel(logging.INFO)
        streamHandler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter(
            "%(asctime)s - %(filename)s - %(lineno)d - \
                                       %(levelname)s - %(message)s")
        formatter.converter = time.gmtime
        streamHandler.setFormatter(formatter)
        self.__LOGGER.addHandler(streamHandler)
        self.__LOGGER.info("Logger was set")

        script_utils.check_sys_stat(self.__LOGGER)

        self.scratch = config['scratch']
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.ws_url = config['workspace-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.samtools = SamTools(config)
        #END_CONSTRUCTOR
        pass
Example #8
    def BuildFastaFromSequenceSet(self, ctx, params):
        """
        :param params: instance of type "BuildSeqIn" -> structure: parameter
           "workspace_name" of String, parameter "SequenceSetRef" of String,
           parameter "fasta_outpath" of String
        :returns: instance of type "BuildSeqOut" -> structure: parameter
           "fasta_outpath" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN BuildFastaFromSequenceSet
        dfu = DataFileUtil(self.callback_url)
        get_objects_params = {'object_refs': [params['SequenceSetRef']]}
        SeqSet = dfu.get_objects(get_objects_params)['data'][0]['data']

        with open(params['fasta_outpath'], 'w') as outFile:
            for s in SeqSet['sequences']:
                outFile.write('>' + s['sequence_id'] + '\n')
                outFile.write(s['sequence'] + '\n')
        output = {'fasta_outpath': params['fasta_outpath']}

        #END BuildFastaFromSequenceSet

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method BuildFastaFromSequenceSet return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
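# Usage sketch (illustrative only): the params a caller would pass to
# BuildFastaFromSequenceSet; the SequenceSet reference and output path are hypothetical.
def _example_build_fasta_params():
    return {
        'workspace_name': 'my_workspace',                     # hypothetical workspace
        'SequenceSetRef': '12345/7/1',                        # hypothetical SequenceSet object
        'fasta_outpath': '/kb/module/work/tmp/sequences.fa',
    }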
Example #9
def make_fake_alignment(callback_url, dummy_file, name, reads_ref, genome_ref,
                        ws_name, ws_client):
    """
    Makes a Fake KBaseRNASeq.RNASeqAlignment object and returns a ref to it.
    callback_url: needed for DataFileUtil,
    dummy_file: path to some dummy "alignment" file (make it small - needs to be uploaded to shock)
    name: the name of the object
    reads_ref: a reference to a valid (probably fake) reads library
    genome_ref: a reference to a valid (also probably fake) genome
    workspace_name: the name of the workspace to save this object
    workspace_client: a Workspace client tuned to the server of your choice
    """
    dfu = DataFileUtil(callback_url)
    dummy_shock_info = dfu.file_to_shock({
        "file_path": dummy_file,
        "make_handle": 1
    })
    fake_alignment = {
        "file": dummy_shock_info['handle'],
        "library_type": "fake",
        "read_sample_id": reads_ref,
        "condition": "fake",
        "genome_id": genome_ref
    }
    return make_fake_object(fake_alignment, "KBaseRNASeq.RNASeqAlignment",
                            name, ws_name, ws_client)
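# Usage sketch (not from the original test suite): the refs and dummy file path are hypothetical.
def _example_make_fake_alignment(callback_url, ws_client):
    return make_fake_alignment(callback_url,
                               '/kb/module/work/tmp/dummy_alignment.bam',  # small dummy file
                               'fake_alignment_1',
                               '12345/4/1',   # reads library ref (hypothetical)
                               '12345/1/1',   # genome ref (hypothetical)
                               'my_workspace',
                               ws_client)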
Example #10
 def run_skip(self, reads_file):
     """
     Doesn't run RQCFilter, but a dummy skip version. It returns the same
     result structure, so it doesn't derail the other pipeline steps. However, the
     "filtered_fastq_file" is the unchanged fastq file, other than gzipping it.
     run_log is just an empty (but existing!) file.
     """
     print("NOT running RQCFilter, just putting together some results.")
     # make the dummy output dir
     outdir = os.path.join(
         self.scratch_dir,
         "dummy_rqcfilter_output_{}".format(int(time() * 1000)))
     mkdir(outdir)
     # mock up a log file
     dummy_log = os.path.join(outdir, "dummy_rqcfilter_log.txt")
     open(dummy_log, 'w').close()
     # just compress the reads and move them into that output dir (probably don't need to
     # move them, but let's be consistent)
     dfu = DataFileUtil(self.callback_url)
     compressed_reads = dfu.pack_file({
         "file_path": reads_file,
         "pack": "gzip"
     })["file_path"]
     base_name = os.path.basename(compressed_reads)
     not_filtered_reads = os.path.join(outdir, base_name)
     os.rename(compressed_reads, not_filtered_reads)
     return {
         "output_directory": outdir,
         "filtered_fastq_file": not_filtered_reads,
         "run_log": dummy_log,
         "command":
         "BBTools.run_RQCFilter_local -- skipped. No command run.",
         "version_string": "KBase BBTools module"
     }
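# Usage sketch (not in the original snippet): run_skip only gzips the input reads and
# fabricates an RQCFilter-shaped result. 'runner' stands for an instance of the class
# above; the FASTQ path is hypothetical.
def _example_run_skip(runner):
    result = runner.run_skip('/kb/module/work/tmp/reads.fastq')
    print(result['filtered_fastq_file'])  # gzipped copy of the input reads
    print(result['command'])              # records that RQCFilter was skipped
    return result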
Example #11
    def _proc_ws_obj_params(self, ctx, params):
        """
        Check the validity of workspace and object params and return them
        """
        dst_ref = params.get(self.PARAM_IN_DST_REF)

        ws_name_id, obj_name_id = os.path.split(dst_ref)

        if not bool(ws_name_id.strip()) or ws_name_id == '/':
            raise ValueError("Workspace name or id is required in " +
                             self.PARAM_IN_DST_REF)

        if not bool(obj_name_id.strip()):
            raise ValueError("Object name or id is required in " +
                             self.PARAM_IN_DST_REF)

        dfu = DataFileUtil(self.callback_url)

        if not isinstance(ws_name_id, int):

            try:
                ws_name_id = dfu.ws_name_to_id(ws_name_id)
            except DFUError as se:
                prefix = str(se).split('.')[0]
                raise ValueError(prefix)

        self.__LOGGER.info('Obtained workspace name/id ' + str(ws_name_id))

        return ws_name_id, obj_name_id
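# Worked example (illustrative, not from the original module): _proc_ws_obj_params expects
# the destination ref as "workspace/object" and separates the two parts with os.path.split().
import os

def _example_split_dst_ref():
    ws_name_id, obj_name_id = os.path.split('my_workspace/my_object')  # hypothetical ref
    assert (ws_name_id, obj_name_id) == ('my_workspace', 'my_object')
    return ws_name_id, obj_name_id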
Example #12
    def __init__(self, config):
        """

        :param config:
        :param logger:
        :param directory: Working directory
        :param urls: Service urls
        """
        # BEGIN_CONSTRUCTOR
        self.ws_url = config["workspace-url"]
        self.callback_url = config['SDK_CALLBACK_URL']
        self.srv_wiz_url = config['srv-wiz-url']
        self.token = config['KB_AUTH_TOKEN']
        self.shock_url = config['shock-url']
        self.dfu = DataFileUtil(self.callback_url)
        self.gfu = GenomeFileUtil(self.callback_url)
        self.au = AssemblyUtil(self.callback_url)
        self.rau = ReadsAlignmentUtils(self.callback_url)
        self.set_api = SetAPI(self.srv_wiz_url, service_ver='dev')
        self.eu = ExpressionUtils(self.callback_url)
        self.ws = Workspace(self.ws_url, token=self.token)

        self.scratch = os.path.join(config['scratch'], str(uuid.uuid4()))
        self._mkdir_p(self.scratch)

        self.tool_used = "Cufflinks"
        self.tool_version = os.environ['VERSION']
        # END_CONSTRUCTOR
        pass
Example #13
    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"
Example #14
    def _upload_report(self, report_dir, file_links, workspace_name,
                       saved_objects):
        dfu = DataFileUtil(self.callback_url)
        upload_info = dfu.file_to_shock({
            'file_path': report_dir,
            'pack': 'zip'
        })
        shock_id = upload_info['shock_id']

        report_params = {
            'message':
            'JGI metagenome assembly report',
            'direct_html_link_index':
            0,
            'html_links': [{
                'shock_id': shock_id,
                'name': 'index.html',
                'description': 'assembly report'
            }],
            'file_links':
            file_links,
            'report_object_name':
            'JGI_assembly_pipeline.' + str(uuid.uuid4()),
            'workspace_name':
            workspace_name,
            'objects_created':
            saved_objects
        }

        report_client = KBaseReport(self.callback_url)
        report = report_client.create_extended_report(report_params)
        return {'report_ref': report['ref'], 'report_name': report['name']}
Example #15
    def test_simple_upload(self):
        genomeFileUtil = self.getImpl()

        ### Test for a Local Function Call - file needs to be just on disk
        tmp_dir = self.__class__.cfg['scratch']
        #file_name = "GCF_000005845.2_ASM584v2_genomic.gbff.gz"
        #shutil.copy(os.path.join("data", file_name), tmp_dir)
        gbk_path = self.getTempGenbank()  # os.path.join(tmp_dir, file_name)
        print('attempting upload via local function directly')
        ws_obj_name = 'MyGenome'
        result = genomeFileUtil.genbank_to_genome_annotation(self.getContext(), 
            {
                'file_path':gbk_path,
                'workspace_name':self.getWsName(),
                'genome_name':ws_obj_name
            })
        pprint(result)
        # todo: add test that result is correct

        ### Test for upload from SHOCK - upload the file to shock first
        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'], 
                                token=self.__class__.ctx['token'],
                                service_ver='dev')
        shock_id = data_file_cli.file_to_shock({'file_path': gbk_path})['shock_id']
        ws_obj_name2 = 'MyGenome.2'
        result2 = genomeFileUtil.genbank_to_genome_annotation(self.getContext(), 
            {
                'shock_id':shock_id,
                'workspace_name':self.getWsName(),
                'genome_name':ws_obj_name2,
                'convert_to_legacy':1
            })
        pprint(result2)
Example #16
    def test_reannotate_RICKETS(self):
        # earlier test refs, superseded by the final assignment:
        # genome_ref = '31932/5/1'
        # genome_ref = '32038/3/2'
        genome_ref = '32132/5/1'
        genome_name = 'Aceti'
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.dfu = DataFileUtil(self.callback_url)

        result = self.getImpl().annotate(
            self.getContext(), {
                "object_ref": genome_ref,
                "output_workspace": self.getWsName(),
                "output_genome_name": genome_name,
                "evalue": None,
                "fast": 0,
                "gcode": 0,
                "genus": "genus",
                "kingdom": "Bacteria",
                "metagenome": 0,
                "mincontiglen": 1,
                "norrna": 0,
                "notrna": 0,
                "rawproduct": 0,
                "rfam": 1,
                "scientific_name": "RhodoBacter"
            })[0]

        genome_data = self.dfu.get_objects(
            {"object_refs": [result['output_genome_ref']]})["data"][0]['data']
        scratch = "/kb/module/work/tmp/"

        with open(scratch + 'OUTPUT_GENOME.txt', 'w+') as outfile:
            json.dump(genome_data, outfile)
Example #17
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = config['scratch']
     self.shock_url = config['shock-url']
     self.dfu = DataFileUtil(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.setapi = SetAPI(self.callback_url)
     self.wss = workspaceService(config['workspace-url'])
Example #18
 def __init__(self, scratch_dir, callback_url, workspace_url, srv_wiz_url):
     self.scratch_dir = scratch_dir
     self.rau = ReadsAlignmentUtils(callback_url)
     self.kbr = KBaseReport(callback_url)
     self.dfu = DataFileUtil(callback_url)
     self.set_api = SetAPI(srv_wiz_url)
     self.ws = Workspace(workspace_url)
     self.valid_commands = ['bamqc', 'multi-bamqc']
Example #19
 def __init__(self, config):
     self.ws_url = config["workspace-url"]
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.shock_url = config['shock-url']
     self.dfu = DataFileUtil(self.callback_url)
     self.ws = Workspace(self.ws_url, token=self.token)
     self.scratch = config['scratch']
Example #20
 def __init__(self, config):
     log('--->\nInitializing RNASeqDownloaderUtils instance:\n config: %s' %
         config)
     self.scratch = config['scratch']
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url, token=self.token)
     self.rau = ReadsAlignmentUtils(self.callback_url, token=self.token)
Example #21
 def __init__(self, config):
     #BEGIN_CONSTRUCTOR
     self.workspaceURL = config['workspace-url']
     self.scratch = os.path.abspath(config['scratch'])
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.dfu = DataFileUtil(self.callback_url)
     #END_CONSTRUCTOR
     pass
Example #22
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.token = config['KB_AUTH_TOKEN']
     self.scratch = os.path.join(config['scratch'], 'import_GenBank_' + str(uuid.uuid4()))
     handler_utils._mkdir_p(self.scratch)
     self.dfu = DataFileUtil(self.callback_url)
     self.gfu = GenomeFileUtil(self.callback_url, service_ver='dev')
     self.uploader_utils = UploaderUtil(config)
Example #23
    def Xtest_modify_old_genome(self):
        self.callback_url = os.environ["SDK_CALLBACK_URL"]
        self.gfu = GenomeFileUtil(self.callback_url)
        self.dfu = DataFileUtil(self.callback_url)
        old_genome = "30045/15/1"

        new_genome = "30045/14/1"
        genome_name = 'OldRhodo'
        genome_data_old = self.dfu.get_objects({"object_refs":
                                                [old_genome]})["data"][0]
        genome_data_new = self.dfu.get_objects({"object_refs":
                                                [new_genome]})["data"][0]

        sso_1 = {
            "id": "1",
            "evidence": [],
            "term_name": "1",
            "ontology_ref": "1",
            "term_lineage": []
        }

        sso_2 = {
            "id": "2",
            "evidence": [],
            "term_name": "2",
            "ontology_ref": "2",
            "term_lineage": []
        }

        sso_terms = {'SSO1': sso_1, 'SSO2': sso_2}

        print("ABOUT TO MODIFY OLD GENOME")
        for i, item in enumerate(genome_data_old['data']['features']):
            genome_data_old['data']['features'][i]['ontology_terms'] = {
                "SSO": sso_terms
            }

        print("ABOUT TO MODIFY NEW GENOME")
        for i, item in enumerate(genome_data_new['data']['features']):
            genome_data_new['data']['features'][i]['ontology_terms'] = {
                "SSO": sso_terms
            }

        print("ABOUT TO SAVE OLD GENOME")
        info = self.gfu.save_one_genome({
            "workspace": self.getWsName(),
            "name": genome_name,
            "data": genome_data_old["data"],
            "provenance": self.ctx.provenance()
        })["info"]

        print("ABOUT TO SAVE NEW GENOME")
        info = self.gfu.save_one_genome({
            "workspace": self.getWsName(),
            "name": genome_name,
            "data": genome_data_new["data"],
            "provenance": self.ctx.provenance()
        })["info"]
Example #24
 def __init__(self, config, logger=None):
     self.config = config
     self.logger = logger
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.scratch = os.path.join(config['scratch'], 'EM_' + str(uuid.uuid4()))
     self.ws_url = config['workspace-url']
     self.ws_client = Workspace(self.ws_url)
     self.dfu = DataFileUtil(self.callback_url)
     pass
Example #25
 def __init__(self, config):
     #BEGIN_CONSTRUCTOR
     self.utils = Utils(config)
     self.scratch = config['scratch']
     self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     self.ws = Workspace(config['workspace-url'])
     logging.basicConfig(level=logging.INFO)
     #END_CONSTRUCTOR
     pass
Example #26
 def __init__(self, config):
     self.callback_url = config['SDK_CALLBACK_URL']
     self.scratch = os.path.join(config['scratch'],
                                 'import_assembly_' + str(uuid.uuid4()))
     handler_utils._mkdir_p(self.scratch)
     self.token = config['KB_AUTH_TOKEN']
     self.dfu = DataFileUtil(self.callback_url)
     self.au = AssemblyUtil(self.callback_url)
     self.uploader_utils = UploaderUtil(config)
Example #27
    def test_simple_upload(self):
        # fetch the test files and set things up
        genomeFileUtil = self.getImpl()
        gbk_path = "data/GCF_000005845.2_ASM584v2_genomic.gbff"

        ### Test for a Local Function Call
        print('attempting upload via local function directly')
        ws_obj_name = 'MyGenome'
        result = genomeFileUtil.genbank_to_genome(self.getContext(), 
            {
                'file' : { 'path': gbk_path },
                'workspace_name':self.getWsName(),
                'genome_name':ws_obj_name
            })[0]
        pprint(result)
        self.assertIsNotNone(result['genome_ref'])
        target_dir = os.path.join("/kb/module/work/tmp", "GCF_000005845")
        download_genome_to_json_files(self.getContext()['token'], result['genome_ref'],
                                      target_dir)
        #self.assertEqual(0, len(compare_genome_json_files(target_dir, 
        #                                                  os.path.join("/kb/module/test/data", 
        #                                                               "GCF_000005845"))))
        # todo: add test that result is correct

        ### Test for upload from SHOCK - upload the file to shock first
        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'], 
                                token=self.__class__.ctx['token'],
                                service_ver='dev')
        shutil.copy(gbk_path, self.__class__.cfg['scratch'])
        shock_id = data_file_cli.file_to_shock({
            'file_path': os.path.join(self.__class__.cfg['scratch'], gbk_path.split("/")[-1])
        })['shock_id']
        ws_obj_name2 = 'MyGenome.2'
        result2 = genomeFileUtil.genbank_to_genome(self.getContext(), 
            {
                'file': {'shock_id':shock_id},
                'workspace_name':self.getWsName(),
                'genome_name':ws_obj_name2,
            })[0]
        pprint(result2)
        self.assertIsNotNone(result['genome_ref'])
        # todo: add test that result is correct

        ### Test for upload via FTP- use something from genbank
        print('attempting upload through ftp url')
        ws_obj_name3 = 'MyGenome.3'
        result3 = genomeFileUtil.genbank_to_genome(self.getContext(), 
            {
                'file':{'ftp_url': self.__class__.TEST_ECOLI_FILE_FTP},
                'workspace_name': self.getWsName(),
                'genome_name': ws_obj_name3,
            })[0]
        pprint(result3)

        self.assertIsNotNone(result3['genome_ref'])
Example #28
 def __init__(self, config):
     #BEGIN_CONSTRUCTOR
     self.config = config
     self.scratch = config['scratch']
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.ws_url = config['workspace-url']
     self.ws_client = Workspace(self.ws_url)
     self.dfu = DataFileUtil(self.callback_url)
     #END_CONSTRUCTOR
     pass
Example #29
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.config = config
        self.scratch = os.path.abspath(config['scratch'])
        self.callbackURL = os.environ['SDK_CALLBACK_URL']
        # self.shared_folder = os.path.abspath(config['scratch'])
        self.dfu = DataFileUtil(self.callbackURL)

        #END_CONSTRUCTOR
        pass
Example #30
 def __init__(self, config):
     self.cfg = config
     self.scratch = config['scratch']
     self.callback_url = os.environ['SDK_CALLBACK_URL']
     self.dfu = DataFileUtil(self.callback_url)
     self.kbse = KBaseSearchEngine(config['search-url'])
     self.gen_api = GenericsAPI(self.callback_url)
     self.DEFAULT_ONTOLOGY_REF = "KbaseOntologies/Custom"
     self.DEFAULT_ONTOLOGY_ID = "Custom:Term"
     self.DEFAULT_UNIT_ID = "Custom:Unit"
Example #31
    def __init__(self, config):
        #BEGIN_CONSTRUCTOR
        self.callback_url = os.environ['SDK_CALLBACK_URL']
        self.dfu = DataFileUtil(self.callback_url)
        self.sj = sleep_job(self.callback_url)
        self.sac = simpleapp_client(self.callback_url)
        self.aj = alans_job(self.callback_url,service_ver='dev')

        #END_CONSTRUCTOR
        pass
Example #32
    def export_genome_annotation_as_genbank(self, ctx, params):
        """
        A method designed especially for download, this calls 'genome_annotation_to_genbank' to do
        the work, but then packages the output with WS provenance and object info into
        a zip file and saves to shock.
        :param params: instance of type "ExportParams" -> structure:
           parameter "input_ref" of String
        :returns: instance of type "ExportOutput" -> structure: parameter
           "shock_id" of String
        """
        # ctx is the context object
        # return variables are: output
        #BEGIN export_genome_annotation_as_genbank

        # validate parameters
        if 'input_ref' not in params:
            raise ValueError('Cannot export GenomeAnnotation - no input_ref field defined.')

        # get WS metadata to get ws_name and obj_name
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref': params['input_ref'] }],'includeMetadata':0, 'ignoreErrors':0})[0]

        # export to a file
        file = self.genome_annotation_to_genbank(ctx, { 
                            'genome_ref': params['input_ref'], 
                            'new_genbank_file_name': info[1]+'.gbk' })[0]

        # create the output directory and move the file there
        export_package_dir = os.path.join(self.sharedFolder, info[1])
        os.makedirs(export_package_dir)
        shutil.move(file['path'], os.path.join(export_package_dir, os.path.basename(file['path'])))

        # package it up and be done
        dfUtil = DataFileUtil(self.callback_url)
        package_details = dfUtil.package_for_download({
                                    'file_path': export_package_dir,
                                    'ws_refs': [ params['input_ref'] ]
                                })

        output = { 'shock_id': package_details['shock_id'] }

        #END export_genome_annotation_as_genbank

        # At some point might do deeper type checking...
        if not isinstance(output, dict):
            raise ValueError('Method export_genome_annotation_as_genbank return value ' +
                             'output is not type dict as required.')
        # return the results
        return [output]
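# Usage sketch (illustrative only): the method takes a single workspace reference and
# returns the Shock node id of the zipped export package; 'impl' and the ref are hypothetical.
def _example_export_genbank(impl, ctx):
    output = impl.export_genome_annotation_as_genbank(ctx, {'input_ref': '12345/8/2'})[0]
    return output['shock_id']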
Example #33
def upload_file_to_shock(logger,
                         filePath,
                         make_handle = True,
                         shock_service_url = None,
                         #attributes = '{}',
                         ssl_verify = True,
                         token = None):
    """
    Save a file to a SHOCK instance (delegates the upload to DataFileUtil.file_to_shock).
    """

    
    #shock_service_url is from config
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    #return dfu.file_to_shock({"file_path": filePath, "attributes": json.dumps(attributes), "make_handle": make_handle})
    return dfu.file_to_shock({"file_path": filePath, "make_handle": make_handle})
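# Usage sketch (not from the original module): upload_file_to_shock delegates to
# DataFileUtil.file_to_shock; the file path and token are hypothetical.
def _example_upload_to_shock(logger, token):
    info = upload_file_to_shock(logger, '/kb/module/work/tmp/reads.fastq', token=token)
    return info['handle']['hid']  # make_handle defaults to True, so a handle is returned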
Example #34
    def test_basic_upload_and_download(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'MyNewAssembly'
        result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                       {'file': {'path': fasta_path},
                                                        'workspace_name': self.getWsName(),
                                                        'assembly_name': ws_obj_name
                                                        })
        pprint(result)
        self.check_fasta_file(ws_obj_name, fasta_path)


        print('attempting upload through shock')
        data_file_cli = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        shock_id = data_file_cli.file_to_shock({'file_path': fasta_path})['shock_id']
        ws_obj_name2 = 'MyNewAssembly.2'
        result2 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                        {'shock_id': shock_id,
                                                         'workspace_name': self.getWsName(),
                                                         'assembly_name': ws_obj_name2
                                                         })
        pprint(result2)
        self.check_fasta_file(ws_obj_name2, fasta_path)

        print('attempting upload via ftp url')
        ftp_url = 'ftp://ftp.ensemblgenomes.org/pub/release-29/bacteria//fasta/bacteria_8_collection/acaryochloris_marina_mbic11017/dna/Acaryochloris_marina_mbic11017.GCA_000018105.1.29.dna.genome.fa.gz'
        ws_obj_name3 = 'MyNewAssembly.3'
        result3 = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                        {'ftp_url': ftp_url,
                                                         'workspace_name': self.getWsName(),
                                                         'assembly_name': ws_obj_name3
                                                         })
        pprint(result3)
        # todo: add checks here on ws object

        ws_obj_name3 = 'MyNewAssembly.3'
        result4 = assemblyUtil.export_assembly_as_fasta(self.getContext(),
                                                        {'input_ref': self.getWsName() + '/' + ws_obj_name3})
        pprint(result4)
Example #35
    def test_filtered_everything(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "legacy_test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'FilteredAssembly'
        result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                       {'file': {'path': fasta_path},
                                                        'workspace_name': self.getWsName(),
                                                        'assembly_name': ws_obj_name,
                                                        'min_contig_length': 500
                                                        })

        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        assembly = dfu.get_objects({'object_refs': [result[0]]})['data'][0]['data']
        self.assertEqual(assembly['dna_size'], 0)
        self.assertEqual(assembly['gc_content'], None)
        self.assertEqual(assembly['num_contigs'], 0)
Example #36
    def test_load_with_filter_and_options(self):
        assemblyUtil = self.getImpl()

        tmp_dir = self.__class__.cfg['scratch']
        file_name = "legacy_test.fna"
        shutil.copy(os.path.join("data", file_name), tmp_dir)
        fasta_path = os.path.join(tmp_dir, file_name)
        print('attempting upload')
        ws_obj_name = 'FilteredAssembly'
        result = assemblyUtil.save_assembly_from_fasta(self.getContext(),
                                                       {'file': {'path': fasta_path},
                                                        'workspace_name': self.getWsName(),
                                                        'assembly_name': ws_obj_name,
                                                        'min_contig_length': 9,
                                                        'external_source': 'someplace',
                                                        'external_source_id': 'id',
                                                        'external_source_origination_date': 'sunday',
                                                        'type': 'metagenome',
                                                        'contig_info': {'s3': {'is_circ': 0, 'description': 'somethin'}}
                                                        })

        dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        assembly = dfu.get_objects({'object_refs': [result[0]]})['data'][0]['data']

        self.assertEqual(len(assembly['contigs']), 1)
        self.assertEqual(assembly['contigs']['s3']['md5'], '4f339bd56e5f43ecb52e8682a790a111')
        self.assertEqual(assembly['contigs']['s3']['contig_id'], 's3')
        self.assertEqual(assembly['contigs']['s3']['length'], 18)
        self.assertEqual(assembly['contigs']['s3']['is_circ'], 0)
        self.assertEqual(assembly['contigs']['s3']['description'], 'somethin')

        self.assertEqual(assembly['dna_size'], 18)
        self.assertEqual(assembly['gc_content'], 0.44444)
        self.assertEqual(assembly['md5'], 'eba4d1771060e19671a56832d159526e')
        self.assertEqual(assembly['num_contigs'], 1)
        self.assertEqual(assembly['type'], 'metagenome')
        self.assertEqual(assembly['external_source'], 'someplace')
        self.assertEqual(assembly['external_source_id'], 'id')
        self.assertEqual(assembly['external_source_origination_date'], 'sunday')
Example #37
def download_file_from_shock(logger,
                             shock_service_url = None,
                             shock_id = None,
                             filename = None,
                             directory = None,
                             filesize = None,
                             token = None):
    """
    Given a SHOCK instance URL and a SHOCK node id, download the contents of that node
    to a file on disk.
    """

    if filename is not None:
        shockFileName = filename

    if directory is not None:
        filePath = os.path.join(directory, shockFileName)
    else:
        filePath = shockFileName

    #shock_service_url is from config
    dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'], token=token)
    return dfu.shock_to_file({"shock_id" : shock_id, "file_path":filePath, "unpack" : None})
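# Usage sketch (not from the original module): download_file_from_shock writes the node
# contents to directory/filename via DataFileUtil.shock_to_file; all values are hypothetical.
def _example_download_from_shock(logger, token):
    return download_file_from_shock(logger,
                                    shock_id='hypothetical-shock-node-id',
                                    filename='reads.fastq',
                                    directory='/kb/module/work/tmp',
                                    token=token)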
Example #38
    def ztest_aaa_upload_to_shock(self):

        print "upload ref data to shock staging"
        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        #file_path =  self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'  # alternatives: 'Phage_gene_catalog.tar.gz', 'PFAM_27.tar.gz'
        source_file_path = "/kb/module/work/" + input_file_name  # os.path.join(tmp_dir, input_file_name)

        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)

        print "file_path " + source_file_path+"\t"+target_file_path

        orig_size = os.path.getsize(source_file_path)

        shutil.copy(source_file_path, target_file_path)

        print "Testing "+target_file_path
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock(
            {'file_path': target_file_path})
        
        print(str(ret1))
        shock_id = ret1['shock_id']
        
        print "shock_id "+shock_id
        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')

        #ret2 = self.dfUtil.shock_to_file(
        #    {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})

        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)
Example #39
class FastaToAssembly:

    def __init__(self, callback_url, scratch):
        self.scratch = scratch
        self.dfu = DataFileUtil(callback_url)

        # Note added X due to kb|g.1886.fasta
        self.valid_chars = "-ACGTUWSMKRYBDHVNX"
        self.amino_acid_specific_characters = "PLIFQE"


    def import_fasta(self, ctx, params):
        print('validating parameters')
        self.validate_params(params)

        print('staging input files')
        fasta_file_path = self.stage_input(params)

        if 'min_contig_length' in params:
            min_contig_length = int(params['min_contig_length'])
            print('filtering fasta file by contig length (min len=' + str(min_contig_length) + 'bp)')
            fasta_file_path = self.filter_contigs_by_length(fasta_file_path, min_contig_length)

        print('parsing FASTA file: ' + str(fasta_file_path))
        assembly_data = self.parse_fasta(fasta_file_path, params)
        print(' - parsed ' + str(assembly_data['num_contigs']) + ' contigs, ' +
              str(assembly_data['dna_size']) + 'bp')

        print('saving assembly to KBase')

        # save file to shock and build handle
        fasta_file_handle_info = self.save_fasta_file_to_shock(fasta_file_path)
        # construct the output object
        assembly_object_to_save = self.build_assembly_object(assembly_data,
                                                             fasta_file_handle_info,
                                                             params)

        # save to WS and return
        if 'workspace_id' in params:
            workspace_id = int(params['workspace_id'])
        else:
            workspace_id = self.dfu.ws_name_to_id(params['workspace_name'])
        assembly_info = self.save_assembly_object(workspace_id,
                                                  params['assembly_name'],
                                                  assembly_object_to_save)

        return assembly_info


    def build_assembly_object(self, assembly_data, fasta_file_handle_info, params):
        ''' construct the WS object data to save based on the parsed info and params '''
        assembly_data['assembly_id'] = params['assembly_name']
        assembly_data['fasta_handle_ref'] = fasta_file_handle_info['handle']['hid']
        assembly_data['fasta_handle_info'] = fasta_file_handle_info

        assembly_data['type'] = 'Unknown'
        if 'type' in params:
            assembly_data['type'] = params['type']

        if 'taxon_ref' in params:
            assembly_data['taxon_ref'] = params['taxon_ref']

        if 'external_source' in params:
            assembly_data['external_source'] = params['external_source']

        if 'external_source_id' in params:
            assembly_data['external_source_id'] = params['external_source_id']

        if 'external_source_origination_date' in params:
            assembly_data['external_source_origination_date'] = params['external_source_origination_date']

        return assembly_data


    def parse_fasta(self, fasta_file_path, params):
        ''' Do the actual work of inspecting each contig '''

        # variables to store running counts of things
        total_length = 0
        base_counts = {'A': 0, 'G': 0, 'C': 0, 'T': 0}
        md5_list = []

        # map from contig_id to contig_info
        all_contig_data = {}
        extra_contig_info = {}
        if 'contig_info' in params:
            extra_contig_info = params['contig_info']

        for record in SeqIO.parse(fasta_file_path, "fasta"):
            # SeqRecord(seq=Seq('TTAT...', SingleLetterAlphabet()),
            #           id='gi|113968346|ref|NC_008321.1|',
            #           name='gi|113968346|ref|NC_008321.1|',
            #           description='gi|113968346|ref|NC_008321.1| Shewanella sp. MR-4 chromosome, complete genome',
            #           dbxrefs=[])

            sequence = str(record.seq).upper()

            contig_info = {
                'contig_id': record.id,
                'name': record.id,
                'description': record.description[len(record.id):].strip(),
                'length': len(record.seq)
            }

            # 1) compute sequence character statistics running total
            total_length += contig_info['length']
            sequence_count_table = dict(Counter(sequence))
            for character in sequence_count_table:
                if character in base_counts:
                    base_counts[character] = base_counts[character] + sequence_count_table[character]
                else:
                    base_counts[character] = sequence_count_table[character]
                if character not in self.valid_chars:
                    if character in self.amino_acid_specific_characters:
                        raise ValueError('This fasta file may have amino acids in it instead ' +
                                         'of the required nucleotides.')
                    raise ValueError("This FASTA file has non nucleic acid characters : {0}".format(character))

            # 2) record number of 'N' characters (only set if there are some)
            Ncount = 0
            if 'N' in sequence_count_table:
                Ncount = sequence_count_table['N']
                contig_info['Ncount'] = Ncount

            # 2b) record if the contig is circular
            if record.id in extra_contig_info:
                if 'is_circ' in extra_contig_info[record.id]:
                    contig_info['is_circ'] = int(extra_contig_info[record.id]['is_circ'])
                if 'description' in extra_contig_info[record.id]:
                    contig_info['description'] = str(extra_contig_info[record.id]['description'])

            # 3) record md5 checksum
            contig_md5 = md5(sequence.encode('utf-8')).hexdigest()  # hash bytes, not str, under Python 3
            contig_info['md5'] = contig_md5
            md5_list.append(contig_md5)

            # 4) record the all important GC to ~3 significant digits
            GC_count = 0
            for base in ['G', 'C']:
                if base in sequence_count_table:
                    GC_count += sequence_count_table[base]
            contig_info['gc_content'] = round(float(GC_count) / float(contig_info['length']), 5)

            # 5) add to contig list
            if contig_info['contig_id'] in all_contig_data:
                raise ValueError('The fasta header key ' + contig_info['contig_id'] +
                                 ' appears more than once in the file')
            all_contig_data[contig_info['contig_id']] = contig_info

        # Aggregate stats for the data
        total_gc_content = None
        if total_length > 0:
            total_gc_content = round(float(base_counts['G'] + base_counts['C']) / float(total_length), 5)
        assembly_data = {
            'md5': md5(",".join(sorted(md5_list)).encode('utf-8')).hexdigest(),
            'base_counts': base_counts,
            'dna_size': total_length,
            'gc_content': total_gc_content,
            'contigs': all_contig_data,
            'num_contigs': len(all_contig_data)
        }
        return assembly_data


    def fasta_filter_contigs_generator(self, fasta_record_iter, min_contig_length):
        ''' generates SeqRecords iterator for writing from a legacy contigset object '''
        rows = 0
        rows_added = 0
        for record in fasta_record_iter:
            rows += 1
            if len(record.seq) >= min_contig_length:
                rows_added += 1
                yield record
        print(' - filtered out ' + str(rows - rows_added) + ' of ' + str(rows) + ' contigs that were shorter than ' +
              str(min_contig_length) + 'bp.')


    def filter_contigs_by_length(self, fasta_file_path, min_contig_length):
        ''' removes all contigs less than the min_contig_length provided '''
        filtered_fasta_file_path = fasta_file_path + '.filtered.fa'

        fasta_record_iter = SeqIO.parse(fasta_file_path, 'fasta')
        SeqIO.write(self.fasta_filter_contigs_generator(fasta_record_iter, min_contig_length),
                    filtered_fasta_file_path, 'fasta')

        return filtered_fasta_file_path


    def save_assembly_object(self, workspace_id, assembly_name, obj_data):
        print('Saving Assembly to Workspace')
        sys.stdout.flush()
        obj_info = self.dfu.save_objects({'id': workspace_id,
                                          'objects': [{'type': 'KBaseGenomeAnnotations.Assembly',
                                                       'data': obj_data,
                                                       'name': assembly_name
                                                       }]
                                          })[0]
        return obj_info


    def save_fasta_file_to_shock(self, fasta_file_path):
        ''' Given the path to the file, upload to shock and return Handle information
            returns:
                typedef structure {
                    string shock_id;
                    Handle handle;
                    string node_file_name;
                    string size;
                } FileToShockOutput;

        '''
        print('Uploading fasta file (' + str(fasta_file_path) + ') to SHOCK')
        sys.stdout.flush()
        return self.dfu.file_to_shock({'file_path': fasta_file_path, 'make_handle': 1})


    def stage_input(self, params):
        ''' Setup the input_directory by fetching the files and returning the path to the file'''
        file_path = None
        if 'file' in params:
            file_path = os.path.abspath(params['file']['path'])
        elif 'shock_id' in params:
            print('Downloading file from SHOCK node: ' + str(params['shock_id']))
            sys.stdout.flush()
            input_directory = os.path.join(self.scratch, 'assembly-upload-staging-' + str(uuid.uuid4()))
            os.makedirs(input_directory)
            file_name = self.dfu.shock_to_file({'file_path': input_directory,
                                                'shock_id': params['shock_id']
                                                })['node_file_name']
            file_path = os.path.join(input_directory, file_name)
        elif 'ftp_url' in params:
            print('Downloading file from: ' + str(params['ftp_url']))
            sys.stdout.flush()
            file_path = self.dfu.download_web_file({'file_url': params['ftp_url'],
                                                    'download_type': 'FTP'
                                                    })['copy_file_path']

        # extract the file if it is compressed
        if file_path is not None:
            unpacked_file = self.dfu.unpack_file({'file_path': file_path})
            return unpacked_file['file_path']

        raise ValueError('No valid fasta could be extracted based on the input parameters')


    def validate_params(self, params):
        for key in ('workspace_name', 'assembly_name'):
            if key not in params:
                raise ValueError('required "' + key + '" field was not defined')

        # one and only one of either 'file', 'shock_id', or ftp_url is required
        input_count = 0
        for key in ('file', 'shock_id', 'ftp_url'):
            if key in params and params[key] is not None:
                input_count = input_count + 1
                if key == 'file':
                    if not isinstance(params[key], dict) or 'path' not in params[key]:
                        raise ValueError('when specifying a fasta file input, "path" field was not defined in "file"')

        if input_count == 0:
            raise ValueError('required fasta file as input, set as either "file", "shock_id", or "ftp_url"')
        if input_count > 1:
            raise ValueError('required exactly one fasta file as input source, you set more than one of ' +
                             'these fields: "file", "shock_id", or "ftp_url"')
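# Usage sketch (illustrative only): a typical FastaToAssembly.import_fasta call; the
# callback URL, scratch directory, and FASTA path are hypothetical.
def _example_import_fasta(callback_url, scratch):
    fta = FastaToAssembly(callback_url, scratch)
    params = {
        'file': {'path': '/kb/module/work/tmp/assembly.fa'},  # hypothetical local FASTA
        'workspace_name': 'my_workspace',
        'assembly_name': 'MyAssembly',
        'min_contig_length': 500,  # optional: drop contigs shorter than 500 bp
    }
    return fta.import_fasta(None, params)  # ctx is not used by import_fasta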
Example #40
 def __init__(self, config):
     self.cfg = config
     self.scratch = config['scratch']
     self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
     self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
     self.ws = Workspace(config["workspace-url"])
Example #41
class FeatureSetDownload:
    def __init__(self, config):
        self.cfg = config
        self.scratch = config['scratch']
        self.gsu = GenomeSearchUtil(os.environ['SDK_CALLBACK_URL'])
        self.dfu = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        self.ws = Workspace(config["workspace-url"])

    @staticmethod
    def validate_params(params, expected={"workspace_name", "featureset_name"}):
        expected = set(expected)
        pkeys = set(params)
        if expected - pkeys:
            raise ValueError("Required keys {} not in supplied parameters"
                             .format(", ".join(expected - pkeys)))

    def to_tsv(self, params):
        working_dir = os.path.join(self.scratch,
                                   'featureset-download-'+str(uuid.uuid4()))
        os.makedirs(working_dir)
        header = ['Feature Id', 'Aliases', 'Genome', 'Type', 'Function']

        fs_name, fs_dicts = self.make_featureset_dict(params['featureset_ref'])
        files = {'file_path': "{}/{}.tsv".format(working_dir, fs_name)}
        writer = csv.DictWriter(open(files['file_path'], 'w'), header, delimiter='\t',
                                lineterminator='\n')
        writer.writeheader()
        for feat in fs_dicts:
            writer.writerow(feat)
        return fs_name, files

    def make_featureset_dict(self, fs_ref):
        features = []
        ret = self.dfu.get_objects({'object_refs': [fs_ref]})['data'][0]
        feat_set = ret['data']
        fs_name = ret['info'][1]

        feat_by_genome = defaultdict(list)
        for k, v in feat_set['elements'].items():
            feat_by_genome[v[0]].append(k)

        for genome, fids in feat_by_genome.items():
            genome_name = self.ws.get_object_info3({'objects': [{'ref': genome}]})['infos'][0][1]
            res = self.gsu.search({'ref': genome,
                                   'structured_query': {'feature_id': fids},
                                   'sort_by': [['contig_id', 1]],
                                   'start': 0,
                                   'limit': len(fids)
                                   })

            for feat in res['features']:
                features.append({'Feature Id': feat['feature_id'],
                                 'Aliases': ", ".join(feat['aliases'].keys()),
                                 'Genome': "{} ({})".format(genome_name, genome),
                                 'Type': feat['feature_type'],
                                 'Function': feat['function']
                                 })
        return fs_name, features

    def export(self, files, name, params):
        export_package_dir = os.path.join(self.scratch, name+str(uuid.uuid4()))
        os.makedirs(export_package_dir)
        for file in files:
            shutil.move(file, os.path.join(export_package_dir,
                                           os.path.basename(file)))

        # package it up and be done
        package_details = self.dfu.package_for_download({
            'file_path': export_package_dir,
            'ws_refs': [params['featureset_ref']]
        })

        return {'shock_id': package_details['shock_id']}
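# Usage sketch (illustrative only): exporting a FeatureSet as TSV with the class above;
# the config keys mirror the constructor and the featureset_ref is hypothetical.
def _example_featureset_to_tsv(config):
    downloader = FeatureSetDownload(config)
    params = {'featureset_ref': '12345/9/1'}  # hypothetical FeatureSet reference
    fs_name, files = downloader.to_tsv(params)
    return downloader.export([files['file_path']], fs_name, params)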
Example #42
class kb_virsorterTest(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        token = environ.get('KB_AUTH_TOKEN', None)
        user_id = requests.post(
            'https://kbase.us/services/authorization/Sessions/Login',
            data='token={}&fields=user_id'.format(token)).json()['user_id']
        # WARNING: don't call any logging methods on the context object,
        # it'll result in a NoneType error
        cls.ctx = MethodContext(None)
        cls.ctx.update({'token': token,
                        'user_id': user_id,
                        'provenance': [
                            {'service': 'kb_virsorter',
                             'method': 'please_never_use_it_in_production',
                             'method_params': []
                             }],
                        'authenticated': 1})

        config_file = environ.get('KB_DEPLOYMENT_CONFIG', None)
        cls.cfg = {}
        config = ConfigParser()
        config.read(config_file)
        for nameval in config.items('kb_virsorter'):
            cls.cfg[nameval[0]] = nameval[1]
        cls.wsURL = cls.cfg['workspace-url']
        cls.shockURL = cls.cfg.get('shock-url')  # assumes the standard 'shock-url' deploy.cfg key
        cls.token = token
        cls.wsClient = workspaceService(cls.wsURL, token=token)
        cls.serviceImpl = kb_virsorter(cls.cfg)

        cls.testobjref = []
        #cls.testobjdata = []
        cls.testwsname = []

    @classmethod
    def tearDownClass(cls):
        if hasattr(cls, 'wsName'):
            cls.wsClient.delete_workspace({'workspace': cls.wsName})
            print('Test workspace was deleted')

        if hasattr(cls, 'testwsname') and len(cls.testwsname) > 0:
            try:
                print('Deleting workspace 2 ' + cls.testwsname[0])
                cls.wsClient.delete_workspace({'workspace': cls.testwsname[0]})
                print('Test workspace 2 was deleted ' + cls.testwsname[0])
            except Exception as e:
                print(e)

        #if hasattr(cls, 'testobjdata'):
        #    try:
        #        print('Deleting shock data ' + str(len(cls.testobjdata)))
        #        print('Deleting shock data ' + str(len(cls.testobjdata[0]['data'][0])))
        #        print('Deleting shock data ' + str(cls.testobjdata[0]))
        #        node = cls.testobjdata[0]['data'][0]['lib']['file']['id']
        #        cls.delete_shock_node(node)
        #        print('Test shock data was deleted')
        #    except Exception as e:
        #        print e

    def getWsClient(self):
        return self.__class__.wsClient

    def getWsName(self):
        if hasattr(self.__class__, 'wsName'):
            return self.__class__.wsName
        suffix = int(time.time() * 1000)
        wsName = "test_kb_virsorter_" + str(suffix)
        ret = self.getWsClient().create_workspace({'workspace': wsName})
        self.__class__.wsName = wsName
        return wsName

    def getImpl(self):
        return self.__class__.serviceImpl

    def getContext(self):
        return self.__class__.ctx
    
    
    def write_file(self, filename, content):
        tmp_dir = self.cfg['scratch']
        file_path = os.path.join(tmp_dir, filename)
        with open(file_path, 'w') as fh1:
            fh1.write(content)
        return file_path


    def delete_shock_node(self, node_id):
        header = {'Authorization': 'Oauth {0}'.format(self.token)}
        requests.delete(self.shockURL + '/node/' + node_id, headers=header,
                        allow_redirects=True)

    def ztest_aaa_upload_to_shock(self):

        print "upload ref data to shock staging"
        self.dfUtil = DataFileUtil(os.environ['SDK_CALLBACK_URL'])
        #file_path =  self.write_file('Phage_gene_catalog.tar.gz', 'Test')

        input_file_name = 'Phage_gene_catalog_plus_viromes.tar.gz'
        source_file_path = "/kb/module/work/" + input_file_name

        tmp_dir = self.cfg['scratch']
        target_file_path = os.path.join(tmp_dir, input_file_name)

        print "file_path " + source_file_path+"\t"+target_file_path

        orig_size = os.path.getsize(source_file_path)

        shutil.copy(source_file_path, target_file_path)

        print "Testing "+target_file_path
        print(os.path.isfile(target_file_path))

        ret1 = self.dfUtil.file_to_shock(
            {'file_path': target_file_path})
        
        print(str(ret1))
        shock_id = ret1['shock_id']

        print("shock_id " + shock_id)
        file_path2 = os.path.join("/kb/module/work/", 'test.tar.gz')

        #ret2 = self.dfUtil.shock_to_file(
        #    {'shock_id': shock_id, 'file_path': file_path2})[0]
        ret2 = self.dfUtil.shock_to_file(
            {'shock_id': shock_id, 'file_path': file_path2})

        print(ret2)

        file_name = ret2['node_file_name']
        attribs = ret2['attributes']
        self.assertEqual(file_name, 'Phage_gene_catalog_plus_viromes.tar.gz')
        self.assertEqual(ret2['file_path'], file_path2)
        self.assertEqual(ret2['size'], orig_size)
        self.assertIsNone(attribs)

        #self.delete_shock_node(shock_id)


    def create_random_string(self):
        N = 20
        return ''.join(
            random.SystemRandom().choice(string.ascii_uppercase + string.digits) for _ in range(N))

    def test_virsorter_ok(self):
        self.upload_assembly()


        if not self.testwsname:
            self.testwsname.append(self.create_random_string())

        print "upload_reads self.testwsname[0] " + self.testwsname[0]

        #try:
        #    ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  # test_ws_name
        #except Exception as e:
        #    # print "ERROR"
        #    # print(type(e))
        #    # print(e.args)
        #    print(e)
        #    pass

        print "self.testwsname "+ str(self.testwsname)
        params = {}
        params['assembly_ref'] =  str(self.testobjref[0])#str(self.testwsname[0])+"/"+ #"16589/2/1"#""#'16589/2/1'#self.testobjref
        params['ws_name'] = self.testwsname[0]

        result = self.getImpl().run_virsorter(self.getContext(), params)
        print('RESULT run_virsorter:')
        pprint(result)

        #testresult = [
        #    {'blah': 'blah', 'bleh': 'bleh'}]

        testresult = [{'report_ref': result[0]['report_ref'], 'report_name': result[0]['report_name']}]


        self.assertEqual(result, testresult)


    def upload_assembly(self):
        if not self.testobjref:

            print "upload_assembly start"
    
            indata = 'U00096.2.fa'#_first1000.
            ftarget = os.path.join(self.cfg['scratch'], indata)#self.scratch, indata)
            print "ftarget " + ftarget
            ret = shutil.copy('../test_data/' + indata, ftarget)
    
            #self.readsUtilClient = ReadsUtils(os.environ['SDK_CALLBACK_URL'])

            self.assemblyUtilClient = AssemblyUtil(os.environ['SDK_CALLBACK_URL'])

            if not self.testwsname:
                self.testwsname.append(self.create_random_string())
    
            print "upload_assembly self.testwsname[0] " + self.testwsname[0]
    
            try:
                ret = self.wsClient.create_workspace({'workspace': self.testwsname[0]})  #test_ws_name
            except Exception as e:
                #print "ERROR"
                #print(type(e))
                #print(e.args)
                print(e)
                pass
    
            try:
                print "attempt upload"
                print "ftarget " + ftarget
                ref = self.assemblyUtilClient.save_assembly_from_fasta(
                    {
                     'workspace_name': self.testwsname[0],
                     'assembly_name': 'Ecolik12MG1655',
                     'file': {'path': ftarget}})
        
                print "upload_assembly"
                print ref
                #self.testobjref = []
                self.testobjref.append(self.testwsname[0] + '/Ecolik12MG1655/1')
                #self.testobjdata = []
                #self.testobjdata.append(self.dfu.get_objects(
                #    {'object_refs': [self.testobjref[0]]}))
        
                ##print self.testobjdata[0]
    
            except Exception as e:
                print(e)
                pass
    
            print "self.testobjref[0]"
            print self.testobjref
            print self.testobjref[0]
    def genome_annotation_to_genbank(self, ctx, params):
        """
        :param params: instance of type "GenomeAnnotationToGenbankParams"
           (genome_ref -- Reference to the GenomeAnnotation or Genome object
           in KBase in any ws supported format OR genome_name +
           workspace_name -- specifiy the genome name and workspace name of
           what you want.  If genome_ref is defined, these args are ignored.
           new_genbank_file_name -- specify the output name of the genbank
           file, optional save_to_shock -- set to 1 or 0, if 1 then output is
           saved to shock. default is zero) -> structure: parameter
           "genome_ref" of String, parameter "genome_name" of String,
           parameter "workspace_name" of String, parameter
           "new_genbank_file_name" of String, parameter "save_to_shock" of
           type "boolean" (A boolean - 0 for false, 1 for true. @range (0, 1))
        :returns: instance of type "GenbankFile" -> structure: parameter
           "path" of String, parameter "shock_id" of String
        """
        # ctx is the context object
        # return variables are: file
        #BEGIN genome_annotation_to_genbank

        print('genome_annotation_to_genbank -- parameters = ')
        pprint(params)

        service_endpoints = {
            "workspace_service_url": self.workspaceURL, 
            "shock_service_url": self.shockURL,
            "handle_service_url": self.handleURL
        }

        # parse/validate parameters.  could do a better job here.
        genome_ref = None
        if 'genome_ref' in params and params['genome_ref'] is not None:
            genome_ref = params['genome_ref']
        else:
            if 'genome_name' not in params:
                raise ValueError('genome_ref and genome_name are not defined.  One of those is required.')
            if 'workspace_name' not in params:
                raise ValueError('workspace_name is not defined.  This is required if genome_name is specified' +
                    ' without a genome_ref')
            genome_ref = params['workspace_name'] + '/' + params['genome_name']

        # Do a quick lookup of object info (this could also be used for validation).  Here we need it to
        # provide a sensible default output file name when one is not set.  Errors at this point usually
        # mean the ref was bad, so ideally we would catch them and print a clearer message.
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref':genome_ref}],'includeMetadata':0, 'ignoreErrors':0})[0]
        print('resolved object to:')
        pprint(info)

        if 'new_genbank_file_name' not in params or params['new_genbank_file_name'] is None:
            new_genbank_file_name = info[1] + ".gbk"
        else:
            new_genbank_file_name = params['new_genbank_file_name']


        # construct a working directory to hand off to the data_api
        working_directory =  os.path.join(self.sharedFolder, 'genome-download-'+str(uuid.uuid4()))
        os.makedirs(working_directory)
        output_file_destination = os.path.join(working_directory,new_genbank_file_name)

        # do it
        print('calling: doekbase.data_api.downloaders.GenomeAnnotation.downloadAsGBK')
        GenomeAnnotation.downloadAsGBK(
                            genome_ref,
                            service_endpoints,
                            ctx['token'],
                            output_file_destination,
                            working_directory)

        # if we need to upload to shock, well then do that too.
        file = {}
        if 'save_to_shock' in params and params['save_to_shock'] == 1:
            dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
            file['shock_id'] = dfUtil.file_to_shock({
                                    'file_path':output_file_destination,
                                    'gzip':0,
                                    'make_handle':0
                                    #attributes: {} #we can set shock attributes if we want
                                })['shock_id']
        else:
            file['path'] = output_file_destination

        #END genome_annotation_to_genbank

        # At some point might do deeper type checking...
        if not isinstance(file, dict):
            raise ValueError('Method genome_annotation_to_genbank return value ' +
                             'file is not type dict as required.')
        # return the results
        return [file]
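
    # A hedged usage example for the method above (hypothetical: 'impl' is the module Impl
    # instance, 'ctx' the MethodContext, and the ref/file names are placeholders):
    #
    #     params = {'genome_ref': '12345/6/1',              # takes precedence over genome_name/workspace_name
    #               'new_genbank_file_name': 'e_coli.gbk',  # optional; defaults to '<object name>.gbk'
    #               'save_to_shock': 0}                     # 1 -> {'shock_id': ...}, 0 -> {'path': ...}
    #     file = impl.genome_annotation_to_genbank(ctx, params)[0]
    #     print(file.get('path') or file.get('shock_id'))
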
    def genbank_to_genome_annotation(self, ctx, params):
        """
        :param params: instance of type "GenbankToGenomeAnnotationParams"
           (file_path or shock_id -- Local path or shock_id of the uploaded
           file with genome sequence in GenBank format or zip-file with
           GenBank files. genome_name -- The name you would like to use to
           reference this GenomeAnnotation. If not supplied, will use the
           Taxon Id and the data source to determine the name. taxon_wsname -
           name of the workspace containing the Taxonomy data, defaults to
           'ReferenceTaxons') -> structure: parameter "file_path" of String,
           parameter "shock_id" of String, parameter "ftp_url" of String,
           parameter "genome_name" of String, parameter "workspace_name" of
           String, parameter "source" of String, parameter "taxon_wsname" of
           String, parameter "convert_to_legacy" of type "boolean" (A boolean
           - 0 for false, 1 for true. @range (0, 1))
        :returns: instance of type "GenomeAnnotationDetails" -> structure:
           parameter "genome_annotation_ref" of String
        """
        # ctx is the context object
        # return variables are: details
        #BEGIN genbank_to_genome_annotation

        print('genbank_to_genome_annotation -- parameters = ')
        pprint(params)

        # validate input and set defaults.  Note that because we don't call the uploader method
        # as a stand alone script, we do the validation here.
        if 'workspace_name' not in params:
            raise ValueError('workspace_name field was not defined')
        workspace_name = params['workspace_name']

        if 'genome_name' not in params:
            raise ValueError('genome_name field was not defined')
        genome_name = params['genome_name']

        source = 'Genbank'
        if 'source' in params:
            source = params['source']

        taxon_wsname = 'ReferenceTaxons'
        if 'taxon_wsname' in params:
            taxon_wsname = params['taxon_wsname']

        # other options to handle
        # release
        # taxon_reference
        # exclude_feature_types
        # type


        # construct the input directory where we stage files
        input_directory =  os.path.join(self.sharedFolder, 'genome-upload-staging-'+str(uuid.uuid4()))
        os.makedirs(input_directory)

        # Determine how to get the file: download it from Shock or an FTP URL, or copy a local file.
        # Whatever the source, the file ends up staged in the input directory.

        genbank_file_path = None

        if 'file_path' not in params:
            if 'shock_id' not in params:
                if 'ftp_url' not in params:
                    raise ValueError('No input file (either file_path, shock_id, or ftp_url) provided')
                else:
                    # TODO handle ftp - this creates a directory for us, so update the input directory
                    print('calling Transform download utility: script_utils.download')
                    print('URL provided = ' + params['ftp_url'])
                    script_utils.download_from_urls(
                            working_directory = input_directory,
                            token = ctx['token'], # not sure why this requires a token to download from a url...
                            urls  = {
                                        'ftpfiles': params['ftp_url']
                                    }
                        )
                    input_directory = os.path.join(input_directory,'ftpfiles')
                    # unpack everything in input directory
                    dir_contents = os.listdir(input_directory)
                    print('downloaded directory listing:')
                    pprint(dir_contents)
                    dir_files = []
                    for f in dir_contents:
                        if os.path.isfile(os.path.join(input_directory, f)):
                            dir_files.append(f)

                    print('processing files in directory...')
                    for f in dir_files:
                        # unpack if needed using the standard transform utility
                        print('unpacking '+f)
                        script_utils.extract_data(filePath=os.path.join(input_directory,f))

            else:
                # handle shock file
                dfUtil = DataFileUtil(self.callback_url, token=ctx['token'])
                file_name = dfUtil.shock_to_file({
                                    'file_path': input_directory,
                                    'shock_id': params['shock_id']
                                })['node_file_name']
                genbank_file_path = os.path.join(input_directory, file_name)
        else:
            # copy the local file to the input staging directory
            # (NOTE: could just move it, but then this method would have the side effect of moving your
            # file which another SDK module might have an open handle on)
            local_file_path = params['file_path']
            genbank_file_path = os.path.join(input_directory, os.path.basename(local_file_path))
            shutil.copy2(local_file_path, genbank_file_path)

        if genbank_file_path is not None:
            print("input genbank file =" + genbank_file_path)

            # unpack if needed using the standard transform utility
            script_utils.extract_data(filePath=genbank_file_path)

        # do the upload (doesn't seem to return any information)
        uploader.upload_genome(
                logger=None,

                shock_service_url = self.shockURL,
                handle_service_url = self.handleURL,
                workspace_service_url = self.workspaceURL,

                input_directory=input_directory,

                workspace_name   = workspace_name,
                core_genome_name = genome_name,
                source           = source,
                taxon_wsname     = taxon_wsname
            )

        #### Code to convert to legacy type if requested
        if 'convert_to_legacy' in params and params['convert_to_legacy']==1:
            from doekbase.data_api.converters import genome as cvt
            print('Converting to legacy type, object={}'.format(genome_name))
            cvt.convert_genome(
                    shock_url=self.shockURL,
                    handle_url=self.handleURL,
                    ws_url=self.workspaceURL,
                    obj_name=genome_name,
                    ws_name=workspace_name)

        # clear the temp directory
        shutil.rmtree(input_directory)

        # get WS metadata to return the reference to the object (could be returned by the uploader method...)
        ws = Workspace(url=self.workspaceURL)
        info = ws.get_object_info_new({'objects':[{'ref':workspace_name + '/' + genome_name}],'includeMetadata':0, 'ignoreErrors':0})[0]

        details = {
            'genome_annotation_ref':str(info[6]) + '/' + str(info[0]) + '/' + str(info[4])
        }


        #END genbank_to_genome_annotation

        # At some point might do deeper type checking...
        if not isinstance(details, dict):
            raise ValueError('Method genbank_to_genome_annotation return value ' +
                             'details is not type dict as required.')
        # return the results
        return [details]
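
For completeness, a hedged sketch of invoking genbank_to_genome_annotation with a local GenBank file; 'impl' and 'ctx' stand in for the usual SDK Impl instance and MethodContext, and the paths and names are illustrative only:

# Hypothetical invocation; 'impl', 'ctx', and all paths/names below are placeholders.
params = {
    'file_path': '/kb/module/test/data/e_coli.gbk',   # alternatively supply 'shock_id' or 'ftp_url'
    'workspace_name': 'my_workspace',
    'genome_name': 'e_coli_from_genbank',
    'taxon_wsname': 'ReferenceTaxons',                 # same as the default used when omitted
    'convert_to_legacy': 0
}
details = impl.genbank_to_genome_annotation(ctx, params)[0]
print(details['genome_annotation_ref'])                # a 'wsid/objid/ver' style reference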