def convert(shock_url, shock_id, handle_url, handle_id, input_filename, output_filename, level=logging.INFO, logger=None):
    """
    Converts a FASTA file to a KBaseAssembly.SingleEndLibrary JSON string.

    Args:
        shock_url: A url for the KBase SHOCK service.
        handle_url: A url for the KBase Handle Service.
        shock_id: A KBase SHOCK node id.
        handle_id: A KBase Handle id.
        input_filename: A file name for the input FASTA data.
        output_filename: A file name where the output JSON string should be stored.
        level: Logging level, defaults to logging.INFO.

    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of FASTA to KBaseAssembly.SingleEndLibrary.")

    token = os.environ.get('KB_AUTH_TOKEN')
    
    logger.info("Gathering information.")
    handles = script_utils.getHandles(logger, shock_url, handle_url, [shock_id], [handle_id], token)   
    
    assert len(handles) != 0
    
    objectString = json.dumps({"handle" : handles[0]}, sort_keys=True, indent=4)
    
    logger.info("Writing out JSON.")
    with open(output_filename, "w") as outFile:
        outFile.write(objectString)
    
    logger.info("Conversion completed.")
def PluginManager(directory=None, logger=script_utils.stderrlogger(__file__)):
    if directory is None:
        raise Exception(
            "Must provide a directory to read plugin configs from!")

    manager = PlugIns(directory, logger)
    return manager
def transform(workspace_service_url=None, shock_service_url=None, handle_service_url=None, 
              workspace_name=None, object_name=None, object_id=None, 
              object_version_number=None, working_directory=None, output_file_name=None, 
              level=logging.INFO, logger=None):  
    """
    Converts a KBaseGenomes.ContigSet object to a FASTA file of assembled DNA.
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        workspace_name: Name of the workspace
        object_name: Name of the object in the workspace 
        object_id: Id of the object in the workspace, mutually exclusive with object_name
        object_version_number: Version number of workspace object (ContigSet), defaults to most recent version
        working_directory: The working directory where the output file should be stored.
        output_file_name: The desired file name of the result file.
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        A FASTA file containing assembled sequences from a KBase ContigSet object.
    
    Authors:
        Jason Baumohl, Matt Henderson
    
    """ 

    def insert_newlines(s, every):
        lines = []
        for i in xrange(0, len(s), every):
            lines.append(s[i:i+every])
        return "\n".join(lines)+"\n"


    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of KBaseGenomes.ContigSet to FASTA.DNA.Assembly")
    token = os.environ.get("KB_AUTH_TOKEN")
    
    if not os.path.isdir(working_directory):
        raise Exception("The working directory {0} does not exist".format(working_directory))

    logger.info("Grabbing Data.")
 
    try:
        ws_client = biokbase.workspace.client.Workspace(workspace_service_url) 
        if object_version_number and object_name:
            contig_set = ws_client.get_objects([{"workspace":workspace_name,"name":object_name, "ver":object_version_number}])[0] 
        elif object_name:
            contig_set = ws_client.get_objects([{"workspace":workspace_name,"name":object_name}])[0]
        elif object_version_number and object_id:
            contig_set = ws_client.get_objects([{"workspace":workspace_name,"objid":object_id, "ver":object_version_number}])[0]
        else:
            contig_set = ws_client.get_objects([{"workspace":workspace_name,"objid":object_id}])[0] 
    except Exception as e:
        logger.exception("Unable to retrieve workspace object from {0}:{1}.".format(workspace_service_url,workspace_name))
        logger.exception(e)
        raise 
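
# A small, self-contained illustration of the line-wrapping done by the
# insert_newlines helper above, using a made-up sequence and a 10-column width.
def _wrap_sequence(s, every):
    # Slice the string into fixed-width chunks and join them with newlines.
    return "\n".join(s[i:i + every] for i in range(0, len(s), every)) + "\n"

assert _wrap_sequence("ACGTACGTACGTACGT", 10) == "ACGTACGTAC\nGTACGT\n"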
def main():
    script_details = script_utils.parse_docs(transform.__doc__)

    parser = argparse.ArgumentParser(prog=__file__,
                                     description=script_details["Description"],
                                     epilog=script_details["Authors"])

    parser.add_argument('--workspace_service_url',
                        help=script_details["Args"]["workspace_service_url"],
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument('--workspace_name',
                        help=script_details["Args"]["workspace_name"],
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument("--object_name", 
                        help=script_details["Args"]["object_name"], 
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument('--output_file_name',
                        help=script_details["Args"]["output_file_name"],
                        action='store', type=str, nargs='?', default=None,
                        required=False)
    parser.add_argument('--input_directory',
                        help=script_details["Args"]["input_directory"],
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument("--working_directory", 
                        help=script_details["Args"]["working_directory"], 
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument("--has_replicates", 
                        help=script_details["Args"]["has_replicates"], 
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument('--input_mapping',
                        help=script_details["Args"]["input_mapping"],
                        action='store', type=unicode, nargs='?', default=None,
                        required=False)

    # custom arguments specific to this uploader
    parser.add_argument('--format_type',
                        help=script_details["Args"]["format_type"],
                        action='store', type=str, required=False)

    args, unknown = parser.parse_known_args()

    logger = script_utils.stderrlogger(__file__)

    logger.debug(args)
    try:
        transform(workspace_service_url=args.workspace_service_url,
                  workspace_name=args.workspace_name,
                  object_name=args.object_name,
                  output_file_name=args.output_file_name,
                  input_directory=args.input_directory,
                  working_directory=args.working_directory,
                  has_replicates=args.has_replicates,
                  input_mapping=args.input_mapping,
                  format_type=args.format_type,
                  logger=logger)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)
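
# A stripped-down sketch of the parse_known_args pattern used in main():
# unrecognized flags are tolerated instead of raising an error. The flag names
# and values here are illustrative only.
import argparse

_sketch_parser = argparse.ArgumentParser(prog="example")
_sketch_parser.add_argument("--workspace_name", action="store", type=str, required=True)
_known, _unknown = _sketch_parser.parse_known_args(
    ["--workspace_name", "my_workspace", "--some_extra_flag", "1"])
# _known.workspace_name == "my_workspace"; _unknown == ["--some_extra_flag", "1"]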
    def __init__(self, plugins_directory=None, logger=script_utils.stderrlogger(__file__)):
        self.scripts_config = {"external_types": list(),
                               "kbase_types": list(),
                               "validate": dict(),
                               "upload": dict(),
                               "download": dict(),
                               "convert": dict()}

        self.logger = logger

        plugins = sorted(os.listdir(plugins_directory))
        
        for p in plugins:
            try:
                f = open(os.path.join(plugins_directory, p), 'r')
                pconfig = simplejson.loads(f.read())
                f.close()

                id = None
                
                if pconfig["script_type"] == "validate":
                    if pconfig["external_type"] not in self.scripts_config["external_types"]:
                        self.scripts_config["external_types"].append(pconfig["external_type"])
                    
                    id = pconfig["external_type"]
                elif pconfig["script_type"] == "upload":
                    if pconfig["external_type"] not in self.scripts_config["external_types"]:
                        self.scripts_config["external_types"].append(pconfig["external_type"])
                    
                    if pconfig["kbase_type"] not in self.scripts_config["kbase_types"]:
                        self.scripts_config["kbase_types"].append(pconfig["kbase_type"])
                    
                    id = "{0}=>{1}".format(pconfig["external_type"],pconfig["kbase_type"])
                elif pconfig["script_type"] == "download":
                    if pconfig["external_type"] not in self.scripts_config["external_types"]:
                        self.scripts_config["external_types"].append(pconfig["external_type"])
                    
                    if pconfig["kbase_type"] not in self.scripts_config["kbase_types"]:
                        self.scripts_config["kbase_types"].append(pconfig["kbase_type"])
                    
                    id = "{0}=>{1}".format(pconfig["kbase_type"],pconfig["external_type"])
                elif pconfig["script_type"] == "convert":
                    if pconfig["source_kbase_type"] not in self.scripts_config["kbase_types"]:
                        self.scripts_config["kbase_types"].append(pconfig["source_kbase_type"])
                    
                    if pconfig["destination_kbase_type"] not in self.scripts_config["kbase_types"]:
                        self.scripts_config["kbase_types"].append(pconfig["destination_kbase_type"])
                    
                    id = "{0}=>{1}".format(pconfig["source_kbase_type"],pconfig["destination_kbase_type"])

                self.scripts_config[pconfig["script_type"]][id] = pconfig

                self.logger.info("Successfully added plugin {0}".format(p))
            except Exception as e:
                self.logger.warning("Unable to read plugin {0}: {1}".format(p, str(e)))
def run_task(logger, arguments, debug=False):
    """
    A factory function to abstract the implementation details of how tasks are run.
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    h = TaskRunner(logger)
    out = h.run(arguments, debug)
    return out
    def __init__(self, logger=None, callback=None):
        #logger_stdout = script_utils.getStdoutLogger()
        if logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        else:
            self.logger = logger

        if callback is None:
            self.callback = lambda x: self.logger.info(x)
        else:
            self.callback = callback
def run_task(logger, arguments, debug=False, callback=None):
    """
    A factory function to abstract the implementation details of how tasks are run.
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    h = TaskRunner(logger, callback=callback)
    out = h.run(arguments, debug)
    return out
def validate(input_directory, working_directory, level=logging.INFO, logger=None):
    """
    Validates a FASTA file of nucleotide sequences.

    Args:
        input_directory: A directory containing one or more FASTA files.
        working_directory: A directory where any output files produced by validation can be written.
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Currently writes to stderr with a Java Exception trace on error, otherwise no output.
    
    Authors:
        Srividya Ramikrishnan, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    extensions = [".fa",".fasta",".fna"]
        
    validated = False
    for input_file_name in os.listdir(input_directory):
        logger.info("Checking for FASTA file : {0}".format(input_file_name))

        filePath = os.path.join(os.path.abspath(input_directory), input_file_name)
        
        if not os.path.isfile(filePath):
            logger.warning("Skipping directory {0}".format(input_file_name))
            continue
        elif os.path.splitext(input_file_name)[-1] not in extensions:
            logger.warning("Unrecognized file type, skipping.")
            continue
    
        logger.info("Starting FASTA validation of {0}".format(input_file_name))
                
        # TODO This needs to be changed, this is really just a demo program for this library and not a serious tool
        java_classpath = os.path.join(os.environ.get("KB_TOP"), "lib/jars/FastaValidator/FastaValidator-1.0.jar")
        arguments = ["java", "-classpath", java_classpath, "FVTester", filePath]
            
        tool_process = subprocess.Popen(arguments, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
    
        if len(stderr) > 0:
            logger.error("Validation failed on {0}".format(input_file_name))
        else:
            logger.info("Validation passed on {0}".format(input_file_name))
            validated = True
        
    if not validated:
        raise Exception("Validation failed!")
    else:
        logger.info("Validation passed.")
def transform(input_file=None, level=logging.INFO, logger=None):
    """
    Validates a Genbank file.
    
    Args:
        input_file: A Genbank input file.
    
    Returns:
        Any validation errors or success.
    
    Authors:
        Shinjae Yoo, Matt Henderson, Marcin Joachimiak
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting Genbank validation")

    token = os.environ.get("KB_AUTH_TOKEN")

    classpath = "/kb/dev_container/modules/transform/lib/jars/kbase/transform/GenBankTransform.jar:$KB_TOP/lib/jars/kbase/genomes/kbase-genomes-20140411.jar:$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.6.jar:$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar:$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar:$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar:$KB_TOP/lib/jars/kbase/transform/GenBankTransform.jar:$KB_TOP/lib/jars/kbase/auth/kbase-auth-1398468950-3552bb2.jar:$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"
    mc = 'us.kbase.genbank.ValidateGBK'

    java_classpath = os.path.join(
        os.environ.get("KB_TOP"),
        classpath.replace('$KB_TOP', os.environ.get("KB_TOP")))

    argslist = "{0}".format("--input_file {0}".format(input_file))

    arguments = [
        "java", "-classpath", java_classpath, mc,
        argslist
    ]

    logger.debug(arguments)
    tool_process = subprocess.Popen(arguments, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if len(stderr) > 0:
        logger.error(
            "Validation of Genbank.Genome failed on {0}".format(input_file))
        sys.exit(1)
    else:
        logger.info("Validation of Genbank.Genome completed.")
        sys.exit(0)
def transform(input_file=None,
              level=logging.INFO, logger=None):
    """
    Validates a Genbank file.
    
    Args:
        input_file: A Genbank input file.
    
    Returns:
        Any validation errors or success.
    
    Authors:
        Shinjae Yoo, Matt Henderson, Marcin Joachimiak
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting Genbank validation")
    
    token = os.environ.get("KB_AUTH_TOKEN")

    classpath = "/kb/dev_container/modules/transform/lib/jars/kbase/transform/GenBankTransform.jar:$KB_TOP/lib/jars/kbase/genomes/kbase-genomes-20140411.jar:$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.6.jar:$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar:$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar:$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar:$KB_TOP/lib/jars/kbase/transform/GenBankTransform.jar:$KB_TOP/lib/jars/kbase/auth/kbase-auth-1398468950-3552bb2.jar:$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"
    mc = 'us.kbase.genbank.ValidateGBK'

    java_classpath = os.path.join(os.environ.get("KB_TOP"), classpath.replace('$KB_TOP', os.environ.get("KB_TOP")))
    
    argslist = "{0}".format("--input_file {0}".format(input_file))
    
    arguments = ["java", "-classpath", java_classpath, "us.kbase.genbank.ConvertGBK", argslist]

    logger.debug(arguments)
    tool_process = subprocess.Popen(arguments, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if len(stderr) > 0:
        logger.error("Validation of Genbank.Genome failed on {0}".format(input_file))
        sys.exit(1)
    else:
        logger.info("Validation of Genbank.Genome completed.")
        sys.exit(0)
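
# A self-contained sketch of the $KB_TOP substitution performed above, using a
# made-up install prefix instead of the real KB_TOP environment variable.
_example_kb_top = "/kb/deployment"
_example_classpath = "$KB_TOP/lib/jars/kbase/transform/GenBankTransform.jar:$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.6.jar"
_expanded = _example_classpath.replace("$KB_TOP", _example_kb_top)
# _expanded now holds colon-separated absolute jar paths suitable for java -classpath.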
    def filter_genes(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN filter_genes
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
 
 
        cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, 
                                          '--workspace_name', param['workspace_name'],
                                          '--object_name', param['object_name'],
                                          '--working_directory', self.RAWEXPR_DIR,
                                          '--output_file_name', self.EXPRESS_FN
                              ]
 
        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
          fl = f.readline()
        ncol = len(fl.split('\t'))
        
        # force ANOVA if there are only two samples (one id column plus two data columns)
        if ncol == 3:
            param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                           '-m', param['method'], '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y']
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
        if 'p_value' not in param and 'num_features' not in param:
          self.logger.error("One of p_value or num_features must be defined");
          return empty_results("One of p_value or num_features must be defined", expr,self.__WS_URL, param, self.logger, ws)
          #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## Header correction
        try:
            with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'r') as ff:
                fe = ff.readlines()
            with open("{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN), 'w') as ff:
                ff.write(fl) # use original first line that has correct header information
                fe.pop(0)
                ff.writelines(fe)
        except:
            self.logger.error("Output was not found");
            return empty_results("Increase p_value or specify num_features", expr,self.__WS_URL, param, self.logger, ws)
            
        
        ## checking genelist
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),'r') as glh:
          gl = glh.readlines()
        gl = [x.strip('\n') for x in gl]
 
        if(len(gl) < 1) :
          self.logger.error("No genes are selected")
          return empty_results("Increase p_value or specify num_features", expr,self.__WS_URL, param, self.logger, ws)
          #sys.exit(4)
 
        ## Upload FVE
        # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
        # Updates: change missing genome handling strategy by copying reference to working workspace
        cmd_upload_expr = [self.TSV_2_FVE, '--workspace_service_url', self.__WS_URL, 
                                          '--object_name', param['out_expr_object_name'],
                                          '--working_directory', self.FINAL_DIR,
                                          '--input_directory', self.FLTRD_DIR,
                                          '--output_file_name', self.FINAL_FN
                              ]
        tmp_ws = param['workspace_name']
        if 'genome_ref' in expr:
            obj_infos = ws.get_object_info_new({"objects": [{'ref':expr['genome_ref']}]})[0]
 
            if len(obj_infos) < 1:
                self.logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
                raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))
 
            #tmp_ws = "{0}".format(obj_infos[7])
            self.logger.info("{0} => {1} / {2}".format(expr['genome_ref'], obj_infos[7], obj_infos[1]))
            if obj_infos[7] != param['workspace_name']:
                #we need to copy it from the other workspace
                try:
                  self.logger.info("trying to copy the referenced genome object : {0}".format(expr['genome_ref']))
                  ws.copy_object({'from' : {'ref' : expr['genome_ref']},'to' : {'workspace': param['workspace_name'], 'name' : obj_infos[1]}})
                  # add genome_object_name only after successful copy
                  cmd_upload_expr.append('--genome_object_name')
                  cmd_upload_expr.append(obj_infos[1])
                except:
                  # no permission or any issues... then, give up providing genome reference
                  self.logger.info("".join(traceback.format_exc()))
                  pass
            else:
                # it is local... we can simply add reference without copying genome
                cmd_upload_expr.append('--genome_object_name')
                cmd_upload_expr.append(obj_infos[1])
 
        # updated ws name
        cmd_upload_expr.append('--workspace_name')
        cmd_upload_expr.append(tmp_ws)
 
        self.logger.info(" ".join(cmd_upload_expr))
 
        tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        
        with open("{0}/{1}".format(self.FINAL_DIR,self.FINAL_FN),'r') as et:
          eo = json.load(et)
 
        if 'description' not in expr:
            expr['description'] = "Filtered Expression Matrix"
        expr['description'] += " : Filtered by '{0}' method ".format(param['method'])
 
        if 'feature_mapping' in expr and 'feature_mapping' in eo:
            expr['feature_mapping'] = eo['feature_mapping']
        expr['data'] = eo['data']
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                              'data' : expr,
                                                                              'name' : (param['out_expr_object_name'])}]})
 
        ## Upload FeatureSet
        fs ={'elements': {}}
        fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method'])
 
        fs['description'] += "from {0}/{1}".format(param['workspace_name'], param['object_name'])
 
        for g in gl:
          if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
          else:
            fs['elements'][g] = []
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                              'data' : fs,
                                                                              'name' : (param['out_fs_object_name'])}]})
        result = {'workspace_name' : param['workspace_name'], 'out_expr_object_name' : param['out_expr_object_name'], 'out_fs_object_name' : param['out_fs_object_name']}
        #END filter_genes

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method filter_genes return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
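
# A small sketch of how the FeatureSet 'elements' mapping above is built from a
# gene list; the gene ids and genome reference are invented for illustration.
_example_genes = ["gene.1", "gene.2"]
_example_genome_ref = "1234/5/6"
_example_fs = {"description": "FeatureSet identified by filtering method 'anova'",
               "elements": {}}
for _g in _example_genes:
    # Each selected gene maps to the list of genome references it came from (possibly empty).
    _example_fs["elements"][_g] = [_example_genome_ref]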
    Args:
        shock_service_url: If you have shock references you need to make.
        handle_service_url: In case your type has at least one handle reference.
        working_directory: A directory where you can do work.
    
    Returns:
        JSON representing a KBase object.
    
    Authors:
        Your name here
    
    """

    # there are utility functions for things you need to do, like log messages
    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Python KBase Upload template transform script")
    
    # here is how you get the user token to access services
    token = os.environ.get("KB_AUTH_TOKEN")
    
    # stuff happens in here, transform the data
    
    # now make your JSON object
    objectString = json.dumps("{}", sort_keys=True, indent=4)
    
    # write it to disk
    
    logger.info("Transform completed.")
    
def transform(workspace_service_url=None,
              shock_service_url=None,
              handle_service_url=None,
              workspace_name=None,
              object_name=None,
              object_id=None,
              object_version_number=None,
              working_directory=None,
              output_file_name=None,
              level=logging.INFO,
              logger=None):
    """
    Converts a KBaseGenomes.ContigSet object to a FASTA file of assembled DNA.
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        workspace_name: Name of the workspace
        object_name: Name of the object in the workspace 
        object_id: Id of the object in the workspace, mutually exclusive with object_name
        object_version_number: Version number of workspace object (ContigSet), defaults to most recent version
        working_directory: The working directory where the output file should be stored.
        output_file_name: The desired file name of the result file.
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        A FASTA file containing assembled sequences from a KBase ContigSet object.
    
    Authors:
        Jason Baumohl, Matt Henderson
    
    """
    def insert_newlines(s, every):
        lines = []
        for i in xrange(0, len(s), every):
            lines.append(s[i:i + every])
        return "\n".join(lines) + "\n"

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseGenomes.ContigSet to FASTA.DNA.Assembly")
    token = os.environ.get("KB_AUTH_TOKEN")

    if not os.path.isdir(working_directory):
        raise Exception(
            "The working directory {0} does not exist".format(
                working_directory))

    logger.info("Grabbing Data.")

    try:
        ws_client = biokbase.workspace.client.Workspace(workspace_service_url)
        if object_version_number and object_name:
            contig_set = ws_client.get_objects([{
                "workspace": workspace_name,
                "name": object_name,
                "ver": object_version_number
            }])[0]
        elif object_name:
            contig_set = ws_client.get_objects([{
                "workspace": workspace_name,
                "name": object_name
            }])[0]
        elif object_version_number and object_id:
            contig_set = ws_client.get_objects([{
                "workspace": workspace_name,
                "objid": object_id,
                "ver": object_version_number
            }])[0]
        else:
            contig_set = ws_client.get_objects([{
                "workspace": workspace_name,
                "objid": object_id
            }])[0]
    except Exception as e:
        logger.exception(
            "Unable to retrieve workspace object from {0}:{1}.".format(
                workspace_service_url, workspace_name))
        logger.exception(e)
        raise
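
# A self-contained sketch of the object selectors passed to
# Workspace.get_objects above; the workspace and object names are placeholders.
_example_selector_by_name = {"workspace": "my_workspace",
                             "name": "my_contigset",
                             "ver": 3}
_example_selector_by_id = {"workspace": "my_workspace",
                           "objid": 42}
# Either form (optionally with "ver") can be placed in the list handed to
# ws_client.get_objects([...]).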
def transform(shock_service_url=None,
              handle_service_url=None,
              output_file_name=None,
              input_directory=None,
              working_directory=None,
              level=logging.INFO,
              logger=None):
    """
    Converts a FASTQ file to a KBaseAssembly.SingleEndLibrary JSON string.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be stored.  
        input_directory: The directory containing the file.
        working_directory: The directory the resulting json file will be written to.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.

    Authors:
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Scanning for FASTQ files.")

    valid_extensions = [".fq", ".fastq", ".fnq"]

    files = os.listdir(input_directory)
    fastq_files = [
        x for x in files if os.path.splitext(x)[-1] in valid_extensions
    ]

    assert len(fastq_files) != 0

    logger.info("Found {0}".format(str(fastq_files)))

    input_file_name = fastq_files[0]

    if len(fastq_files) > 1:
        logger.warning(
            "Not sure how to handle multiple FASTQ files in this context. Using {0}"
            .format(input_file_name))

    kb_token = os.environ.get('KB_AUTH_TOKEN')

    script_utils.upload_file_to_shock(logger=logger,
                                      shock_service_url=shock_service_url,
                                      filePath=os.path.join(
                                          input_directory, input_file_name),
                                      token=kb_token)

    handles = script_utils.getHandles(logger=logger,
                                      shock_service_url=shock_service_url,
                                      handle_service_url=handle_service_url,
                                      token=kb_token)

    assert len(handles) != 0

    objectString = simplejson.dumps({"handle": handles[0]},
                                    sort_keys=True,
                                    indent=4)

    if output_file_name is None:
        output_file_name = input_file_name

    with open(os.path.join(working_directory, output_file_name), "w") as f:
        f.write(objectString)
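
# A runnable sketch of the extension filter used above to pick FASTQ files out
# of a directory listing; the file names are made up.
import os

_example_listing = ["reads.fastq", "notes.txt", "sample.fq"]
_example_fastq = [x for x in _example_listing
                  if os.path.splitext(x)[-1] in [".fq", ".fastq", ".fnq"]]
# _example_fastq == ["reads.fastq", "sample.fq"]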
def transform(shock_service_url=None,
              workspace_service_url=None,
              workspace_name=None,
              object_name=None,
              contigset_object_name=None,
              input_directory=None,
              working_directory=None,
              level=logging.INFO,
              logger=None):
    """
    Transforms Genbank file to KBaseGenomes.Genome and KBaseGenomes.ContigSet objects.
    
    Args:
        shock_service_url: If you have shock references you need to make.
        workspace_service_url: KBase Workspace URL
        workspace_name: Name of the workspace to save the data to
        object_name: Name of the genome object to save
        contigset_object_name: Name of the ContigSet object that is created with this Genome
        input_directory: A directory of either a genbank file or a directory of partial genome files to merge
        working_directory: A directory where you can do work
    
    Returns:
        Workspace objects saved to the user's workspace.
    
    Authors:
        Shinjae Yoo, Marcin Joachimiak, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting transformation of Genbank to KBaseGenomes.Genome")

    classpath = [
        "$KB_TOP/lib/jars/kbase/transform/GenBankTransform.jar",
        "$KB_TOP/lib/jars/kbase/genomes/kbase-genomes-20140411.jar",
        "$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.6.jar",
        "$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar",
        "$KB_TOP/lib/jars/kbase/transform/GenBankTransform.jar",
        "$KB_TOP/lib/jars/kbase/auth/kbase-auth-1398468950-3552bb2.jar",
        "$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"
    ]

    mc = "us.kbase.genbank.ConvertGBK"

    argslist = [
        "--shock_url {0}".format(shock_service_url),
        "--workspace_service_url {0}".format(workspace_service_url),
        "--workspace_name {0}".format(workspace_name),
        "--object_name {0}".format(object_name),
        "--working_directory {0}".format(working_directory),
        "--input_directory {0}".format(input_directory)
    ]

    if contigset_object_name is not None:
        argslist.append(
            "--contigset_object_name {0}".format(contigset_object_name))

    arguments = [
        "java", "-classpath", ":".join(classpath),
        "us.kbase.genbank.ConvertGBK", " ".join(argslist)
    ]

    logger.debug(arguments)

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    tool_process = subprocess.Popen(" ".join(arguments),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.error(
            "Transformation from Genbank.Genome to KBaseGenomes.Genome failed on {0}"
            .format(input_directory))
        logger.error(stderr)
        sys.exit(1)

    logger.info(
        "Transformation from Genbank.Genome to KBaseGenomes.Genome completed.")
    sys.exit(0)
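
# A minimal sketch of how the java command line above is assembled; the jar
# paths and names are placeholders and the command is only built, never run.
_example_classpath = ["$KB_TOP/lib/jars/a.jar", "$KB_TOP/lib/jars/b.jar"]
_example_args = ["--workspace_name {0}".format("my_workspace"),
                 "--object_name {0}".format("my_genome")]
_example_cmd = " ".join(["java", "-classpath", ":".join(_example_classpath),
                         "us.kbase.genbank.ConvertGBK", " ".join(_example_args)])
# _example_cmd is the single shell string handed to subprocess.Popen(..., shell=True).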
def run_filter_genes(workspace_service_url=None, param_file = None, level=logging.INFO, logger = None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        param_file: parameter file
        object_name: Name of the object in the workspace 
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """ 

    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
      param = json.load(paramh)

    cmd_dowload_cvt_tsv = [FVE_2_TSV, '--workspace_service_url', workspace_service_url, 
                                      '--workspace_name', param['workspace_name'],
                                      '--object_name', param['object_name'],
                                      '--working_directory', RAWEXPR_DIR,
                                      '--output_file_name', EXPRESS_FN
                          ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
      fl = f.readline()
    ncol = len(fl.split('\t'))
    
    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
      s.write("0")
      for j in range(1,ncol-1):
        s.write("\t{0}".format(j))
      s.write("\n")


    ## Run coex_filter
    cmd_coex_filter = [COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o', "{0}/{1}".format(FLTRD_DIR, FLTRD_FN),
                       '-m', param['method'], '-s', "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN),
                       '-x', "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y']
    if 'num_features' in param:
      cmd_coex_filter.append("-n")
      cmd_coex_filter.append(str(param['num_features']))

    if 'num_features' not in param and 'p_value' in param:
      cmd_coex_filter.append("-p")
      cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
      logger.error("One of p_value or num_features must be defined");
      sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(fl) # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)
    

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url, token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
    
    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    cmd_upload_expr = [TSV_2_FVE, '--workspace_service_url', workspace_service_url, 
                                      '--object_name', param['out_expr_object_name'],
                                      '--working_directory', FINAL_DIR,
                                      '--input_directory', FLTRD_DIR,
                                      '--output_file_name', FINAL_FN
                          ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new({"objects": [{'ref':expr['genome_ref']}]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(expr['genome_ref']))

        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws, obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()
    
    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    
    with open("{0}/{1}".format(FINAL_DIR,FINAL_FN),'r') as et:
      eo = json.load(et)

    if 'description' in expr: expr['description'] = "{0}, coex_filter by {1}".format(expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                          'data' : expr,
                                                                          'name' : (param['out_expr_object_name'])}]})

    ## Upload FeatureSet
    fs ={'description':'Differentially expressed genes generated by {0}'.format(" ".join(cmd_coex_filter)),
         'elements': {}}
    
    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN),'r') as glh:
      gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    for g in gl:
      if 'genome_ref' in expr:
        fs['elements'][g] = [expr['genome_ref']]
      else:
        fs['elements'][g] = []

    ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                          'data' : fs,
                                                                          'name' : (param['out_fs_object_name'])}]})
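
# A self-contained sketch of the tab-separated sample/grouping line written
# above for coex_filter, shown for a hypothetical 5-column expression file
# (one id column plus four data columns).
_ncol = 5
_sample_line = "0" + "".join("\t{0}".format(j) for j in range(1, _ncol - 1)) + "\n"
# _sample_line == "0\t1\t2\t3\n", one label per data column.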
def convert_to_contigs(shock_service_url,
                       handle_service_url,
                       input_file_name,
                       contigset_id,
                       working_directory,
                       shock_id,
                       handle_id,
                       fasta_reference_only,
                       source,
                       level=logging.INFO,
                       logger=None):
    """
    Converts KBaseFile.AssemblyFile to KBaseGenomes.ContigSet and saves to WS.
    Note: the MD5 for each contig is generated from the uppercased sequence.
    The ContigSet MD5 is the MD5 of the sorted list of individual contig MD5s
    joined with a comma separator.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        input_file_name: A file name for the input FASTA data.
        contigset_id: The id of the ContigSet. If not
            specified the name will default to the name of the input file
            appended with "_contig_set"'
        working_directory: The directory the resulting json file will be
            written to.
        shock_id: Shock id for the fasta file if it already exists in shock
        handle_id: Handle id for the fasta file if it already exists as a
            handle
        fasta_reference_only: Creates a reference to the fasta file in Shock,
            but does not store the sequences in the workspace object.
            Not recommended unless the fasta file is larger than 1GB.
            This is the default behavior for files that large.
        level: Logging level, defaults to logging.INFO.
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of FASTA to KBaseGenomes.ContigSet")

    logger.info("Building Object.")

    if not os.path.isfile(input_file_name):
        raise Exception(
            "The input file name {0} is not a file!".format(input_file_name))

    # default if not too large
    contig_set_has_sequences = True
    if fasta_reference_only:
        contig_set_has_sequences = False

    fasta_filesize = os.stat(input_file_name).st_size
    if fasta_filesize > 1000000000:
        # Fasta file too large to save sequences into the ContigSet object.
        contigset_warn = 'The FASTA input file seems to be too large. A ' +\
            'ContigSet object will be created without sequences, but will ' +\
            'contain a reference to the file.'
        logger.warning(contigset_warn)
        contig_set_has_sequences = False

    input_file_handle = open(input_file_name, 'r')
    fasta_header = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    for current_line in input_file_handle:
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_list) and first_header_found:
                raise Exception(
                    "There is no sequence related to FASTA record: {0}".format(
                        fasta_header))
            if not first_header_found:
                first_header_found = True
            else:
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence:
                    raise Exception(
                        "There is no sequence related to FASTA record: " +
                        fasta_header)
                contig_dict = dict()
                contig_dict["id"] = fasta_header
                contig_dict["length"] = len(total_sequence)
                contig_dict["name"] = fasta_header
                contig_dict["description"] = "Note MD5 is generated from " +\
                    "uppercasing the sequence"
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
                contig_dict["md5"] = contig_md5
                contig_set_md5_list.append(contig_md5)
                if contig_set_has_sequences:
                    contig_dict["sequence"] = total_sequence
                else:
                    contig_dict["sequence"] = ""
                fasta_dict[fasta_header] = contig_dict

                # get set up for next fasta sequence
                sequence_list = []
            fasta_header = current_line.replace('>', '').strip()
        else:
            sequence_list.append(current_line)

    input_file_handle.close()

    # wrap up last fasta sequence
    if (not sequence_list) and first_header_found:
        raise Exception(
            "There is no sequence related to FASTA record: {0}".format(
                fasta_header))
    elif not first_header_found:
        raise Exception("There are no contigs in this file")
    else:
        # build up sequence and remove all white space
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence:
            raise Exception("There is no sequence related to FASTA record: " +
                            fasta_header)
        contig_dict = dict()
        contig_dict["id"] = fasta_header
        contig_dict["length"] = len(total_sequence)
        contig_dict["name"] = fasta_header
        contig_dict["description"] = "Note MD5 is generated from " +\
            "uppercasing the sequence"
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"] = contig_md5
        contig_set_md5_list.append(contig_md5)
        if contig_set_has_sequences:
            contig_dict["sequence"] = total_sequence
        else:
            contig_dict["sequence"] = ""
        fasta_dict[fasta_header] = contig_dict

    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(
        sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["id"] = contigset_id
    contig_set_dict["name"] = contigset_id
    s = 'unknown'
    if source and source['source']:
        s = source['source']
    contig_set_dict["source"] = s
    sid = os.path.basename(input_file_name)
    if source and source['source_id']:
        sid = source['source_id']
    contig_set_dict["source_id"] = sid
    contig_set_dict["contigs"] = [
        fasta_dict[x] for x in sorted(fasta_dict.keys())
    ]

    contig_set_dict["fasta_ref"] = shock_id

    logger.info("Conversion completed.")
    return contig_set_dict
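
# A runnable sketch of the MD5 convention described in the docstring above:
# each contig MD5 comes from the uppercased sequence, and the ContigSet MD5 is
# the MD5 of the sorted contig MD5s joined by commas. The sequences are made up.
import hashlib

_seqs = ["acgtACGT", "ttttcccc"]
_contig_md5s = [hashlib.md5(s.upper().encode("utf-8")).hexdigest() for s in _seqs]
_contigset_md5 = hashlib.md5(",".join(sorted(_contig_md5s)).encode("utf-8")).hexdigest()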
    def const_coex_net_clust(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN const_coex_net_clust
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.CLSTR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)

        provenance = [{}]
        if 'provenance' in ctx:
                provenance = ctx['provenance']
        provenance[0]['input_ws_objects']=[workspace_name+'/'+param['object_name']]
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
 
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token
        self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        ncol = len(expr['data']['col_ids'])
        
        # grouping information 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_cluster
        cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y',
                           '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 
                           '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), '-m', "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN) ]
 
        for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method', 'minModuleSize', 'detectCutHeight']:
           if p in param:
             cmd_coex_cluster.append("--{0}".format(p))
             cmd_coex_cluster.append(str(param[p]))
  
 
        #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
              self.logger.info(stderr)
            else:
              self.logger.error(stderr)
              raise Exception(stderr)
 
        
        # build index for gene list
        pos_index ={expr['data']['row_ids'][i]: i for i in range(0, len(expr['data']['row_ids']))}
 
 
        # parse clustering results
        cid2genelist = {}
        cid2stat = {}
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                cluster, mcor, msec = line.rstrip().replace('"','').split("\t")
                cid2stat[cluster]= [mcor, msec]
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                gene, cluster = line.rstrip().replace('"','').split("\t")
                if cluster not in cid2genelist:
                    cid2genelist[cluster] = []
                cid2genelist[cluster].append(gene)
 
        if(len(cid2genelist) < 1) :
          self.logger.error("Clustering failed")
          return error_report("Error: No cluster output", expr,self.__WS_URL, workspace_name, provenance, ws)
          #sys.exit(4)
 
        self.logger.info("Uploading the results onto WS")
        feature_clusters = []
        for cluster in cid2genelist:
            feature_clusters.append({"meancor": float(cid2stat[cluster][0]),
                                     "msec": float(cid2stat[cluster][1]),
                                     "id_to_pos": {gene: pos_index[gene] for gene in cid2genelist[cluster]}})

        ## Upload Clusters
        feature_clusters ={"original_data": "{0}/{1}".format(workspace_name,param['object_name']),
                           "feature_clusters": feature_clusters}
 
        cl_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseFeatureValues.FeatureClusters',
                                                                          'data' : feature_clusters,
                                                                          'name' : (param['out_object_name'])}]})[0]
        ## Create report object:
        report = "Clustering expression matrix using WGCNA on {0}".format(param['object_name'])
        reportObj = {
                        'objects_created':[                             {
                                'ref':"{0}/{1}/{2}".format(cl_info[6], cl_info[0], cl_info[4]),
                                'description':'WGCNA FeatureClusters' 
                             }],
                        'text_message':report
                    }

        # generate a unique name for the Method report
        reportName = 'WGCNA_Clusters_'+str(hex(uuid.getnode()))
        report_info = ws.save_objects({
                                        'id':cl_info[6],
                                        'objects':[
                                        {
                                        'type':'KBaseReport.Report',
                                        'data':reportObj,
                                        'name':reportName,
                                        'meta':{},
                                        'hidden':1, 
                                        'provenance':provenance
                                        }
                                        ]
                                        })[0]

        result = { "report_name" : reportName,"report_ref" : "{0}/{1}/{2}".format(report_info[6],report_info[0],report_info[4]) }
        #result = {'workspace_name' : workspace_name, 'out_object_name' : param['out_object_name']}
        #result = {'workspace' : workspace_name, 'output' : param['out_object_name']}
        #END const_coex_net_clust

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method const_coex_net_clust return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
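# A minimal sketch (the helper name ws_obj_ref is hypothetical) of how the object
# references above are assembled: in the object_info tuples returned by
# Workspace.save_objects, position 6 is the workspace id, 0 the object id and
# 4 the version, so "wsid/objid/version" uniquely addresses the saved object.
def ws_obj_ref(object_info):
    return "{0}/{1}/{2}".format(object_info[6], object_info[0], object_info[4])

# e.g. result = {"report_name": reportName, "report_ref": ws_obj_ref(report_info)}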
Exemple #22
    def filter_genes(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN filter_genes
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)
 
        provenance = [{}]
        if 'provenance' in ctx:
                provenance = ctx['provenance']
        provenance[0]['input_ws_objects']=[workspace_name+'/'+param['object_name']]
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
 
        self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        ncol = len(expr['data']['col_ids'])
        
        # force the method to ANOVA if the number of samples is two
        if(ncol == 3): param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                           '-m', param['method'], '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y']
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
        if 'p_value' not in param and 'num_features' not in param:
          self.logger.error("One of p_value or num_features must be defined");
          return error_report("One of p_value or num_features must be defined", expr,self.__WS_URL, workspace_name, provenance, ws)
          #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## checking genelist
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN),'r') as glh:
          gl = glh.readlines()
        gl = [x.strip('\n') for x in gl]
 
        if(len(gl) < 1) :
          self.logger.error("No genes are selected")
          return error_report("Increase p_value or specify num_features", expr,self.__WS_URL, workspace_name, provenance, ws)
          #sys.exit(4)
 
        ## Upload FVE
        if 'description' not in expr: 
            expr['description'] = "Filtered Expression Matrix"
        expr['description'] += " : Filtered by '{0}' method ".format(param['method'])
 
        expr = self._subselectExp(expr, gl)
 
        ex_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseFeatureValues.ExpressionMatrix',
                                                                              'data' : expr,
                                                                              'name' : (param['out_expr_object_name'])}]})[0]
 
        ## Upload FeatureSet
        fs ={'elements': {}}
        fs['description'] = "FeatureSet identified by filtering method '{0}' ".format(param['method'])
 
        fs['description'] += "from {0}/{1}".format(workspace_name, param['object_name'])
 
        for g in gl:
          if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
          else:
            fs['elements'][g] = []
 
        fs_info = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'KBaseCollections.FeatureSet',
                                                                              'data' : fs,
                                                                              'name' : (param['out_fs_object_name'])}]})[0]

        ## Create report object:
        report = "Filtering expression matrix using {0} on {1}".format(param['method'],param['object_name'])
        reportObj = {
                        'objects_created':[{
                                'ref':"{0}/{1}/{2}".format(fs_info[6], fs_info[0], fs_info[4]),
                                'description':'Filtered FeatureSet' },
                             {
                                'ref':"{0}/{1}/{2}".format(ex_info[6], ex_info[0], ex_info[4]),
                                'description':'Filtered ExpressionMatrix'
                             }],
                        'text_message':report
                    }

        # generate a unique name for the Method report
        reportName = 'FilterExpression_'+str(hex(uuid.getnode()))
        report_info = ws.save_objects({
                                        'id':ex_info[6],
                                        'objects':[
                                        {
                                        'type':'KBaseReport.Report',
                                        'data':reportObj,
                                        'name':reportName,
                                        'meta':{},
                                        'hidden':1, 
                                        'provenance':provenance
                                        }
                                        ]
                                        })[0]

        result = { "report_name" : reportName,"report_ref" : "{0}/{1}/{2}".format(report_info[6],report_info[0],report_info[4]) }



        #result = {'workspace_name' : workspace_name, 'out_expr_object_name' : param['out_expr_object_name'], 'out_fs_object_name' : param['out_fs_object_name']}
        #END filter_genes

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method filter_genes return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
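# A minimal sketch (gene ids, method name and genome reference are hypothetical)
# of the KBaseCollections.FeatureSet dictionary built above: 'elements' maps each
# retained feature id to a list of genome references, left empty when the source
# ExpressionMatrix carries no genome_ref.
example_feature_set = {
    "description": "FeatureSet identified by filtering method 'anova' from my_ws/my_expression_matrix",
    "elements": {
        "gene_0001": ["123/4/5"],
        "gene_0002": ["123/4/5"],
    },
}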
Exemple #23
    def diff_p_distribution(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN diff_p_distribution
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
 
 
        self._dumpExp2File(expr, self.RAWEXPR_DIR, self.EXPRESS_FN)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        ncol = len(expr['data']['col_ids'])
        
        # force the method to ANOVA if the number of samples is two
        if(ncol == 3): param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
           '-m', param['method'], '-n', '10', '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y', '-j', self.PVFDT_FN]
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## loading pvalue distribution FDT
        pvfdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]};
        pvfdt = OrderedDict(pvfdt)
        with open(self.PVFDT_FN, 'r') as myfile:
           pvfdt = json.load(myfile)
        data_obj_name = "{0}.fdt".format(param['out_figure_object_name'])
        pvfdt['id'] = data_obj_name
 
 
        fig_properties = {"xlabel" : "-log2(p-value)", "ylabel" : "Number of features", "xlog_mode" : "-log2", "ylog_mode" : "none", "title" : "Histogram of P-values", "plot_type" : "histogram"}
        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : pvfdt,
                                                                              'name' : data_obj_name}]})

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              'name' : (param['out_figure_object_name'])}]})
        result = fig_properties
        #END diff_p_distribution

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method diff_p_distribution return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
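        # Failure path of the download task runner: record the error status and
        # traceback, report the exception through handler_utils, mark the UJS job
        # as failed, and exit.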
        error_object["status"] = "ERROR : {0}".format(e.message)[:handler_utils.UJS_STATUS_MAX]
        error_object["error_message"] = traceback.format_exc()

        handler_utils.report_exception(logger, error_object, cleanup_details)

        ujs.complete_job(ujs_job_id, 
                         kb_token, 
                         "Download from {0} failed.".format(workspace_name), 
                         traceback.format_exc(), 
                         None)
        sys.exit(1)                                  



if __name__ == "__main__":
    logger = script_utils.stderrlogger(__file__, level=logging.DEBUG)
    
    script_details = script_utils.parse_docs(download_taskrunner.__doc__)
        
    parser = script_utils.ArgumentParser(description=script_details["Description"],
                                         epilog=script_details["Authors"])
    # provided by service config
    parser.add_argument('--workspace_service_url', 
                        help=script_details["Args"]["workspace_service_url"], 
                        action='store', 
                        required=True)
    parser.add_argument('--ujs_service_url', 
                        help=script_details["Args"]["ujs_service_url"], 
                        action='store', 
                        required=True)
    
def convert_to_contigs(shock_service_url, handle_service_url, input_file_name,
                       contigset_id, working_directory, shock_id,
                       handle_id, fasta_reference_only, source,
                       level=logging.INFO, logger=None):
    """
    Converts KBaseFile.AssemblyFile to KBaseGenomes.ContigSet and saves to WS.
    Note the MD5 for each contig is generated from the uppercased sequence.
    The ContigSet MD5 is the MD5 of the sorted list of individual contig MD5s
    joined with a comma separator.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        input_file_name: A file name for the input FASTA data.
        contigset_id: The id of the ContigSet. If not
            specified the name will default to the name of the input file
            appended with "_contig_set".
        working_directory: The directory the resulting json file will be
            written to.
        shock_id: Shock id for the fasta file if it already exists in shock
        handle_id: Handle id for the fasta file if it already exists as a
            handle
        fasta_reference_only: Creates a reference to the fasta file in Shock,
            but does not store the sequences in the workspace object.
            Not recommended unless the fasta file is larger than 1GB.
            This is the default behavior for files that large.
        level: Logging level, defaults to logging.INFO.
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of FASTA to KBaseGenomes.ContigSet")

    logger.info("Building Object.")

    if not os.path.isfile(input_file_name):
        raise Exception("The input file name {0} is not a file!".format(
            input_file_name))

    # default if not too large
    contig_set_has_sequences = True
    if fasta_reference_only:
        contig_set_has_sequences = False

    fasta_filesize = os.stat(input_file_name).st_size
    if fasta_filesize > 1000000000:
        # Fasta file too large to save sequences into the ContigSet object.
        contigset_warn = 'The FASTA input file seems to be too large. A ' +\
            'ContigSet object will be created without sequences, but will ' +\
            'contain a reference to the file.'
        logger.warning(contigset_warn)
        contig_set_has_sequences = False

    input_file_handle = open(input_file_name, 'r')
    fasta_header = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    for current_line in input_file_handle:
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_list) and first_header_found:
                raise Exception(
                    "There is no sequence related to FASTA record: {0}".format(
                        fasta_header))
            if not first_header_found:
                first_header_found = True
            else:
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence:
                    raise Exception(
                        "There is no sequence related to FASTA record: " +
                        fasta_header)
                contig_dict = dict()
                contig_dict["id"] = fasta_header
                contig_dict["length"] = len(total_sequence)
                contig_dict["name"] = fasta_header
                contig_dict["description"] = "Note MD5 is generated from " +\
                    "uppercasing the sequence"
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
                contig_dict["md5"] = contig_md5
                contig_set_md5_list.append(contig_md5)
                if contig_set_has_sequences:
                    contig_dict["sequence"] = total_sequence
                else:
                    contig_dict["sequence"] = ""
                fasta_dict[fasta_header] = contig_dict

                # get set up for next fasta sequence
                sequence_list = []
            fasta_header = current_line.replace('>', '').strip()
        else:
            sequence_list.append(current_line)

    input_file_handle.close()

    # wrap up last fasta sequence
    if (not sequence_list) and first_header_found:
        raise Exception(
            "There is no sequence related to FASTA record: {0}".format(
                fasta_header))
    elif not first_header_found:
        raise Exception("There are no contigs in this file")
    else:
        # build up sequence and remove all white space
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence:
            raise Exception(
                "There is no sequence related to FASTA record: " +
                fasta_header)
        contig_dict = dict()
        contig_dict["id"] = fasta_header
        contig_dict["length"] = len(total_sequence)
        contig_dict["name"] = fasta_header
        contig_dict["description"] = "Note MD5 is generated from " +\
            "uppercasing the sequence"
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"] = contig_md5
        contig_set_md5_list.append(contig_md5)
        if contig_set_has_sequences:
            contig_dict["sequence"] = total_sequence
        else:
            contig_dict["sequence"] = ""
        fasta_dict[fasta_header] = contig_dict

    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(sorted(
        contig_set_md5_list))).hexdigest()
    contig_set_dict["id"] = contigset_id
    contig_set_dict["name"] = contigset_id
    s = 'unknown'
    if source and source['source']:
        s = source['source']
    contig_set_dict["source"] = s
    sid = os.path.basename(input_file_name)
    if source and source['source_id']:
        sid = source['source_id']
    contig_set_dict["source_id"] = sid
    contig_set_dict["contigs"] = [fasta_dict[x] for x in sorted(
        fasta_dict.keys())]

    contig_set_dict["fasta_ref"] = shock_id

    logger.info("Conversion completed.")
    return contig_set_dict
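# A standalone sketch (made-up sequences, Python 2 like the surrounding code) of
# the MD5 scheme described in the docstring above: each contig MD5 is the MD5 of
# the uppercased sequence, and the ContigSet MD5 is the MD5 of the sorted contig
# MD5s joined with commas.
import hashlib

sequences = ["acgtACGT", "ttttGGGG"]
contig_md5s = [hashlib.md5(seq.upper()).hexdigest() for seq in sequences]
contig_set_md5 = hashlib.md5(",".join(sorted(contig_md5s))).hexdigest()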
def main():
    parser = script_utils.ArgumentParser(
        prog=SCRIPT_NAME,
        description='Converts KBaseFile.AssemblyFile to  ' +
        'KBaseGenomes.ContigSet.',
        epilog='Authors: Jason Baumohl, Matt Henderson, Gavin Price')
    # The following 7 arguments should be standard to all uploaders
    parser.add_argument(
        '--working_directory',
        help='Directory for temporary files',
        action='store', type=str, required=True)

    # Example of a custom argument specific to this uploader
    parser.add_argument('--workspace_service_url',
                        help='workspace service url',
                        action='store', type=str, required=True)
    parser.add_argument(
        '--source_workspace_name', help='name of the source workspace',
        action='store', type=str, required=True)
    parser.add_argument(
        '--destination_workspace_name', help='name of the target workspace',
        action='store', type=str, required=True)
    parser.add_argument(
        '--source_object_name',
        help='name of the workspace object to convert',
        action='store', type=str, required=True)
    parser.add_argument(
        '--destination_object_name',
        help='name for the produced ContigSet.',
        action='store', type=str, required=True)

    parser.add_argument(
        '--fasta_reference_only',
        help='Creates a reference to the fasta file in Shock, but does not ' +
        'store the sequences in the workspace object.  Not recommended ' +
        'unless the fasta file is larger than 1GB. This is the default ' +
        'behavior for files that large.', action='store_true', required=False)

    # ignore unknown arguments for now
    args, _ = parser.parse_known_args()

    logger = script_utils.stderrlogger(__file__)
    try:
        # make sure there's at least something for a token
        if not TOKEN:
            raise Exception("Unable to retrieve KBase Authentication token!")

        shock_url, shock_id, ref, source = download_workspace_data(
            args.workspace_service_url,
            args.source_workspace_name,
            args.source_object_name,
            args.working_directory,
            logger)

        inputfile = os.path.join(args.working_directory,
                                 args.source_object_name)

        cs = convert_to_contigs(
            None, None, inputfile,
            args.destination_object_name, args.working_directory,
            shock_id, None, args.fasta_reference_only, source, logger=logger)

        upload_workspace_data(
            cs, args.workspace_service_url, ref,
            args.destination_workspace_name, args.destination_object_name)
    except Exception, e:
        logger.exception(e)
        sys.exit(1)
def validate(input_directory,
             working_directory,
             level=logging.INFO,
             logger=None):
    """
    Validates a FASTA file of nucleotide sequences.

    Args:
        input_directory: A directory containing one or more FASTA files.
        working_directory: A directory where any output files produced by validation can be written.
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Currently writes to stderr with a Java Exception trace on error, otherwise no output.
    
    Authors:
        Srividya Ramikrishnan, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    extensions = [".fa", ".fasta", ".fna"]

    validated = False
    for input_file_name in os.listdir(input_directory):
        logger.info("Checking for FASTA file : {0}".format(input_file_name))

        filePath = os.path.join(os.path.abspath(input_directory),
                                input_file_name)

        if not os.path.isfile(filePath):
            logger.warning("Skipping directory {0}".format(input_file_name))
            continue
        elif os.path.splitext(input_file_name)[-1] not in extensions:
            logger.warning("Unrecognized file type, skipping.")
            continue

        logger.info("Starting FASTA validation of {0}".format(input_file_name))

        # TODO This needs to be changed, this is really just a demo program for this library and not a serious tool
        java_classpath = os.path.join(
            os.environ.get("KB_TOP"),
            "lib/jars/FastaValidator/FastaValidator-1.0.jar")
        arguments = [
            "java", "-classpath", java_classpath, "FVTester", filePath
        ]

        tool_process = subprocess.Popen(arguments, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()

        if len(stderr) > 0:
            logger.error("Validation failed on {0}".format(input_file_name))
        else:
            logger.info("Validation passed on {0}".format(input_file_name))
            validated = True

    if not validated:
        raise Exception("Validation failed!")
    else:
        logger.info("Validation passed.")
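# A hedged usage sketch for validate(); the directory paths are hypothetical and
# KB_TOP must point at a deployment that provides the FastaValidator jar.
validate(input_directory="/tmp/fasta_inputs",
         working_directory="/tmp/fasta_work",
         logger=script_utils.stderrlogger(__file__))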
    def view_heatmap(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN view_heatmap
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Loading data")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        fc = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
        if 'original_data' not in fc:
            raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix")
        oexpr = ws.get_objects([{ 'ref' : fc['original_data']}])[0]

        df2 = pd.DataFrame(oexpr['data']['data']['values'], index=oexpr['data']['data']['row_ids'], columns=oexpr['data']['data']['col_ids'])
#        cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, 
#                                          '--workspace_name', oexpr['info'][7],
#                                          '--object_name', oexpr['info'][1],
#                                          '--working_directory', self.RAWEXPR_DIR,
#                                          '--output_file_name', self.EXPRESS_FN
#                              ]
# 
#        # need shell in this case because the java code is depending on finding the KBase token in the environment
#        #  -- copied from FVE_2_TSV
#        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
#        stdout, stderr = tool_process.communicate()
#        
#        if stdout is not None and len(stdout) > 0:
#            self.logger.info(stdout)
# 
#        if stderr is not None and len(stderr) > 0:
#            self.logger.info(stderr)
# 
#        df = pd.read_csv("{0}/{1}".format(self.RAWEXPR_DIR,self.EXPRESS_FN), sep='\t')
#        df2 = df[df.columns[1:]]
#        rn = df[df.columns[0]]
#        df2.index = rn

        # L2 normalization
        df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)

        # type - ? level, ratio, log-ratio  <---> "untransformed"
        # scale - ? probably: raw, ln, log2, log10
        self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],oexpr['data']['scale'] ))
        if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed': # need to compute fold changes
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
        elif oexpr['data']['type'] == 'ratio':
            fc_df = df2.apply(np.log2)
        elif oexpr['data']['type'] == 'log-ratio':
            fc_df = df2
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df/np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df/np.log(2)
            else:
                pass

        else: # do the same thing with simple level or untransformed
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
       
        self.logger.info("Compute cluster statistics")

        cl = {}
        afs = [];
        cid = 1;

        c_stat = pd.DataFrame()
        for cluster in fc['feature_clusters']:
         
          try: 
            fs  = cluster['id_to_pos'].keys()
          except:
            continue # couldn't find feature_set

          fsn = "Cluster_{0}".format(cid)
          cid +=1
          c_stat.loc[fsn,'size'] = len(fs)
          if 'meancor' in cluster:
              c_stat.loc[fsn,'mcor'] = cluster['meancor']
          else:
            pass
            # TODO: Add mean cor calculation later
            #raise Exception("Mean correlation is not included in FeatureCluster object") # now it is NaN

          if 'quantile' in param:
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(float(param['quantile']))
          else:
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(0.75)
         

          c1 = df3.loc[fs,].sum(axis=0)
          if df3.loc[fs,].shape[0] < 1: # empty
            continue
          cl[fsn] = fs
          #afs.extend(fs)

          #c1 = df3.loc[fs,].sum(axis=0)
          #c1 = c1 / np.sqrt(c1.pow(2).sum())
          #if(len(cl.keys()) == 1):
          #  centroids = c1.to_frame(fsn).T
          #else:
          #  centroids.loc[fsn] = c1

        # now we have centroids and statistics
        # let's subselect clusters
        min_features = 200
        if 'min_features' in param :
          min_features = param['min_features']
        
        c_stat.loc[:,'nmcor'] = c_stat.loc[:,'mcor'] / c_stat.loc[:,'mcor'].max()
        c_stat.loc[:,'nstdstat'] = c_stat.loc[:,'stdstat'] / c_stat.loc[:,'stdstat'].max()
        
        if 'use_norm_weight' in param and param['use_norm_weight'] != 0:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:,'nstdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + 1.0                             * c_stat.loc[:,'nstdstat']
        else:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + float(param['quantile_weight']) * c_stat.loc[:,'stdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + 0.1                             * c_stat.loc[:,'stdstat']

        c_stat.sort_values('weight', inplace=True, ascending=False)

        pprint(c_stat)

        centroids = pd.DataFrame()
        for i in range(c_stat.shape[0]):
            fsn = c_stat.index[i]
            fs = cl[fsn]
            if i != 0 and len(afs) + len(fs) > min_features :
                break;
           
            afs.extend(fs)

            c1 = df3.loc[fs,].sum(axis=0)
            c1 = c1 / np.sqrt(c1.pow(2).sum())
            if(centroids.shape[0] < 1):
              centroids = c1.to_frame(fsn).T
            else:
              centroids.loc[fsn] = c1
           
        pprint(centroids)
        
        if len(cl.keys()) == 0:
            raise Exception("No feature ids were mapped to dataset or no clusters were selected")
        
        # dataset centroid
        dc = df3.loc[afs,].sum(axis=0)
        dc = dc / np.sqrt(dc.pow(2).sum())
    
        
        self.logger.info("Ordering Centroids and Data")
        # the cluster centroid farthest from the dataset centroid
        fc = (centroids * dc).sum(axis=1).idxmin()
        # the centroid farthest from fc
        ffc = (centroids * centroids.loc[fc,]).sum(axis=1).idxmin()
        
        # major direction to order on unit ball space
        md = centroids.loc[ffc,] - centroids.loc[fc,]
        
        # unnormalized component of the projection onto the major direction (the |md| factor is dropped because it is the same for every point)
        corder = (centroids * md).sum(axis=1).sort_values() # cluster order
        coidx = corder.index
        
        dorder =(df3.loc[afs,] * md).sum(axis=1).sort_values() # data order
        
        # get first fs table    
        fig_properties = {"xlabel" : "Conditions", "ylabel" : "Features", "xlog_mode" : "none", "ylog_mode" : "none", "title" : "Log Fold Changes", "plot_type" : "heatmap", 'ygroup': []}
        fig_properties['ygtick_labels'] = coidx.tolist()

        if 'fold_change' in param and param['fold_change'] == 1:
            frange = 2
            if 'fold_change_range' in param:
                frange = float(param['fold_change_range'])
            final=fc_df.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = fc_df.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)

            if 'fold_cutoff' in param and param['fold_cutoff'] == 1:
                final[final > frange] = frange
                final[final < - frange] = - frange
            else:
                fc_df0b = final.sub(final.min(axis=1), axis=0)
                final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange
        else:
            final=df2.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = df2.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)
        
        ## loading pvalue distribution FDT
        fdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]};
        #fdt = OrderedDict(fdt)
        fdt['data'] = final.T.as_matrix().tolist() # make sure Transpose
        fdt['row_labels'] = final.columns.tolist()
        fdt['column_labels'] = final.index.tolist()
        # TODO: Add group label later
        fdt['id'] = "{0}.fdt".format(param['out_figure_object_name'])
 
        self.logger.info("Saving the results")
        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : fdt,
                                                                              'name' : "{0}.fdt".format(param['out_figure_object_name'])}]})

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              'name' : (param['out_figure_object_name'])}]})
        result = fig_properties
        #END view_heatmap

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method view_heatmap return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
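# A minimal sketch (tiny made-up matrix, hypothetical row/column names) of the two
# transformations used above: row-wise L2 normalization for centroid computation,
# and a pseudo-counted log2 fold change against a control column for display.
import numpy as np
import pandas as pd

df2 = pd.DataFrame([[0.0, 2.0, 4.0], [1.0, 1.0, 8.0]],
                   index=["gene_a", "gene_b"], columns=["c0", "c1", "c2"])

# divide each row by its Euclidean norm
df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)

# add an eighth of the smallest non-zero magnitude as a pseudo-count, then take
# the log2 ratio of every column to the control column "c0"
factor = 0.125
fc_df = df2 + df2[df2 != 0].abs().min().min() * factor
fc_df = fc_df.div(fc_df.loc[:, "c0"], axis=0).apply(np.log2)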
def transform(shock_service_url=None, handle_service_url=None, 
              output_file_name=None, input_directory=None, 
              working_directory=None, shock_id=None, handle_id=None, 
              input_mapping=None, fasta_reference_only=False, 
              level=logging.INFO, logger=None):
    """
    Converts FASTA file to KBaseGenomes.ContigSet json string.  
    Note the MD5 for each contig is generated from the uppercased sequence.
    The ContigSet MD5 is the MD5 of the sorted list of individual contig MD5s
    joined with a comma separator.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be stored.  
                          If the output file name is not specified the name will default 
                          to the name of the input file appended with '_contig_set'
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be written to.
        shock_id: Shock id for the fasta file if it already exists in shock
        handle_id: Handle id for the fasta file if it already exists as a handle
        input_mapping: JSON string mapping of input files to expected types.  
                       If you don't get this you need to scan the input 
                       directory and look for your files.
        fasta_reference_only: Creates a reference to the fasta file in Shock, but does not store the sequences in the workspace object.  Not recommended unless the fasta file is larger than 1GB. This is the default behavior for files that large.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.

    Authors:
        Jason Baumohl, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of FASTA to KBaseGenomes.ContigSet")
    token = os.environ.get('KB_AUTH_TOKEN')
        
    if input_mapping is None:
        logger.info("Scanning for FASTA files.")
    
        valid_extensions = [".fa",".fasta",".fna"]
    
        files = os.listdir(input_directory)
        fasta_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]
            
        assert len(fasta_files) != 0
    
        logger.info("Found {0}".format(str(fasta_files)))

        input_file_name = os.path.join(input_directory, fasta_files[0])
    
        if len(fasta_files) > 1:
            logger.warning("Not sure how to handle multiple FASTA files in this context. Using {0}".format(input_file_name))
    else:
        input_file_name = os.path.join(os.path.join(input_directory, "FASTA.DNA.Assembly"), simplejson.loads(input_mapping)["FASTA.DNA.Assembly"])
        
                
    logger.info("Building Object.")
 
    if not os.path.isfile(input_file_name):
        raise Exception("The input file name {0} is not a file!".format(input_file_name))        

    if not os.path.isdir(working_directory):
        raise Exception("The working directory {0} is not a valid directory!".format(working_directory))        

    logger.debug(fasta_reference_only)

    # default if not too large
    contig_set_has_sequences = True 
    if fasta_reference_only:
        contig_set_has_sequences = False 

    fasta_filesize = os.stat(input_file_name).st_size
    if fasta_filesize > 1000000000:
        # Fasta file too large to save sequences into the ContigSet object.
        contigset_warn = """The FASTA input file seems to be too large. A ContigSet
                            object will be created without sequences, but will
                            contain a reference to the file."""
        logger.warning(contigset_warn) 
        contig_set_has_sequences = False 

    input_file_handle = open(input_file_name, 'r')
    
    fasta_header = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    sequence_exists = False
    
    valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNn"
    amino_acid_specific_characters = "PpLlIiFfQqEe" 

    for current_line in input_file_handle:
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_exists) and first_header_found:
                logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
                raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
            if not first_header_found:
                first_header_found = True
            else:
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence :
                    logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
                    raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
                for character in total_sequence:
                    if character not in valid_chars:
                        if character in amino_acid_specific_characters:
                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
#                fasta_key = fasta_header.strip()
                try:
                    fasta_key , fasta_description = fasta_header.strip().split(' ',1)
                except:
                    fasta_key = fasta_header.strip()
                    fasta_description = None
                contig_dict = dict() 
                contig_dict["id"] = fasta_key 
                contig_dict["length"] = len(total_sequence) 
                contig_dict["name"] = fasta_key 
                if fasta_description is None:
                    contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
                else:
                    contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description) 
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest() 
                contig_dict["md5"] = contig_md5 
                contig_set_md5_list.append(contig_md5)
                 
                if contig_set_has_sequences: 
                    contig_dict["sequence"]= total_sequence
                else: 
                    contig_dict["sequence"]= ""
                
                fasta_dict[fasta_key] = contig_dict
               
                # get set up for next fasta sequence
                sequence_list = []
                sequence_exists = False
            
            fasta_header = current_line.replace('>','')
        else:
            sequence_list.append(current_line)
            sequence_exists = True

    input_file_handle.close()

    # wrap up last fasta sequence
    if (not sequence_exists) and first_header_found: 
        logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
        raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
    elif not first_header_found :
        logger.error("There are no contigs in this file") 
        raise Exception("There are no contigs in this file") 
    else: 
        # build up sequence and remove all white space      
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence :
            logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
            raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 

        for character in total_sequence: 
            if character not in valid_chars: 
                if character in amino_acid_specific_characters:
                    raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

#        fasta_key = fasta_header.strip()
        try: 
            fasta_key , fasta_description = fasta_header.strip().split(' ',1)
        except:
            fasta_key = fasta_header.strip()
            fasta_description = None
        contig_dict = dict()
        contig_dict["id"] = fasta_key 
        contig_dict["length"] = len(total_sequence)
        contig_dict["name"] = fasta_key
        if fasta_description is None:
            contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
        else:
            contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description)
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"]= contig_md5
        contig_set_md5_list.append(contig_md5)
        
        if contig_set_has_sequences: 
            contig_dict["sequence"] = total_sequence 
        else:
            contig_dict["sequence"]= ""
         
        fasta_dict[fasta_key] = contig_dict 


    if output_file_name is None:
        # default to the input file name minus its extension, with "_contig_set" appended
        base = os.path.basename(input_file_name)
        output_file_name = "{0}_contig_set.json".format(os.path.splitext(base)[0])
    
    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["id"] = output_file_name
    contig_set_dict["name"] = output_file_name
    contig_set_dict["source"] = "KBase"
    contig_set_dict["source_id"] = os.path.basename(input_file_name) 
    contig_set_dict["contigs"] = [fasta_dict[x] for x in sorted(fasta_dict.keys())]

    if shock_id is None:
        shock_info = script_utils.upload_file_to_shock(logger, shock_service_url, input_file_name, token=token)
        shock_id = shock_info["id"]
    
    contig_set_dict["fasta_ref"] = shock_id

    # For future development if the type is updated to the handle_reference instead of a shock_reference

    # This generates the json for the object
    objectString = simplejson.dumps(contig_set_dict, sort_keys=True, indent=4)

    logger.info("ContigSet data structure creation completed.  Writing out JSON.")

    output_file_path = os.path.join(working_directory,output_file_name) 
    with open(output_file_path, "w") as outFile:
        outFile.write(objectString)
    
    logger.info("Conversion completed.")
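# A standalone sketch (hypothetical sequence) of the per-record character check
# applied above: only IUPAC nucleotide codes and '-' are accepted, and a few
# amino-acid-only letters trigger the more specific "looks like a protein FASTA"
# error instead of the generic one.
valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNn"
amino_acid_specific_characters = "PpLlIiFfQqEe"

def check_nucleotide_sequence(sequence):
    for character in sequence:
        if character not in valid_chars:
            if character in amino_acid_specific_characters:
                raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
            raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

check_nucleotide_sequence("ACGTRYNacgt-")  # passes silently for valid nucleotide codes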
Exemple #30
    def view_heatmap(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN view_heatmap
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Loading data")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args

        auth_client = _KBaseAuth(self.__AUTH_SERVICE_URL)
        user_id = auth_client.get_user(token)
        workspace_name_t = Template(param['workspace_name'])
        workspace_name = workspace_name_t.substitute(user_id=user_id)
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        fc = ws.get_objects([{'workspace': workspace_name, 'name' : param['object_name']}])[0]['data']
        if 'original_data' not in fc:
            raise Exception("FeatureCluster object does not have information for the original ExpressionMatrix")
        oexpr = ws.get_objects([{ 'ref' : fc['original_data']}])[0]

        df2 = pd.DataFrame(oexpr['data']['data']['values'], index=oexpr['data']['data']['row_ids'], columns=oexpr['data']['data']['col_ids'])

        # L2 normalization
        df3 = df2.div(df2.pow(2).sum(axis=1).pow(0.5), axis=0)

        # type - ? level, ratio, log-ratio  <---> "untransformed"
        # scale - ? probably: raw, ln, log2, log10
        self.logger.info("Expression matrix type: {0}, scale: {1}".format(oexpr['data']['type'],oexpr['data']['scale'] ))
        # do default behavior
        factor = 0.125
        fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
        if param['control_condition']  in fc_df.columns:
            fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
        else:
            fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
        # now fc_df will be reset
        if oexpr['data']['type'] == 'level' or oexpr['data']['type'] == 'untransformed': # need to compute fold changes
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
        elif oexpr['data']['type'] == 'ratio':
            fc_df = df2.apply(np.log2)
        elif oexpr['data']['type'] == 'log-ratio':
            fc_df = df2
            if oexpr['data']['scale'] == "log10":
                fc_df = fc_df/np.log10(2)
            elif oexpr['data']['scale'] == "ln":
                fc_df = fc_df/np.log(2)
            else:
                pass

        else: # do the same thing with simple level or untransformed
            if 'scale' not in oexpr['data'] or oexpr['data']['scale'] == 'raw' or oexpr['data']['scale'] == "1.0":
              factor = 0.125
              fc_df = df2 + df2[df2 !=0].abs().min().min() * factor
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[param['control_condition']]], axis=0)).apply(np.log2)
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0)).apply(np.log2)
            else:
              fc_df = df2
              if param['control_condition']  in fc_df.columns:
                  fc_df = (fc_df.div(fc_df.loc[:, param['control_condition']], axis=0))
              else:
                  fc_df = (fc_df.div(fc_df.loc[:,fc_df.columns[0]], axis=0))
              if oexpr['data']['scale'] == "log10":
                  fc_df = fc_df/np.log10(2)
              elif oexpr['data']['scale'] == "ln":
                  fc_df = fc_df/np.log(2)
              else:
                  pass
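        # At this point fc_df holds log2 fold changes relative to the control condition
        # (or the first column when no control is given).  Ratios stored in log10 or ln
        # are converted to log2 by dividing by log10(2) or ln(2), e.g. a log10 ratio of
        # 1.0 (a 10-fold change) becomes 1.0 / log10(2) ~= 3.32 = log2(10).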
       
        self.logger.info("Compute cluster statistics")

        cl = {}
        afs = []
        cid = 1

        c_stat = pd.DataFrame()
        for cluster in fc['feature_clusters']:
         
          try: 
            fs  = cluster['id_to_pos'].keys()
          except KeyError:
            continue  # cluster has no 'id_to_pos' mapping (no feature set); skip it

          fsn = "Cluster_{0}".format(cid)
          cid +=1
          c_stat.loc[fsn,'size'] = len(fs)
          if 'meancor' in cluster:
              c_stat.loc[fsn,'mcor'] = cluster['meancor']
          else:
            pass
            # TODO: Add mean cor calculation later
            #raise Exception("Mean correlation is not included in FeatureCluster object") # now it is NaN

          if 'quantile' in param:
              # enforce quantile to be in the [0 .. 1] range
              qt = float(param['quantile'])
              if qt > 1.0: qt = 1.0
              if qt < 0.0: qt = 0.0
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(qt)
          else:
              c_stat.loc[fsn,'stdstat'] = fc_df.loc[fs,].std(axis=1).quantile(0.75)
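          # Per-cluster statistics: 'size' is the number of features, 'mcor' the stored
          # mean correlation (if any), and 'stdstat' the chosen quantile (default 0.75)
          # of the per-feature standard deviations of the log2 fold changes, i.e. how
          # variable the cluster's features are across conditions.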
         

          c1 = df3.loc[fs,].sum(axis=0)
          if df3.loc[fs,].shape[0] < 1: # empty
            continue
          cl[fsn] = fs
          #afs.extend(fs)

          #c1 = df3.loc[fs,].sum(axis=0)
          #c1 = c1 / np.sqrt(c1.pow(2).sum())
          #if(len(cl.keys()) == 1):
          #  centroids = c1.to_frame(fsn).T
          #else:
          #  centroids.loc[fsn] = c1

        # we now have per-cluster statistics; next, sub-select clusters
        # (their centroids are computed during the selection below)
        min_features = 200
        if 'min_features' in param :
          min_features = param['min_features']
        
        c_stat.loc[:,'nmcor'] = c_stat.loc[:,'mcor'] / c_stat.loc[:,'mcor'].max()
        c_stat.loc[:,'nstdstat'] = c_stat.loc[:,'stdstat'] / c_stat.loc[:,'stdstat'].max()
        
        if 'use_norm_weight' in param and param['use_norm_weight'] != 0:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + float(param['quantile_weight']) * c_stat.loc[:,'nstdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'nmcor'] + 1.0                             * c_stat.loc[:,'nstdstat']
        else:
            if 'quantile_weight' in param:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + float(param['quantile_weight']) * c_stat.loc[:,'stdstat']
            else:
                c_stat.loc[:,'weight'] = c_stat.loc[:,'mcor'] + 0.1                             * c_stat.loc[:,'stdstat']
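        # Cluster ranking weight: mean correlation plus a weighted spread statistic
        # (optionally on max-normalized columns), so tight clusters that still vary a
        # lot across conditions rank highest.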

        c_stat.sort_values('weight', inplace=True, ascending=False)

        pprint(c_stat)
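        # Greedily take clusters in order of decreasing weight (at least one is always
        # taken) until roughly 'min_features' features have been collected; each selected
        # cluster gets a centroid equal to the L2-normalized sum of its members'
        # normalized profiles.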

        centroids = pd.DataFrame()
        for i in range(c_stat.shape[0]):
            fsn = c_stat.index[i]
            fs = cl[fsn]
            if i != 0 and len(afs) + len(fs) > min_features :
                break;
           
            afs.extend(fs)

            c1 = df3.loc[fs,].sum(axis=0)
            c1 = c1 / np.sqrt(c1.pow(2).sum())
            if(centroids.shape[0] < 1):
              centroids = c1.to_frame(fsn).T
            else:
              centroids.loc[fsn] = c1
           
        pprint(centroids)
        
        if len(cl.keys()) == 0:
            raise Exception("No feature ids were mapped to dataset or no clusters were selected")
        
        # dataset centroid
        dc = df3.loc[afs,].sum(axis=0)
        dc = dc / np.sqrt(dc.pow(2).sum())
    
        
        self.logger.info("Ordering Centroids and Data")
        # the cluster centroid least similar (lowest cosine similarity) to the dataset centroid
        fc = (centroids * dc).sum(axis=1).idxmin()
        # the centroid least similar to fc
        ffc = (centroids * centroids.loc[fc,]).sum(axis=1).idxmin()
        
        # major direction to order on unit ball space
        md = centroids.loc[ffc,] - centroids.loc[fc,]
        
        # unnormalized projection onto the major direction (the norm of md is omitted since it is the same for every row)
        corder = (centroids * md).sum(axis=1).sort_values() # cluster order
        coidx = corder.index
        
        dorder =(df3.loc[afs,] * md).sum(axis=1).sort_values() # data order
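        # corder/dorder give a 1-D ordering of clusters and features along the major
        # direction, so profiles that behave similarly end up adjacent in the heatmap.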
        
        # build the heatmap figure properties and assemble the data, cluster by cluster
        fig_properties = {"xlabel" : "Conditions", "ylabel" : "Features", "xlog_mode" : "none", "ylog_mode" : "none", "title" : "Log Fold Changes", "plot_type" : "heatmap", 'ygroup': []}
        fig_properties['ygtick_labels'] = coidx.tolist()

        if 'fold_change' in param and param['fold_change'] == 1:
            frange = 2
            if 'fold_change_range' in param:
                frange = float(param['fold_change_range'])
            final=fc_df.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = fc_df.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)

            if 'fold_cutoff' in param and param['fold_cutoff'] == 1:
                final[final > frange] = frange
                final[final < - frange] = - frange
            else:
                fc_df0b = final.sub(final.min(axis=1), axis=0)
                final = (fc_df0b.div(fc_df0b.max(axis=1), axis=0) - 0.5) * 2 * frange
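            # (with 'fold_cutoff' the values are clipped at +/-frange; otherwise each row
            # is min-max rescaled into [-frange, +frange])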
        else:
            final=df2.loc[dorder.loc[cl[coidx[0]],].index,]
            fig_properties['ygroup'].append(final.shape[0])
            
            for i in range(1,len(coidx)):
                tf = df2.loc[dorder.loc[cl[coidx[i]],].index,]
                fig_properties['ygroup'].append(tf.shape[0])
                final = final.append(tf)
        
        ## build the FloatDataTable (FDT) payload for the heatmap
        fdt = {'row_labels' : [], 'column_labels' : [], "data" : [[]]}
        #fdt = OrderedDict(fdt)
        # NaN to None
        final = final.where(pd.notnull(final),None)
        fdt['data'] = final.T.as_matrix().tolist() # transpose so rows correspond to conditions
        fdt['row_labels'] = final.columns.tolist()
        fdt['column_labels'] = final.index.tolist()
        # TODO: Add group label later
        fdt['id'] = "{0}.fdt".format(param['out_figure_object_name'])
 
        self.logger.info("Saving the results")
        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : fdt,
                                                                              'hidden':1, 
                                                                              'name' : "{0}.fdt".format(param['out_figure_object_name'])}]})

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              #'hidden':1, 
                                                                              'name' : "{0}".format(param['out_figure_object_name'])}]})
                                                                              #'name' : "{0}.fp".format(param['out_figure_object_name'])}]})

        #mchp = {}
        #mchp['figure_obj'] = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        #sstatus = ws.save_objects({'workspace' : workspace_name, 'objects' : [{'type' : 'CoExpression.MulticlusterHeatmapPlot',
        #                                                                      'data' : mchp,
        #                                                                      'name' : (param['out_figure_object_name'])}]})

        result = fig_properties
        #END view_heatmap

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method view_heatmap return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
Example #31
def main():
    script_details = script_utils.parse_docs(transform.__doc__)

    parser = argparse.ArgumentParser(prog=__file__,
                                     description=script_details["Description"],
                                     epilog=script_details["Authors"])

    parser.add_argument('--workspace_service_url',
                        help=script_details["Args"]["workspace_service_url"],
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument('--workspace_name',
                        help=script_details["Args"]["workspace_name"],
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument("--object_name", 
                        help=script_details["Args"]["object_name"], 
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument('--output_file_name',
                        help=script_details["Args"]["output_file_name"],
                        action='store', type=str, nargs='?', default=None,
                        required=False)
    parser.add_argument('--input_directory',
                        help=script_details["Args"]["input_directory"],
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument("--working_directory", 
                        help=script_details["Args"]["working_directory"], 
                        action='store', type=str, nargs='?', required=True)
    parser.add_argument('--input_mapping',
                        help=script_details["Args"]["input_mapping"],
                        action='store', type=unicode, nargs='?', default=None,
                        required=False)

    # custom arguments specific to this uploader
    parser.add_argument('--format_type',
                        help=script_details["Args"]["format_type"],
                        action='store', type=str, required=False)
    parser.add_argument('--genome_object_name',
                        help=script_details["Args"]["genome_object_name"],
                        action='store', type=str, required=False)
    parser.add_argument('--fill_missing_values',
                        help=script_details["Args"]["fill_missing_values"],
                        action='store', type=int, required=False)
    parser.add_argument('--data_type',
                        help=script_details["Args"]["data_type"],
                        action='store', type=str, required=False)
    parser.add_argument('--data_scale',
                        help=script_details["Args"]["data_scale"],
                        action='store', type=str, required=False)

    args, unknown = parser.parse_known_args()

    logger = script_utils.stderrlogger(__file__)

    logger.debug(args)
    try:
        transform(workspace_service_url=args.workspace_service_url,
                  workspace_name=args.workspace_name,
                  object_name=args.object_name,
                  output_file_name=args.output_file_name,
                  input_directory=args.input_directory,
                  working_directory=args.working_directory,
                  input_mapping=args.input_mapping,
                  format_type=args.format_type,
                  genome_object_name=args.genome_object_name,
                  fill_missing_values=args.fill_missing_values,
                  data_type=args.data_type,
                  data_scale=args.data_scale,
                  logger=logger)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)
def convert(shock_service_url, handle_service_url, input_directory,
            object_name, mean_insert=None, std_dev=None, interleaved=None,
            read_orientation=None, level=logging.INFO, logger=None):
    """
    Converts FASTQ file to KBaseAssembly.PairedEndLibrary json string.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        input_directory: Where the FASTQ file can be found.
        object_name: A name to use when storing the JSON string.
        mean_insert: The average insert size.
        std_dev: standard deviation of the inserts
        interleaved: Are the reads interleaved?
        read_orientation: Do the reads have an outward orientation?
        level: Logging level, defaults to logging.INFO.
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of FASTQ to KBaseAssembly.PairedEndLibrary.")

    token = os.environ.get('KB_AUTH_TOKEN')

    # scan the directory for files
    logger.info("Scanning for FASTQ files.")
    
    valid_extensions = [".fq",".fastq",".fnq"]
    
    files = os.listdir(input_directory)
    fastq_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]
            
    assert len(fastq_files) != 0
    
    # put the files in shock, get handles
    shock_ids = list()
    for x in fastq_files:
        shock_info = script_utils.upload_file_to_shock(logger, shock_service_url, os.path.join(input_directory, x), token=token)
        shock_ids.append(shock_info["id"])
    
    logger.info("Gathering information.")
    handles = script_utils.getHandles(logger, shock_service_url, handle_service_url, shock_ids, None, token)  # no pre-existing handle ids
    
    assert len(handles) != 0

    # fill out the object details
    resultObject = dict()
    resultObject["handle_1"] = handles[0]
    
    if len(handles) == 2:
        resultObject["handle_2"] = handles[1]

    if mean_insert is not None:
        resultObject["insert_size_mean"] = mean_insert

    if std_dev is not None:
        resultObject["insert_size_std_dev"] = std_dev

    if interleaved:
        resultObject["interleaved"] = 1

    if read_orientation:
        resultObject["read_orientation_outward"] = 1

    objectString = json.dumps(resultObject, sort_keys=True, indent=4)
    
    logger.info("Writing out JSON.")
    # object_name is documented as "a name to use when storing the JSON string"
    with open(object_name, "w") as outFile:
        outFile.write(objectString)
    
    logger.info("Conversion completed.")
def transform(
    shock_service_url=None,
    handle_service_url=None,
    output_file_name=None,
    input_directory=None,
    working_directory=None,
    level=logging.INFO,
    logger=None,
):
    """
    Converts a FASTQ file to a KBaseAssembly.SingleEndLibrary json string.  

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be stored.  
        input_directory: The directory containing the file.
        working_directory: The directory the resulting json file will be written to.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.

    Authors:
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Scanning for FASTQ files.")

    valid_extensions = [".fq", ".fastq", ".fnq"]

    files = os.listdir(input_directory)
    fastq_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]

    assert len(fastq_files) != 0

    logger.info("Found {0}".format(str(fastq_files)))

    input_file_name = fastq_files[0]

    if len(fastq_files) > 1:
        logger.warning("Not sure how to handle multiple FASTQ files in this context. Using {0}".format(input_file_name))

    kb_token = os.environ.get("KB_AUTH_TOKEN")

    shock_info = script_utils.upload_file_to_shock(
        logger=logger,
        shock_service_url=shock_service_url,
        filePath=os.path.join(input_directory, input_file_name),
        token=kb_token,
    )

    # assumes the positional getHandles signature used elsewhere in this module:
    # (logger, shock_url, handle_url, shock_ids, handle_ids, token)
    handles = script_utils.getHandles(logger, shock_service_url, handle_service_url,
                                      [shock_info["id"]], None, kb_token)

    assert len(handles) != 0

    objectString = simplejson.dumps({"handle": handles[0]}, sort_keys=True, indent=4)

    if output_file_name is None:
        output_file_name = input_file_name

    with open(os.path.join(working_directory, output_file_name), "w") as f:
        f.write(objectString)
Example #34
def transform(workspace_service_url=None,
              workspace_name=None,
              object_name=None,
              version=None,
              working_directory=None,
              output_file_name=None,
              level=logging.INFO,
              logger=None):
    """
    Converts KBaseEnigmaMetals.SamplePropertyMatrix to TSV-formatted file.
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        workspace_name: Name of the workspace
        object_name: Name of the object in the workspace 
        version: Version number of workspace object, defaults to most recent version
        working_directory: The working directory where the output file should be stored.
        output_file_name: The desired file name of the result file.
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        TSV-formatted file containing data from SamplePropertyMatrix object.
    
    Authors:
        Roman Sutormin, Alexey Kazakov
    
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseEnigmaMetals.SamplePropertyMatrix to TSV.SampleProperty"
    )
    token = os.environ.get("KB_AUTH_TOKEN")

    if not working_directory or not os.path.isdir(working_directory):
        raise Exception("The working directory {0} does not exist".format(
            working_directory))

    logger.info("Grabbing Data.")

    classpath = [
        "$KB_TOP/lib/jars/kbase/transform/kbase_transform_deps.jar",
        "$KB_TOP/lib/jars/apache_commons/commons-cli-1.2.jar",
        "$KB_TOP/lib/jars/ini4j/ini4j-0.5.2.jar",
        "$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar",
        "$KB_TOP/lib/jars/jetty/jetty-all-7.0.0.jar",
        "$KB_TOP/lib/jars/jna/jna-3.4.0.jar",
        "$KB_TOP/lib/jars/kbase/auth/kbase-auth-0.3.1.jar",
        "$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.10.jar",
        "$KB_TOP/lib/jars/servlet/servlet-api-2.5.jar",
        "$KB_TOP/lib/jars/syslog4j/syslog4j-0.9.46.jar",
        "$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"
    ]

    mc = "us.kbase.kbaseenigmametals.SamplePropertyMatrixDownloader"

    argslist = [
        "--workspace_service_url {0}".format(workspace_service_url),
        "--workspace_name {0}".format(workspace_name),
        "--object_name {0}".format(object_name),
        "--working_directory {0}".format(working_directory)
    ]

    if output_file_name:
        argslist.append("--output_file_name {0}".format(output_file_name))

    if version:
        argslist.append("--version {0}".format(version))

    arguments = [
        "java", "-classpath", ":".join(classpath), mc, " ".join(argslist)
    ]

    logger.debug(arguments)

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    tool_process = subprocess.Popen(" ".join(arguments),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.error(
            "Transformation from KBaseEnigmaMetals.SamplePropertyMatrix to TSV.SampleProperty failed"
        )
        logger.error(stderr)
        sys.exit(1)

    logger.info("Conversion completed.")
def transform(workspace_service_url=None, workspace_name=None,
              object_name=None, output_file_name=None, input_directory=None, 
              working_directory=None, has_replicates=None, input_mapping=None, format_type=None, 
              level=logging.INFO, logger=None):
    """
    Converts SampleProperty TSV file to json string of KBaseEnigmaMetals.SamplePropertyMatrix type.

    Args:
        workspace_service_url: URL for a KBase Workspace service where KBase objects
                               are stored.
        workspace_name: The name of the destination workspace.
        object_name: The destination object name.
        output_file_name: A file name where the output JSON string should be stored.
                          If the output file name is not specified the name will
                          default to the name of the input file appended with
                          '_output.json'.
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be
                           written to.
        has_replicates: 0 if the input file contains marked series of replicates,
                        1 if the input file contains non-marked series of replicates,
                        2 if the input file contains no replicates.
        input_mapping: JSON string mapping of input files to expected types.
                       If you don't get this you need to scan the input
                       directory and look for your files.
        format_type: Manually defined type of TSV file format.

    Returns:
        JSON files on disk that can be saved as a KBase workspace objects.

    Authors:
        Roman Sutormin, Alexey Kazakov
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of SampleProperty TSV to KBaseEnigmaMetals.SamplePropertyMatrix")
    # token = os.environ.get('KB_AUTH_TOKEN')

    if not working_directory or not os.path.isdir(working_directory):
        raise Exception("The working directory {0} is not a valid directory!"
                        .format(working_directory))

    classpath = ["$KB_TOP/lib/jars/kbase/transform/kbase_transform_deps.jar",
                 "$KB_TOP/lib/jars/apache_commons/commons-cli-1.2.jar",
                 "$KB_TOP/lib/jars/apache_commons/commons-lang3-3.1.jar",
                 "$KB_TOP/lib/jars/ini4j/ini4j-0.5.2.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar",
                 "$KB_TOP/lib/jars/jetty/jetty-all-7.0.0.jar",
                 "$KB_TOP/lib/jars/jna/jna-3.4.0.jar",
                 "$KB_TOP/lib/jars/kbase/auth/kbase-auth-0.3.1.jar",
                 "$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.10.jar",
                 "$KB_TOP/lib/jars/servlet/servlet-api-2.5.jar",
                 "$KB_TOP/lib/jars/syslog4j/syslog4j-0.9.46.jar",
                 "$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"]
    
    mc = "us.kbase.kbaseenigmametals.SamplePropertyMatrixUploader"

    argslist = ["--workspace_service_url {0}".format(workspace_service_url),
                "--workspace_name {0}".format(workspace_name),
                "--object_name {0}".format(object_name),
                "--input_directory {0}".format(input_directory),
                "--has_replicates {0}".format(has_replicates),
                "--working_directory {0}".format(working_directory)]
    if output_file_name:
        argslist.append("--output_file_name {0}".format(output_file_name))
    if input_mapping:
        argslist.append("--input_mapping {0}".format(input_mapping))
    argslist.append("--format_type {0}".format(format_type))

    arguments = ["java", "-classpath", ":".join(classpath), mc, " ".join(argslist)]

    logger.info(arguments)

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    tool_process = subprocess.Popen(" ".join(arguments), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.error(stderr)
    if tool_process.returncode:
        logger.error("Transformation from TSV.SampleProperty to KBaseEnigmaMetals.SamplePropertyMatrix failed on {0}".format(input_directory))
        sys.exit(1)

    logger.info("Conversion completed.")
def main():
    """
    KBase Convert task manager for converting between KBase objects.
    
    Step 1 - Run a converter to pull the source object and save the destination object.
    
    Args:
        workspace_service_url: URL for a KBase Workspace service where KBase objects 
                               are stored.
        ujs_service_url: URL for a User and Job State service to report task progress
                         back to the user.
        shock_service_url: URL for a KBase SHOCK data store service for storing files 
                           and large reference data.
        handle_service_url: URL for a KBase Handle service that maps permissions from 
                            the Workspace to SHOCK for KBase types that specify a Handle 
                            reference instead of a SHOCK reference.
        source_workspace_name: The name of the source workspace.
        destination_workspace_name: The name of the destination workspace.
        source_object_name: The source object name.
        destination_object_name: The destination object name.
        source_kbase_type: The KBase Workspace type string that indicates the module
                           and type of the object being created.                       
        destination_kbase_type: The KBase Workspace type string that indicates the module
                                and type of the object being created.
        optional_arguments: This is a JSON string containing optional parameters that can
                            be passed in for custom behavior per conversion.
        ujs_job_id: The job id from the User and Job State service that can be used to
                    report status on task progress back to the user.
        job_details: This is a JSON string that passes in the script specific command
                     line options for a given conversion type.  The service pulls
                     these config settings from a script config created by the developer
                     of the conversion script and passes that into the AWE job that
                     calls this script.
        working_directory: The working directory on disk where files can be created and
                           will be cleaned when the job ends with success or failure.
        keep_working_directory: A flag to tell the script not to delete the working
                                directory, which is mainly for debugging purposes.
        debug: Run the taskrunner in debug mode for local execution in a virtualenv.
    
    Returns:
        Literal return value is 0 for success and 1 for failure.
        
        Actual data output is one or more Workspace objects saved to a user's workspace. 
        
    Authors:
        Matt Henderson, Gavin Price            
    """

    logger = script_utils.stderrlogger(__file__, level=logging.DEBUG)
    logger.info("Executing KBase Convert tasks")

    script_details = script_utils.parse_docs(main.__doc__)

    logger.debug(script_details["Args"])

    parser = script_utils.ArgumentParser(
        description=script_details["Description"],
        epilog=script_details["Authors"])
    # provided by service config
    parser.add_argument('--workspace_service_url',
                        help=script_details["Args"]["workspace_service_url"],
                        action='store',
                        required=True)
    parser.add_argument('--ujs_service_url',
                        help=script_details["Args"]["ujs_service_url"],
                        action='store',
                        required=True)

    # optional because not all KBase Workspace types contain a SHOCK or Handle reference
    parser.add_argument('--shock_service_url',
                        help=script_details["Args"]["shock_service_url"],
                        action='store',
                        default=None)
    parser.add_argument('--handle_service_url',
                        help=script_details["Args"]["handle_service_url"],
                        action='store',
                        default=None)

    # workspace info for pulling the data
    parser.add_argument('--source_workspace_name',
                        help=script_details["Args"]["source_workspace_name"],
                        action='store',
                        required=True)
    parser.add_argument('--source_object_name',
                        help=script_details["Args"]["source_object_name"],
                        action='store',
                        required=True)

    # workspace info for saving the data
    parser.add_argument(
        '--destination_workspace_name',
        help=script_details["Args"]["destination_workspace_name"],
        action='store',
        required=True)
    parser.add_argument('--destination_object_name',
                        help=script_details["Args"]["destination_object_name"],
                        action='store',
                        required=True)

    # the types that we are transforming between, currently assumed one to one
    parser.add_argument('--source_kbase_type',
                        help=script_details["Args"]["source_kbase_type"],
                        action='store',
                        required=True)
    parser.add_argument('--destination_kbase_type',
                        help=script_details["Args"]["destination_kbase_type"],
                        action='store',
                        required=True)

    # any user options provided, encoded as a JSON string
    parser.add_argument('--optional_arguments',
                        help=script_details["Args"]["optional_arguments"],
                        action='store',
                        default='{}')

    # used when restarting a previously executed job
    parser.add_argument('--ujs_job_id',
                        help=script_details["Args"]["ujs_job_id"],
                        action='store',
                        default=None,
                        required=False)

    # config information for running the validate and transform scripts
    parser.add_argument('--job_details',
                        help=script_details["Args"]["job_details"],
                        action='store',
                        default=None)

    # the working directory is where all the files for this job will be written,
    # and normal operation cleans it after the job ends (success or fail)
    parser.add_argument('--working_directory',
                        help=script_details["Args"]["working_directory"],
                        action='store',
                        default=None,
                        required=True)
    parser.add_argument('--keep_working_directory',
                        help=script_details["Args"]["keep_working_directory"],
                        action='store_true')

    # turn on debugging options for script developers running locally
    parser.add_argument('--debug',
                        help=script_details["Args"]["debug"],
                        action='store_true')

    args = None
    try:
        args = parser.parse_args()
    except Exception, e:
        logger.debug("Caught exception parsing arguments!")
        logger.exception(e)
        sys.exit(1)
Example #37
def transform(workspace_service_url=None, workspace_name=None,
              object_name=None, output_file_name=None, input_directory=None, 
              working_directory=None, input_mapping=None, format_type=None, 
              genome_object_name=None, fill_missing_values=None, data_type=None, 
              data_scale=None, level=logging.INFO, logger=None):
    """
    Converts Expression TSV file to json string of KBaseFeatureValues.ExpressionMatrix type.

    Args:
        workspace_service_url: URL for a KBase Workspace service where KBase objects
                               are stored.
        workspace_name: The name of the destination workspace.
        object_name: The destination object name.
        output_file_name: A file name where the output JSON string should be stored.
                          If the output file name is not specified the name will
                          default to the name of the input file appended with
                          '_output.json'.
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be
                           written to.
        input_mapping: JSON string mapping of input files to expected types.
                       If you don't get this you need to scan the input
                       directory and look for your files.
        format_type: Manually defined type of TSV file format.
        genome_object_name: Optional reference to a Genome object that will be used
                            for mapping feature IDs.
        fill_missing_values: Flag for filling in missing values in matrix (0-false, 1-true).
        data_type: Data type (default value is 'log-ratio').
        data_scale: Data scale (default value is '1.0').

    Returns:
        JSON files on disk that can be saved as a KBase workspace objects.

    Authors:
        Roman Sutormin
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of Expression TSV to KBaseFeatureValues.ExpressionMatrix")
    # token = os.environ.get('KB_AUTH_TOKEN')

    if not working_directory or not os.path.isdir(working_directory):
        raise Exception("The working directory {0} is not a valid directory!"
                        .format(working_directory))

    classpath = ["$KB_TOP/lib/jars/kbase/feature_values/kbase-feature-values-0.8.jar",
                 "$KB_TOP/lib/jars/kohsuke/args4j-2.0.21.jar",
                 "$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.10.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar",
                 "$KB_TOP/lib/jars/kbase/auth/kbase-auth-1398468950-3552bb2.jar",
                 "$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"]
    
    mc = "us.kbase.kbasefeaturevalues.transform.ExpressionUploader"

    argslist = ["--workspace_service_url {0}".format(workspace_service_url),
                "--workspace_name {0}".format(workspace_name),
                "--object_name {0}".format(object_name),
                "--input_directory {0}".format(input_directory),
                "--working_directory {0}".format(working_directory)]
    if output_file_name:
        argslist.append("--output_file_name {0}".format(output_file_name))
    if input_mapping:
        argslist.append("--input_mapping {0}".format(input_mapping))
    if format_type:
        argslist.append("--format_type {0}".format(format_type))
    if genome_object_name:
        argslist.append("--genome_object_name {0}".format(genome_object_name))
    if fill_missing_values:
        argslist.append("--fill_missing_values")
    if data_type:
        argslist.append("--data_type {0}".format(data_type))
    if data_scale:
        argslist.append("--data_scale {0}".format(data_scale))

    arguments = ["java", "-classpath", ":".join(classpath), mc, " ".join(argslist)]

    logger.debug(arguments)

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    tool_process = subprocess.Popen(" ".join(arguments), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.error("Transformation from TSV.Expression to KBaseFeatureValues.ExpressionMatrix failed on {0}".format(input_directory))
        logger.error(stderr)
        sys.exit(1)

    logger.info("Conversion completed.")
def validate(input_directory, working_directory, level=logging.INFO, logger=None):
    """
    Validates any file containing sequence data.

    Args:
        input_directory: A directory containing one or more SequenceRead files.
        working_directory: A directory where any output files produced by validation can be written.
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Currently writes to stderr with a Java Exception trace on error, otherwise no output.
    
    Authors:
        Srividya Ramikrishnan, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    fasta_extensions = [".fa",".fas",".fasta",".fna"]
    fastq_extensions = [".fq",".fastq",".fnq"]
        
    extensions = fasta_extensions + fastq_extensions

    checked = False
    validated = True
    for input_file_name in os.listdir(input_directory):
        logger.info("Checking for SequenceReads file : {0}".format(input_file_name))

        filePath = os.path.join(os.path.abspath(input_directory), input_file_name)
        
        if not os.path.isfile(filePath):
            logger.warning("Skipping directory {0}".format(input_file_name))
            continue
        elif os.path.splitext(input_file_name)[-1] not in extensions:
            logger.warning("Unrecognized file type, skipping.")
            continue
                
        logger.info("Starting SequenceReads validation of {0}".format(input_file_name))
        
        if os.path.splitext(input_file_name)[-1] in fasta_extensions:
            # TODO This needs to be changed, this is really just a demo program for this library and not a serious tool
            java_classpath = os.path.join(os.environ.get("KB_TOP"), "lib/jars/FastaValidator/FastaValidator-1.0.jar")
            arguments = ["java", "-classpath", java_classpath, "FVTester", filePath]

        elif os.path.splitext(input_file_name)[-1] in fastq_extensions:
            line_count = int(subprocess.check_output(["wc", "-l", filePath]).split()[0])
            
            if line_count % 4 > 0:
                # remove empty lines; the SRA toolbox appends a trailing newline
                cmd_list = ["sed","-i", r"/^$/d",filePath]
                filtering = subprocess.Popen(cmd_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                stdout, stderr = filtering.communicate()
                if filtering.returncode != 0:
                    raise Exception("sed execution failed for the file {0}".format(filePath))
                
            if (check_interleavedPE(filePath) == 1):
                arguments = ["fastQValidator", "--file", filePath, "--maxErrors", "10", "--disableSeqIDCheck"]      
            else :
                arguments = ["fastQValidator", "--file", filePath, "--maxErrors", "10"] 

        tool_process = subprocess.Popen(arguments, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
    
        if tool_process.returncode != 0:
            logger.error("Validation failed on {0}".format(input_file_name))
            validated = False
            break
        else:
            logger.info("Validation passed on {0}".format(input_file_name))
            checked = True
        
    if not validated:
        raise Exception("Validation failed!")
    elif not checked:
        raise Exception("No files were found that had a valid fasta or fastq extension.")
    else:
        logger.info("Validation passed.")
def upload_assembly(shock_service_url = None, 
                    handle_service_url = None,
                    input_directory = None,
#                    shock_id = None,
#                  handle_id = None,
                    input_mapping = None,
                    workspace_name = None, 
                    workspace_service_url = None, 
                    taxon_reference = None, 
                    assembly_name = None, 
                    source = None, 
                    date_string = None,
                    contig_information_dict = None,
                    logger = None):

    """
    Uploads CondensedGenomeAssembly
    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle service.
        shock_id: If the shock id exists use same file (NEEDS TO BE UPDATED TO HANDLE ID)
        input_mapping: Optional JSON mapping of input files to expected types; only one file is expected here.
        workspace_name: Name of ws to load into
        workspace_service_url: URL of WS server instance the WS is on.
        taxon_reference: The ws reference the assembly points to.  (Optional)
        assembly_name: Name of the assembly object to be created. (Optional) (defaults to file_name)
        source: The source of the data (Ex: Refseq)
        date_string: Date (or date range) associated with data. (Optional)
        contig_information_dict: A mapping that has is_circular and description information (Optional)
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.
    Authors:
        Jason Baumohl, Matt Henderson
    """
    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of FASTA to Assembly object")
    token = os.environ.get('KB_AUTH_TOKEN')
 
    if input_mapping is None: 
        logger.info("Scanning for FASTA files.")
 
        valid_extensions = [".fa",".fasta",".fna",".fas"] 
 
#        files = os.listdir(input_directory)
        files = os.listdir(os.path.abspath(input_directory))
        fasta_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]
 
        if (len(fasta_files) == 0):
            raise Exception("No FASTA file with one of the extensions .fa, .fasta, .fas or .fna was found in the input directory")
 
 
        logger.info("Found {0}".format(str(fasta_files))) 
 
        fasta_file_name = os.path.join(input_directory,fasta_files[0]) 
 
        if len(fasta_files) > 1: 
            logger.warning("Not sure how to handle multiple FASTA files in this context. Using {0}".format(fasta_file_name)) 
    else: 
        logger.info("Input Mapping not none : " + str(input_mapping))
        fasta_file_name = os.path.join(os.path.join(input_directory, "FASTA.DNA.Assembly"), simplejson.loads(input_mapping)["FASTA.DNA.Assembly"]) 
 
    logger.info("Building Object.") 
 
    if not os.path.isfile(fasta_file_name): 
        raise Exception("The fasta file name {0} is not a file!".format(fasta_file_name)) 
                    
    if not os.path.isdir(input_directory): 
        raise Exception("The input directory {0} is not a valid directory!".format(input_directory)) 

    ws_client = biokbase.workspace.client.Workspace(workspace_service_url)
 
    workspace_object = ws_client.get_workspace_info({'workspace':workspace_name}) 

    workspace_id = workspace_object[0] 
    workspace_name = workspace_object[1] 
    
    print "FASTA FILE Name :"+ fasta_file_name + ":"

    if assembly_name is None:
        base = os.path.basename(fasta_file_name) 
        assembly_name = "{0}_assembly".format(os.path.splitext(base)[0])


    ##########################################
    #ASSEMBLY CREATION PORTION  - consume Fasta File
    ##########################################

    logger.info("Starting conversion of FASTA to Assemblies")
    logger.info("Building Assembly Object.")

    input_file_handle = TextFileDecoder.open_textdecoder(fasta_file_name, 'ISO-8859-1')    
    fasta_header = None
    fasta_description = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    sequence_exists = False
    
    total_length = 0
    gc_length = 0
    #Note added X and x due to kb|g.1886.fasta
    valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNnXx"
    amino_acid_specific_characters = "PpLlIiFfQqEe" 

    #Base_counts - is dict of base characters and their counts.
    base_counts = dict()

    sequence_start = 0
    sequence_stop = 0
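    # sequence_start / sequence_stop track byte offsets (via file.tell()) of each
    # contig's sequence within the FASTA file; they are stored per contig as
    # 'start_position' and 'num_bytes' so the sequence can be located later without
    # re-reading the whole file.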

    current_line = input_file_handle.readline()
    while current_line != None and len(current_line) > 0:
#        print "CURRENT LINE: " + current_line
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_exists) and first_header_found:
                logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
                raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
            if not first_header_found:
                first_header_found = True
                sequence_start = 0
            else:
                sequence_stop = input_file_handle.tell() - len(current_line)
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence :
                    logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
                    raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
#                for character in total_sequence:
#                    if character not in valid_chars:
#                        if character in amino_acid_specific_characters:
#                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
#                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
                seq_count = collections.Counter(total_sequence.upper())
                seq_dict = dict(seq_count)
                for character in seq_dict:
                    if character in base_counts:
                        base_counts[character] =  base_counts[character] + seq_dict[character]
                    else:
                        base_counts[character] =  seq_dict[character]
                    if character not in valid_chars:
                        if character in amino_acid_specific_characters:
                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

                contig_dict = dict() 
                Ncount = 0
                if "N" in seq_dict:
                    Ncount = seq_dict["N"]
                contig_dict["Ncount"] = Ncount 
                length = len(total_sequence)
                total_length = total_length + length
                contig_gc_length = len(re.findall('G|g|C|c',total_sequence))
                contig_dict["gc_content"] = float(contig_gc_length)/float(length) 
                gc_length = gc_length + contig_gc_length
                fasta_key = fasta_header.strip()
                contig_dict["contig_id"] = fasta_key 
                contig_dict["length"] = length 
                contig_dict["name"] = fasta_key 
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest() 
                contig_dict["md5"] = contig_md5 
                contig_set_md5_list.append(contig_md5)

                contig_dict["is_circular"] = "Unknown"
                if fasta_description is not None: 
                    contig_dict["description"] = fasta_description
                if contig_information_dict is not None:
                    if contig_information_dict[fasta_key] is not None:
                        if contig_information_dict[fasta_key]["definition"] is not None:
                            contig_dict["description"] = contig_information_dict[fasta_key]["definition"]
                        if contig_information_dict[fasta_key]["is_circular"] is not None:
                            contig_dict["is_circular"] = contig_information_dict[fasta_key]["is_circular"]
                contig_dict["start_position"] = sequence_start
                contig_dict["num_bytes"] = sequence_stop - sequence_start

#                print "Sequence Start: " + str(sequence_start) + "Fasta: " + fasta_key
#                print "Sequence Stop: " + str(sequence_stop) + "Fasta: " + fasta_key

                if fasta_key in fasta_dict:
                    raise Exception("The fasta header {0} appears more than once in the file ".format(fasta_key))
                else: 
                    fasta_dict[fasta_key] = contig_dict
               
                # get set up for next fasta sequence
                sequence_list = []
                sequence_exists = False
                
#               sequence_start = input_file_handle.tell()               
            sequence_start = 0            

            fasta_header_line = current_line.strip().replace('>','')
            try:
                fasta_header , fasta_description = fasta_header_line.split(' ',1)
            except ValueError:
                fasta_header = fasta_header_line
                fasta_description = None
        else:
            if sequence_start == 0:
                sequence_start = input_file_handle.tell() - len(current_line) 
            sequence_list.append(current_line)
            sequence_exists = True
        current_line = input_file_handle.readline()
#        print "ENDING CURRENT LINE: " + current_line

    # wrap up last fasta sequence
    if (not sequence_exists) and first_header_found: 
        logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
        raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
    elif not first_header_found :
        logger.error("There are no contigs in this file") 
        raise Exception("There are no contigs in this file") 
    else: 
        sequence_stop = input_file_handle.tell()
        # build up sequence and remove all white space      
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence :
            logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
            raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 

#        for character in total_sequence: 
        seq_count = collections.Counter(total_sequence.upper()) 
        seq_dict = dict(seq_count) 
        for character in seq_dict:
            if character in base_counts:
                base_counts[character] =  base_counts[character] + seq_dict[character]
            else:
                base_counts[character] =  seq_dict[character]
            if character not in valid_chars: 
                if character in amino_acid_specific_characters:
                    raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

        contig_dict = dict() 
        Ncount = 0
        if "N" in seq_dict:
            Ncount = seq_dict["N"]
        contig_dict["Ncount"] = Ncount 
        length = len(total_sequence)
        total_length = total_length + length
        contig_gc_length = len(re.findall('G|g|C|c',total_sequence))
        contig_dict["gc_content"] = float(contig_gc_length)/float(length) 
        gc_length = gc_length + contig_gc_length
        fasta_key = fasta_header.strip()
        contig_dict["contig_id"] = fasta_key 
        contig_dict["length"] = length
        contig_dict["name"] = fasta_key

        contig_dict["is_circular"] = "Unknown"
        if fasta_description is not None:
            contig_dict["description"] = fasta_description
        if contig_information_dict is not None: 
            if contig_information_dict[fasta_key] is not None:
                if contig_information_dict[fasta_key]["definition"] is not None:
                    contig_dict["description"] = contig_information_dict[fasta_key]["definition"]
                if contig_information_dict[fasta_key]["is_circular"] is not None:
                    contig_dict["is_circular"] = contig_information_dict[fasta_key]["is_circular"]
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"]= contig_md5
        contig_set_md5_list.append(contig_md5)
        contig_dict["start_position"] = sequence_start
        contig_dict["num_bytes"] = sequence_stop - sequence_start
        
        if fasta_key in fasta_dict:
            raise Exception("The fasta header {0} appears more than once in the file ".format(fasta_key))
        else: 
            fasta_dict[fasta_key] = contig_dict
        input_file_handle.close()

    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["assembly_id"] = assembly_name
    contig_set_dict["name"] = assembly_name
    contig_set_dict["external_source"] = source
    contig_set_dict["external_source_id"] = os.path.basename(fasta_file_name) 
#    contig_set_dict["external_source_origination_date"] = str(os.stat(fasta_file_name).st_ctime)

    if date_string is not None:
        contig_set_dict["external_source_origination_date"] = date_string
    contig_set_dict["contigs"] = fasta_dict
    contig_set_dict["dna_size"] = total_length
    contig_set_dict["gc_content"] = float(gc_length)/float(total_length)
#    print "Fasta dict Keys :"+",".join(fasta_dict.keys())+":" 
    contig_set_dict["num_contigs"] = len(fasta_dict.keys())
    contig_set_dict["type"] = "Unknown"
    contig_set_dict["notes"] = "Note MD5s are generated from uppercasing the sequences" 
    contig_set_dict["base_counts"] = base_counts 
    if taxon_reference is not None:
        contig_set_dict["taxon_ref"] = taxon_reference


    shock_id = None
    handle_id = None
    if shock_id is None:
        shock_info = script_utils.upload_file_to_shock(logger, shock_service_url, fasta_file_name, token=token)
        shock_id = shock_info["id"]
        handles = script_utils.getHandles(logger, shock_service_url, handle_service_url, [shock_id], [handle_id], token)   
        handle_id = handles[0]

    contig_set_dict["fasta_handle_ref"] = handle_id

    # For future development if the type is updated to the handle_reference instead of a shock_reference
    assembly_not_saved = True 
    assembly_provenance = [{"script": __file__, "script_ver": "0.1", "description": "Generated from fasta files generated from v5 of the CS."}]
    while assembly_not_saved: 
        try: 
            assembly_info =  ws_client.save_objects({"workspace": workspace_name,"objects":[ 
                {"type":"KBaseGenomeAnnotations.Assembly", 
                 "data":contig_set_dict, 
                 "name": assembly_name, 
                 "provenance":assembly_provenance}]}) 
            assembly_not_saved = False 
        except biokbase.workspace.client.ServerError as err: 
            print "ASSEMBLY SAVE FAILED ON genome " + str(assembly_name) + " ERROR: " + str(err) 
            raise 
        except: 
            print "ASSEMBLY SAVE FAILED ON genome " + str(assembly_name) + " GENERAL_EXCEPTION: " + str(sys.exc_info()[0]) 
            raise 
    
    logger.info("Conversion completed.")
def main():
    """
    KBase Convert task manager for converting between KBase objects.
    
    Step 1 - Run a converter to pull the source object and save the destination object.
    
    Args:
        workspace_service_url: URL for a KBase Workspace service where KBase objects 
                               are stored.
        ujs_service_url: URL for a User and Job State service to report task progress
                         back to the user.
        shock_service_url: URL for a KBase SHOCK data store service for storing files 
                           and large reference data.
        handle_service_url: URL for a KBase Handle service that maps permissions from 
                            the Workspace to SHOCK for KBase types that specify a Handle 
                            reference instead of a SHOCK reference.
        source_workspace_name: The name of the source workspace.
        destination_workspace_name: The name of the destination workspace.
        source_object_name: The source object name.
        destination_object_name: The destination object name.
        source_kbase_type: The KBase Workspace type string that indicates the module
                           and type of the object being converted from.
        destination_kbase_type: The KBase Workspace type string that indicates the module
                                and type of the object being created.
        optional_arguments: This is a JSON string containing optional parameters that can
                            be passed in for custom behavior per conversion.
        ujs_job_id: The job id from the User and Job State service that can be used to
                    report status on task progress back to the user.
        job_details: This is a JSON string that passes in the script specific command
                     line options for a given conversion type.  The service pulls
                     these config settings from a script config created by the developer
                     of the conversion script and passes that into the AWE job that
                     calls this script.
        working_directory: The working directory on disk where files can be created and
                           will be cleaned when the job ends with success or failure.
        keep_working_directory: A flag to tell the script not to delete the working
                                directory, which is mainly for debugging purposes.
    
    Returns:
        Literal return value is 0 for success and 1 for failure.
        
        Actual data output is one or more Workspace objects saved to a user's workspace. 
        
    Authors:
        Matt Henderson, Gavin Price            
    """

    logger = script_utils.stderrlogger(__file__, level=logging.DEBUG)
    logger.info("Executing KBase Convert tasks")
    
    script_details = script_utils.parse_docs(main.__doc__)
    
    logger.debug(script_details["Args"])
    
    parser = script_utils.ArgumentParser(description=script_details["Description"],
                                     epilog=script_details["Authors"])
    # provided by service config
    parser.add_argument('--workspace_service_url', 
                        help=script_details["Args"]["workspace_service_url"], 
                        action='store', 
                        required=True)
    parser.add_argument('--ujs_service_url', 
                        help=script_details["Args"]["ujs_service_url"], 
                        action='store', 
                        required=True)
    
    # optional because not all KBase Workspace types contain a SHOCK or Handle reference
    parser.add_argument('--shock_service_url', 
                        help=script_details["Args"]["shock_service_url"], 
                        action='store', 
                        default=None)
    parser.add_argument('--handle_service_url', 
                        help=script_details["Args"]["handle_service_url"], 
                        action='store', 
                        default=None)

    # workspace info for pulling the data
    parser.add_argument('--source_workspace_name', 
                        help=script_details["Args"]["source_workspace_name"], 
                        action='store', 
                        required=True)
    parser.add_argument('--source_object_name', 
                        help=script_details["Args"]["source_object_name"], 
                        action='store', 
                        required=True)

    # workspace info for saving the data
    parser.add_argument('--destination_workspace_name', 
                        help=script_details["Args"]["destination_workspace_name"], 
                        action='store', 
                        required=True)
    parser.add_argument('--destination_object_name', 
                        help=script_details["Args"]["destination_object_name"], 
                        action='store', 
                        required=True)

    # the types that we are transforming between, currently assumed one to one 
    parser.add_argument('--source_kbase_type', 
                        help=script_details["Args"]["source_kbase_type"], 
                        action='store', 
                        required=True)
    parser.add_argument('--destination_kbase_type', 
                        help=script_details["Args"]["destination_kbase_type"], 
                        action='store', 
                        required=True)

    # any user options provided, encoded as a JSON string
    parser.add_argument('--optional_arguments', 
                        help=script_details["Args"]["optional_arguments"], 
                        action='store', 
                        default='{}')

    # Optional UJS job id used to report task progress back to the user (e.g. when restarting a previously executed job)
    parser.add_argument('--ujs_job_id', 
                        help=script_details["Args"]["ujs_job_id"], 
                        action='store', 
                        default=None, 
                        required=False)

    # config information for running the validate and transform scripts
    parser.add_argument('--job_details', 
                        help=script_details["Args"]["job_details"], 
                        action='store', 
                        default=None)

    # the working directory is where all the files for this job will be written, 
    # and normal operation cleans it after the job ends (success or fail)
    parser.add_argument('--working_directory', 
                        help=script_details["Args"]["working_directory"], 
                        action='store', 
                        default=None, 
                        required=True)
    parser.add_argument('--keep_working_directory', 
                        help=script_details["Args"]["keep_working_directory"], 
                        action='store_true')

    # ignore any extra arguments
    args, unknown = parser.parse_known_args()
            
    kb_token = os.environ.get('KB_AUTH_TOKEN')
    ujs = UserAndJobState(url=args.ujs_service_url, token=kb_token)

    est = datetime.datetime.utcnow() + datetime.timedelta(minutes=3)
    if args.ujs_job_id is not None:
        ujs.update_job_progress(args.ujs_job_id, kb_token, "KBase Data Convert started", 
                                1, est.strftime('%Y-%m-%dT%H:%M:%S+0000'))

    # parse all the json strings from the argument list into dicts
    # TODO had issues with json.loads and unicode strings, workaround was using simplejson and base64
    
    args.optional_arguments = simplejson.loads(base64.urlsafe_b64decode(args.optional_arguments))
    args.job_details = simplejson.loads(base64.urlsafe_b64decode(args.job_details))
    
    if not os.path.exists(args.working_directory):
        os.mkdir(args.working_directory)

    if args.ujs_job_id is not None:
        ujs.update_job_progress(args.ujs_job_id, kb_token, 
                                "Converting from {0} to {1}".format(args.source_kbase_type,args.destination_kbase_type), 
                                1, est.strftime('%Y-%m-%dT%H:%M:%S+0000') )

    # Step 1 : Convert the objects
    try:
        logger.info(args)
    
        convert_args = args.job_details["transform"]
        convert_args["optional_arguments"] = args.optional_arguments
        convert_args["working_directory"] = args.working_directory
        convert_args["workspace_service_url"] = args.workspace_service_url
        convert_args["source_workspace_name"] = args.source_workspace_name
        convert_args["source_object_name"] = args.source_object_name
        convert_args["destination_workspace_name"] = args.destination_workspace_name
        convert_args["destination_object_name"] = args.destination_object_name
        
        logger.info(convert_args)
        
        task_output = handler_utils.run_task(logger, convert_args)
        
        if task_output["stdout"] is not None:
            logger.debug("STDOUT : " + str(task_output["stdout"]))
        
        if task_output["stderr"] is not None:
            logger.debug("STDERR : " + str(task_output["stderr"]))        
    except Exception, e:
        handler_utils.report_exception(logger, 
                         {"message": 'ERROR : Conversion from {0} to {1}'.format(args.source_kbase_type,args.destination_kbase_type),
                          "exc": e,
                          "ujs": ujs,
                          "ujs_job_id": args.ujs_job_id,
                          "token": kb_token,
                         },
                         {"keep_working_directory": args.keep_working_directory,
                          "working_directory": args.working_directory})

        ujs.complete_job(args.ujs_job_id,
                         kb_token,
                         "Convert to {0} failed.".format(
                             args.destination_workspace_name), 
                         str(e),
                         None)
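
As the workaround note in main() says, --optional_arguments and --job_details are expected to arrive as base64-encoded JSON. A minimal sketch of how a caller might prepare those values; the option names match the parser above, while the payload contents are placeholders (aside from the "transform" section, which main() requires in job_details):

import base64
import simplejson

# Placeholder payloads; in production the service builds these from the script configs.
optional_arguments = {"fasta_reference_only": "False"}
job_details = {"transform": {"script_name": "trns_transform_Example_to_Example"}}

encoded_optional_arguments = base64.urlsafe_b64encode(simplejson.dumps(optional_arguments))
encoded_job_details = base64.urlsafe_b64encode(simplejson.dumps(job_details))
# Pass these strings as --optional_arguments and --job_details on the command line.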
def transform(workspace_service_url=None, workspace_name=None, object_name=None,
              version=None, working_directory=None, output_file_name=None, 
              level=logging.INFO, logger=None):  
    """
    Converts KBaseEnigmaMetals.ChromatographyMatrix to TSV-formatted file.
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        workspace_name: Name of the workspace
        object_name: Name of the object in the workspace 
        version: Version number of workspace object, defaults to most recent version
        working_directory: The working directory where the output file should be stored.
        output_file_name: The desired file name of the result file.
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        TSV-formatted file containing data from ChromatographyMatrix object.
    
    Authors:
        Roman Sutormin, Alexey Kazakov
    
    """ 

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of KBaseEnigmaMetals.ChromatographyMatrix to TSV.Chromatography")
    token = os.environ.get("KB_AUTH_TOKEN")
    
    if not working_directory or not os.path.isdir(working_directory):
        raise Exception("The working directory {0} does not exist".format(working_directory)) 

    logger.info("Grabbing Data.")

    classpath = ["$KB_TOP/lib/jars/kbase/transform/kbase_transform_deps.jar",
                 "$KB_TOP/lib/jars/apache_commons/commons-cli-1.2.jar",
                 "$KB_TOP/lib/jars/ini4j/ini4j-0.5.2.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar",
                 "$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar",
                 "$KB_TOP/lib/jars/jetty/jetty-all-7.0.0.jar",
                 "$KB_TOP/lib/jars/jna/jna-3.4.0.jar",
                 "$KB_TOP/lib/jars/kbase/auth/kbase-auth-0.3.1.jar",
                 "$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.10.jar",
                 "$KB_TOP/lib/jars/servlet/servlet-api-2.5.jar",
                 "$KB_TOP/lib/jars/syslog4j/syslog4j-0.9.46.jar",
                 "$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"]
    
    mc = "us.kbase.kbaseenigmametals.ChromatographyMatrixDownloader"

    argslist = ["--workspace_service_url {0}".format(workspace_service_url),
                "--workspace_name {0}".format(workspace_name),
                "--object_name {0}".format(object_name),
                "--working_directory {0}".format(working_directory)]

    if output_file_name:
        argslist.append("--output_file_name {0}".format(output_file_name))

    if version:
        argslist.append("--version {0}".format(version))

    arguments = ["java", "-classpath", ":".join(classpath), mc, " ".join(argslist)]

    logger.debug(arguments)

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    tool_process = subprocess.Popen(" ".join(arguments), stderr=subprocess.PIPE, shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.error("Transformation from KBaseEnigmaMetals.ChromatographyMatrix to TSV.Chromatography failed")
        logger.error(stderr)
        sys.exit(1)
    
    logger.info("Conversion completed.")
Exemple #42
def convert(shock_service_url,
            handle_service_url,
            input_directory,
            object_name,
            mean_insert=None,
            std_dev=None,
            interleaved=False,
            read_orientation=False,
            level=logging.INFO,
            logger=None):
    """
    Converts FASTQ file to KBaseAssembly.PairedEndLibrary json string.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        input_directory: Where the FASTQ file can be found.
        object_name: A name to use when storing the JSON string.
        mean_insert: The average insert size.
        std_dev: standard deviation of the inserts
        interleaved: Are the reads interleaved?
        read_orientation: Do the reads have an outward orientation?
        level: Logging level, defaults to logging.INFO.
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of FASTQ to KBaseAssembly.PairedEndLibrary.")

    token = os.environ.get('KB_AUTH_TOKEN')

    # scan the directory for files
    logger.info("Scanning for FASTQ files.")

    valid_extensions = [".fq", ".fastq", ".fnq"]

    files = os.listdir(input_directory)
    fastq_files = [
        x for x in files if os.path.splitext(x)[-1] in valid_extensions
    ]

    assert len(fastq_files) != 0

    # put the files in shock, get handles
    shock_ids = list()
    for x in fastq_files:
        input_file_name = os.path.join(input_directory, x)
        shock_info = script_utils.upload_file_to_shock(logger,
                                                       shock_service_url,
                                                       input_file_name,
                                                       token=token)
        shock_ids.append(shock_info["id"])

    logger.info("Gathering information.")
    # No pre-existing handle ids are available here, so pass a matching list of Nones
    # (mirroring how the FASTA uploader calls getHandles with a None handle id).
    handles = script_utils.getHandles(logger, shock_service_url,
                                      handle_service_url, shock_ids,
                                      [None] * len(shock_ids), token)

    assert len(handles) != 0

    # fill out the object details
    resultObject = dict()
    resultObject["handle_1"] = handles[0]

    if len(handles) == 2:
        resultObject["handle_2"] = handles[1]

    if mean_insert is not None:
        resultObject["insert_size_mean"] = mean_insert

    if std_dev is not None:
        resultObject["insert_size_std_dev"] = std_dev

    if interleaved:
        resultObject["interleaved"] = 1

    if read_orientation:
        resultObject["read_orientation_outward"] = 1

    objectString = json.dumps(resultObject, sort_keys=True, indent=4)

    logger.info("Writing out JSON.")
    with open(args.output_filename, "w") as outFile:
        outFile.write(objectString)

    logger.info("Conversion completed.")
def upload_assembly(
        shock_service_url=None,
        handle_service_url=None,
        input_directory=None,
        #                    shock_id = None,
        #                  handle_id = None,
        input_mapping=None,
        workspace_name=None,
        workspace_service_url=None,
        taxon_reference=None,
        assembly_name=None,
        source=None,
        date_string=None,
        contig_information_dict=None,
        logger=None):
    """
    Uploads a FASTA file as a KBaseGenomeAnnotations.Assembly object.
    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle service.
        shock_id: If a SHOCK id already exists, reuse that file instead of re-uploading (needs to be updated to use a handle id).
        input_mapping: JSON string mapping of input files to expected types; only a single FASTA file is expected here.
        workspace_name: Name of ws to load into
        workspace_service_url: URL of WS server instance the WS is on.
        taxon_reference: The ws reference the assembly points to.  (Optional)
        assembly_name: Name of the assembly object to be created. (Optional) (defaults to file_name)
        source: The source of the data (Ex: Refseq)
        date_string: Date (or date range) associated with data. (Optional)
        contig_information_dict: A mapping that has is_circular and description information (Optional)
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.
    Authors:
        Jason Baumohl, Matt Henderson
    """
    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of FASTA to Assembly object")
    token = os.environ.get('KB_AUTH_TOKEN')

    if input_mapping is None:
        logger.info("Scanning for FASTA files.")

        valid_extensions = [".fa", ".fasta", ".fna", ".fas"]

        #        files = os.listdir(input_directory)
        files = os.listdir(os.path.abspath(input_directory))
        fasta_files = [
            x for x in files if os.path.splitext(x)[-1] in valid_extensions
        ]

        if len(fasta_files) == 0:
            raise Exception(
                "The input directory does not contain a file with one of the "
                "following extensions: .fa, .fasta, .fas or .fna")

        logger.info("Found {0}".format(str(fasta_files)))

        fasta_file_name = os.path.join(input_directory, fasta_files[0])

        if len(fasta_files) > 1:
            logger.warning(
                "Not sure how to handle multiple FASTA files in this context. Using {0}"
                .format(fasta_file_name))
    else:
        logger.info("Input Mapping not none : " + str(input_mapping))
        fasta_file_name = os.path.join(
            os.path.join(input_directory, "FASTA.DNA.Assembly"),
            simplejson.loads(input_mapping)["FASTA.DNA.Assembly"])

    logger.info("Building Object.")

    if not os.path.isfile(fasta_file_name):
        raise Exception(
            "The fasta file name {0} is not a file!".format(fasta_file_name))

    if not os.path.isdir(input_directory):
        raise Exception(
            "The input directory {0} is not a valid directory!".format(
                input_directory))

    ws_client = biokbase.workspace.client.Workspace(workspace_service_url)

    workspace_object = ws_client.get_workspace_info(
        {'workspace': workspace_name})

    workspace_id = workspace_object[0]
    workspace_name = workspace_object[1]

    print "FASTA FILE Name :" + fasta_file_name + ":"

    if assembly_name is None:
        base = os.path.basename(fasta_file_name)
        assembly_name = "{0}_assembly".format(os.path.splitext(base)[0])

    ##########################################
    #ASSEMBLY CREATION PORTION  - consume Fasta File
    ##########################################

    logger.info("Starting conversion of FASTA to Assemblies")
    logger.info("Building Assembly Object.")

    input_file_handle = TextFileDecoder.open_textdecoder(
        fasta_file_name, 'ISO-8859-1')
    fasta_header = None
    fasta_description = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    sequence_exists = False

    total_length = 0
    gc_length = 0
    #Note added X and x due to kb|g.1886.fasta
    valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNnXx"
    amino_acid_specific_characters = "PpLlIiFfQqEe"

    #Base_counts - is dict of base characters and their counts.
    base_counts = dict()

    sequence_start = 0
    sequence_stop = 0

    current_line = input_file_handle.readline()
    while current_line != None and len(current_line) > 0:
        #        print "CURRENT LINE: " + current_line
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_exists) and first_header_found:
                logger.error(
                    "There is no sequence related to FASTA record : {0}".
                    format(fasta_header))
                raise Exception(
                    "There is no sequence related to FASTA record : {0}".
                    format(fasta_header))
            if not first_header_found:
                first_header_found = True
                sequence_start = 0
            else:
                sequence_stop = input_file_handle.tell() - len(current_line)
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence:
                    logger.error(
                        "There is no sequence related to FASTA record : {0}".
                        format(fasta_header))
                    raise Exception(
                        "There is no sequence related to FASTA record : {0}".
                        format(fasta_header))
#                for character in total_sequence:
#                    if character not in valid_chars:
#                        if character in amino_acid_specific_characters:
#                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
#                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
                seq_count = collections.Counter(total_sequence.upper())
                seq_dict = dict(seq_count)
                for character in seq_dict:
                    if character in base_counts:
                        base_counts[character] = base_counts[
                            character] + seq_dict[character]
                    else:
                        base_counts[character] = seq_dict[character]
                    if character not in valid_chars:
                        if character in amino_acid_specific_characters:
                            raise Exception(
                                "This fasta file may have amino acids in it instead of the required nucleotides."
                            )
                        raise Exception(
                            "This FASTA file has non nucleic acid characters : {0}"
                            .format(character))

                contig_dict = dict()
                Ncount = 0
                if "N" in seq_dict:
                    Ncount = seq_dict["N"]
                contig_dict["Ncount"] = Ncount
                length = len(total_sequence)
                total_length = total_length + length
                contig_gc_length = len(re.findall('G|g|C|c', total_sequence))
                contig_dict["gc_content"] = float(contig_gc_length) / float(
                    length)
                gc_length = gc_length + contig_gc_length
                fasta_key = fasta_header.strip()
                contig_dict["contig_id"] = fasta_key
                contig_dict["length"] = length
                contig_dict["name"] = fasta_key
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
                contig_dict["md5"] = contig_md5
                contig_set_md5_list.append(contig_md5)

                contig_dict["is_circular"] = "Unknown"
                if fasta_description is not None:
                    contig_dict["description"] = fasta_description
                if contig_information_dict is not None:
                    if contig_information_dict[fasta_key] is not None:
                        if contig_information_dict[fasta_key]["definition"] is not None:
                            contig_dict["description"] = contig_information_dict[fasta_key]["definition"]
                        if contig_information_dict[fasta_key]["is_circular"] is not None:
                            contig_dict["is_circular"] = contig_information_dict[fasta_key]["is_circular"]
                contig_dict["start_position"] = sequence_start
                contig_dict["num_bytes"] = sequence_stop - sequence_start

                #                print "Sequence Start: " + str(sequence_start) + "Fasta: " + fasta_key
                #                print "Sequence Stop: " + str(sequence_stop) + "Fasta: " + fasta_key

                if fasta_key in fasta_dict:
                    raise Exception(
                        "The fasta header {0} appears more than once in the file "
                        .format(fasta_key))
                else:
                    fasta_dict[fasta_key] = contig_dict

                # get set up for next fasta sequence
                sequence_list = []
                sequence_exists = False

#               sequence_start = input_file_handle.tell()
            sequence_start = 0

            fasta_header_line = current_line.strip().replace('>', '')
            try:
                fasta_header, fasta_description = fasta_header_line.split(
                    ' ', 1)
            except:
                fasta_header = fasta_header_line
                fasta_description = None
        else:
            if sequence_start == 0:
                sequence_start = input_file_handle.tell() - len(current_line)
            sequence_list.append(current_line)
            sequence_exists = True
        current_line = input_file_handle.readline()
#        print "ENDING CURRENT LINE: " + current_line

# wrap up last fasta sequence
    if (not sequence_exists) and first_header_found:
        logger.error(
            "There is no sequence related to FASTA record : {0}".format(
                fasta_header))
        raise Exception(
            "There is no sequence related to FASTA record : {0}".format(
                fasta_header))
    elif not first_header_found:
        logger.error("There are no contigs in this file")
        raise Exception("There are no contigs in this file")
    else:
        sequence_stop = input_file_handle.tell()
        # build up sequence and remove all white space
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence:
            logger.error(
                "There is no sequence related to FASTA record : {0}".format(
                    fasta_header))
            raise Exception(
                "There is no sequence related to FASTA record : {0}".format(
                    fasta_header))

#        for character in total_sequence:
        seq_count = collections.Counter(total_sequence.upper())
        seq_dict = dict(seq_count)
        for character in seq_dict:
            if character in base_counts:
                base_counts[
                    character] = base_counts[character] + seq_dict[character]
            else:
                base_counts[character] = seq_dict[character]
            if character not in valid_chars:
                if character in amino_acid_specific_characters:
                    raise Exception(
                        "This fasta file may have amino acids in it instead of the required nucleotides."
                    )
                raise Exception(
                    "This FASTA file has non nucleic acid characters : {0}".
                    format(character))

        contig_dict = dict()
        Ncount = 0
        if "N" in seq_dict:
            Ncount = seq_dict["N"]
        contig_dict["Ncount"] = Ncount
        length = len(total_sequence)
        total_length = total_length + length
        contig_gc_length = len(re.findall('G|g|C|c', total_sequence))
        contig_dict["gc_content"] = float(contig_gc_length) / float(length)
        gc_length = gc_length + contig_gc_length
        fasta_key = fasta_header.strip()
        contig_dict["contig_id"] = fasta_key
        contig_dict["length"] = length
        contig_dict["name"] = fasta_key

        contig_dict["is_circular"] = "Unknown"
        if fasta_description is not None:
            contig_dict["description"] = fasta_description
        if contig_information_dict is not None:
            if contig_information_dict[fasta_key] is not None:
                if contig_information_dict[fasta_key]["definition"] is not None:
                    contig_dict["description"] = contig_information_dict[fasta_key]["definition"]
                if contig_information_dict[fasta_key]["is_circular"] is not None:
                    contig_dict["is_circular"] = contig_information_dict[fasta_key]["is_circular"]
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"] = contig_md5
        contig_set_md5_list.append(contig_md5)
        contig_dict["start_position"] = sequence_start
        contig_dict["num_bytes"] = sequence_stop - sequence_start

        if fasta_key in fasta_dict:
            raise Exception(
                "The fasta header {0} appears more than once in the file ".
                format(fasta_key))
        else:
            fasta_dict[fasta_key] = contig_dict
        input_file_handle.close()

    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(
        sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["assembly_id"] = assembly_name
    contig_set_dict["name"] = assembly_name
    contig_set_dict["external_source"] = source
    contig_set_dict["external_source_id"] = os.path.basename(fasta_file_name)
    #    contig_set_dict["external_source_origination_date"] = str(os.stat(fasta_file_name).st_ctime)

    if date_string is not None:
        contig_set_dict["external_source_origination_date"] = date_string
    contig_set_dict["contigs"] = fasta_dict
    contig_set_dict["dna_size"] = total_length
    contig_set_dict["gc_content"] = float(gc_length) / float(total_length)
    #    print "Fasta dict Keys :"+",".join(fasta_dict.keys())+":"
    contig_set_dict["num_contigs"] = len(fasta_dict.keys())
    contig_set_dict["type"] = "Unknown"
    contig_set_dict["notes"] = "Note MD5s are generated from uppercasing the sequences"
    contig_set_dict["base_counts"] = base_counts
    if taxon_reference is not None:
        contig_set_dict["taxon_ref"] = taxon_reference

    shock_id = None
    handle_id = None
    if shock_id is None:
        shock_info = script_utils.upload_file_to_shock(logger,
                                                       shock_service_url,
                                                       fasta_file_name,
                                                       token=token)
        shock_id = shock_info["id"]
        handles = script_utils.getHandles(logger, shock_service_url,
                                          handle_service_url, [shock_id],
                                          [handle_id], token)
        handle_id = handles[0]

    contig_set_dict["fasta_handle_ref"] = handle_id

    # For future development if the type is updated to the handle_reference instead of a shock_reference
    assembly_not_saved = True
    assembly_provenance = [{"script": __file__,
                            "script_ver": "0.1",
                            "description": "Generated from fasta files generated from v5 of the CS."}]
    while assembly_not_saved:
        try:
            assembly_info = ws_client.save_objects(
                {"workspace": workspace_name,
                 "objects": [{"type": "KBaseGenomeAnnotations.Assembly",
                              "data": contig_set_dict,
                              "name": assembly_name,
                              "provenance": assembly_provenance}]})
            assembly_not_saved = False
        except biokbase.workspace.client.ServerError as err:
            print "ASSEMBLY SAVE FAILED ON genome " + str(
                assembly_name) + " ERROR: " + str(err)
            raise
        except:
            print "ASSEMBLY SAVE FAILED ON genome " + str(
                assembly_name) + " GENERAL_EXCEPTION: " + str(
                    sys.exc_info()[0])
            raise

    logger.info("Conversion completed.")
Exemple #44
def PluginManager(directory=None, logger=script_utils.stderrlogger(__file__)):
    if directory is None:
        raise Exception("Must provide a directory to read plugin configs from!")

    manager = PlugIns(directory, logger)
    return manager
    def diff_p_distribution(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN diff_p_distribution
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FLTRD_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token

        param = args
 
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
 
 
        cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, 
                                          '--workspace_name', param['workspace_name'],
                                          '--object_name', param['object_name'],
                                          '--working_directory', self.RAWEXPR_DIR,
                                          '--output_file_name', self.EXPRESS_FN
                              ]
 
        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        self.logger.info("Identifying differentially expressed genes")
 
        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
          fl = f.readline()
        ncol = len(fl.split('\t'))
        
        # force ANOVA if there are only two samples (one id column plus two data columns)
        if ncol == 3:
            param['method'] = 'anova'
 
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_filter
        cmd_coex_filter = [self.COEX_FILTER, '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), '-o', "{0}/{1}".format(self.FLTRD_DIR, self.FLTRD_FN),
                           '-m', param['method'], '-n', '10', '-s', "{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN),
                           '-x', "{0}/{1}".format(self.RAWEXPR_DIR, self.GENELST_FN), '-t', 'y', '-j', self.PVFDT_FN]
        if 'num_features' in param:
          cmd_coex_filter.append("-n")
          cmd_coex_filter.append(str(param['num_features']))
 
        if 'p_value' in param:
          cmd_coex_filter.append("-p")
          cmd_coex_filter.append(str(param['p_value']))
 
 
        tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
 
        ## loading pvalue distribution FDT
        pvfdt = {'row_labels' :[], 'column_labels' : [], "data" : [[]]};
        pvfdt = OrderedDict(pvfdt)
        with open(self.PVFDT_FN, 'r') as myfile:
           pvfdt = json.load(myfile)
        data_obj_name = "{0}.fdt".format(param['out_figure_object_name'])
        pvfdt['id'] = data_obj_name
 
 
        fig_properties = {"xlabel" : "-log2(p-value)", "ylabel" : "Number of features", "xlog_mode" : "-log2", "ylog_mode" : "none", "title" : "Histogram of P-values", "plot_type" : "histogram"}
        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'MAK.FloatDataTable',
                                                                              'data' : pvfdt,
                                                                              'name' : data_obj_name}]})
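
        # save_objects returns a list of object_info tuples: index 0 is the object
        # id, index 4 the version and index 6 the workspace id, so data_ref below
        # has the standard "ws_id/obj_id/version" form.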

        data_ref = "{0}/{1}/{2}".format(sstatus[0][6], sstatus[0][0], sstatus[0][4])
        fig_properties['data_ref'] = data_ref

        sstatus = ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'CoExpression.FigureProperties',
                                                                              'data' : fig_properties,
                                                                              'name' : (param['out_figure_object_name'])}]})
        result = fig_properties
        #END diff_p_distribution

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method diff_p_distribution return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
def run_filter_genes(workspace_service_url=None,
                     param_file=None,
                     level=logging.INFO,
                     logger=None):
    """
    Narrative Job Wrapper script to execute coex_filter
    
    Args:
        workspace_service_url:  A url for the KBase Workspace service 
        param_file: Path to a JSON parameter file that supplies the remaining
                    options (workspace name, object names, filter method, etc.)
        level: Logging level, defaults to logging.INFO.
    
    Returns:
        Output is written back in WS
    
    Authors:
        Shinjae Yoo
    
    """

    try:
        os.makedirs(RAWEXPR_DIR)
    except:
        pass
    try:
        os.makedirs(FLTRD_DIR)
    except:
        pass
    try:
        os.makedirs(FINAL_DIR)
    except:
        pass

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info(
        "Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
    token = os.environ.get("KB_AUTH_TOKEN")

    with open(param_file) as paramh:
        param = json.load(paramh)

    cmd_dowload_cvt_tsv = [
        FVE_2_TSV, '--workspace_service_url', workspace_service_url,
        '--workspace_name', param['workspace_name'], '--object_name',
        param['object_name'], '--working_directory', RAWEXPR_DIR,
        '--output_file_name', EXPRESS_FN
    ]

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    #  -- copied from FVE_2_TSV
    tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    logger.info("Identifying differentially expressed genes")

    ## Prepare sample file
    # detect num of columns
    with open("{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), 'r') as f:
        fl = f.readline()
    ncol = len(fl.split('\t'))

    with open("{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), 'wt') as s:
        s.write("0")
        for j in range(1, ncol - 1):
            s.write("\t{0}".format(j))
        s.write("\n")

    ## Run coex_filter
    cmd_coex_filter = [
        COEX_FILTER, '-i', "{0}/{1}".format(RAWEXPR_DIR, EXPRESS_FN), '-o',
        "{0}/{1}".format(FLTRD_DIR, FLTRD_FN), '-m', param['method'], '-s',
        "{0}/{1}".format(RAWEXPR_DIR, SAMPLE_FN), '-x',
        "{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), '-t', 'y'
    ]
    if 'num_features' in param:
        cmd_coex_filter.append("-n")
        cmd_coex_filter.append(str(param['num_features']))

    if 'num_features' not in param and 'p_value' in param:
        cmd_coex_filter.append("-p")
        cmd_coex_filter.append(str(param['p_value']))

    if 'p_value' not in param and 'num_features' not in param:
        logger.error("One of p_value or num_features must be defined")
        sys.exit(2)

    #if 'p_value' in param and 'num_features' in param:
    #  logger.error("Both of p_value and num_features cannot be defined together");
    #  sys.exit(3)

    tool_process = subprocess.Popen(cmd_coex_filter, stderr=subprocess.PIPE)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    ## Header correction
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'r') as ff:
        fe = ff.readlines()
    with open("{0}/{1}".format(FLTRD_DIR, FLTRD_FN), 'w') as ff:
        ff.write(fl)  # use original first line that has correct header information
        fe.pop(0)
        ff.writelines(fe)

    ## Upload FVE
    from biokbase.workspace.client import Workspace
    ws = Workspace(url=workspace_service_url,
                   token=os.environ['KB_AUTH_TOKEN'])
    expr = ws.get_objects([{
        'workspace': param['workspace_name'],
        'name': param['object_name']
    }])[0]['data']

    # change workspace to be the referenced object's workspace_name because it may not be in the same working ws due to referencing
    cmd_upload_expr = [
        TSV_2_FVE, '--workspace_service_url', workspace_service_url,
        '--object_name', param['out_expr_object_name'], '--working_directory',
        FINAL_DIR, '--input_directory', FLTRD_DIR, '--output_file_name',
        FINAL_FN
    ]
    tmp_ws = param['workspace_name']
    if 'genome_ref' in expr:
        cmd_upload_expr.append('--genome_object_name')
        obj_infos = ws.get_object_info_new(
            {"objects": [{
                'ref': expr['genome_ref']
            }]})[0]

        if len(obj_infos) < 1:
            logger.error("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))
            raise Exception("Couldn't find {0} from the workspace".format(
                expr['genome_ref']))

        cmd_upload_expr.append(obj_infos[1])
        tmp_ws = obj_infos[7]
        logger.info("{0} => {1} / {2}".format(expr['genome_ref'], tmp_ws,
                                              obj_infos[1]))

    # updated ws name
    cmd_upload_expr.append('--workspace_name')
    cmd_upload_expr.append(tmp_ws)

    tool_process = subprocess.Popen(" ".join(cmd_upload_expr),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.info(stderr)

    with open("{0}/{1}".format(FINAL_DIR, FINAL_FN), 'r') as et:
        eo = json.load(et)

    if 'description' in expr:
        expr['description'] = "{0}, coex_filter by {1}".format(
            expr['description'], " ".join(cmd_coex_filter))
    if 'feature_mapping' in expr:
        expr['feature_mapping'] = eo['feature_mapping']
    expr['data'] = eo['data']

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseFeatureValues.ExpressionMatrix',
            'data': expr,
            'name': (param['out_expr_object_name'])
        }]
    })

    ## Upload FeatureSet
    fs = {
        'description':
        'Differentially expressed genes generated by {0}'.format(
            " ".join(cmd_coex_filter)),
        'elements': {}
    }

    with open("{0}/{1}".format(RAWEXPR_DIR, GENELST_FN), 'r') as glh:
        gl = glh.readlines()
    gl = [x.strip('\n') for x in gl]

    for g in gl:
        if 'genome_ref' in expr:
            fs['elements'][g] = [expr['genome_ref']]
        else:
            fs['elements'][g] = []

    ws.save_objects({
        'workspace':
        param['workspace_name'],
        'objects': [{
            'type': 'KBaseCollections.FeatureSet',
            'data': fs,
            'name': (param['out_fs_object_name'])
        }]
    })
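
run_filter_genes takes all of its remaining options from the JSON param_file. A minimal sketch of such a file, limited to keys the function actually reads (names and values are placeholders):

import json

params = {
    "workspace_name": "my_workspace",             # placeholder
    "object_name": "my_expression_matrix",        # placeholder
    "method": "anova",                            # passed to coex_filter as -m
    "num_features": 200,                          # alternatively, supply "p_value"
    "out_expr_object_name": "my_filtered_expression",
    "out_fs_object_name": "my_filtered_feature_set",
}

with open("filter_genes_params.json", "w") as param_handle:
    json.dump(params, param_handle, indent=4)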
    def const_coex_net_clust(self, ctx, args):
        # ctx is the context object
        # return variables are: result
        #BEGIN const_coex_net_clust
        try:
            os.makedirs(self.RAWEXPR_DIR)
        except:
            pass
        try:
            os.makedirs(self.CLSTR_DIR)
        except:
            pass
        try:
            os.makedirs(self.FINAL_DIR)
        except:
            pass
 
        if self.logger is None:
            self.logger = script_utils.stderrlogger(__file__)
        
        result = {}
        self.logger.info("Starting conversion of KBaseFeatureValues.ExpressionMatrix to TSV")
        token = ctx['token']

        param = args
 
        from biokbase.workspace.client import Workspace
        ws = Workspace(url=self.__WS_URL, token=token)
        expr = ws.get_objects([{'workspace': param['workspace_name'], 'name' : param['object_name']}])[0]['data']
 
 
        eenv = os.environ.copy()
        eenv['KB_AUTH_TOKEN'] = token
        cmd_dowload_cvt_tsv = [self.FVE_2_TSV, '--workspace_service_url', self.__WS_URL, 
                                          '--workspace_name', param['workspace_name'],
                                          '--object_name', param['object_name'],
                                          '--working_directory', self.RAWEXPR_DIR,
                                          '--output_file_name', self.EXPRESS_FN
                              ]
 
        # need shell in this case because the java code is depending on finding the KBase token in the environment
        #  -- copied from FVE_2_TSV
        tool_process = subprocess.Popen(" ".join(cmd_dowload_cvt_tsv), stderr=subprocess.PIPE, shell=True, env=eenv)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            self.logger.info(stderr)
            #raise Exception(stderr)
 
        self.logger.info("Coexpression clustering analysis")
 
        ## Prepare sample file
        # detect num of columns
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 'r') as f:
          fl = f.readline()
        ncol = len(fl.split('\t'))
        
        with open("{0}/{1}".format(self.RAWEXPR_DIR, self.SAMPLE_FN), 'wt') as s:
          s.write("0")
          for j in range(1,ncol-1):
            s.write("\t{0}".format(j))
          s.write("\n")
 
 
        ## Run coex_cluster
        cmd_coex_cluster = [self.COEX_CLUSTER, '-t', 'y',
                           '-i', "{0}/{1}".format(self.RAWEXPR_DIR, self.EXPRESS_FN), 
                           '-o', "{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN), '-m', "{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN) ]
 
        for p in ['net_method', 'minRsq', 'maxmediank', 'maxpower', 'clust_method', 'minModuleSize', 'detectCutHeight']:
           if p in param:
             cmd_coex_cluster.append("--{0}".format(p))
             cmd_coex_cluster.append(str(param[p]))
  
 
        #sys.exit(2) #TODO: No error handling in narrative so we do graceful termination
 
        #if 'p_value' in param and 'num_features' in param:
        #  self.logger.error("Both of p_value and num_features cannot be defined together");
        #  sys.exit(3)
 
        tool_process = subprocess.Popen(cmd_coex_cluster, stderr=subprocess.PIPE)
        stdout, stderr = tool_process.communicate()
        
        if stdout is not None and len(stdout) > 0:
            self.logger.info(stdout)
 
        if stderr is not None and len(stderr) > 0:
            if re.search(r'^There were \d+ warnings \(use warnings\(\) to see them\)', stderr):
              self.logger.info(stderr)
            else:
              self.logger.error(stderr)
              raise Exception(stderr)
 
        
        # build index for gene list
        pos_index ={expr['data']['row_ids'][i]: i for i in range(0, len(expr['data']['row_ids']))}
 
 
        # parse clustering results
        cid2genelist = {}
        cid2stat = {}
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CSTAT_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                cluster, mcor, msec = line.rstrip().replace('"','').split("\t")
                cid2stat[cluster]= [mcor, msec]
        with open("{0}/{1}".format(self.CLSTR_DIR, self.CLSTR_FN),'r') as glh:
            glh.readline() # skip header
            for line in glh:
                gene, cluster = line.rstrip().replace('"','').split("\t")
                if cluster not in cid2genelist:
                    cid2genelist[cluster] = []
                cid2genelist[cluster].append(gene)
 
        if(len(cid2genelist) < 1) :
          self.logger.error("Clustering failed")
          return empty_results("Error: No cluster output", expr,self.__WS_URL, param, self.logger, ws)
          #sys.exit(4)
 
        self.logger.info("Uploading the results onto WS")
        feature_clusters = []
        for cluster in cid2genelist:
            feature_clusters.append({"meancor": float(cid2stat[cluster][0]), "msec": float(cid2stat[cluster][1]), "id_to_pos": {gene: pos_index[gene] for gene in cid2genelist[cluster]}})

        ## Upload Clusters
        feature_clusters ={"original_data": "{0}/{1}".format(param['workspace_name'],param['object_name']),
                           "feature_clusters": feature_clusters}
 
        ws.save_objects({'workspace' : param['workspace_name'], 'objects' : [{'type' : 'KBaseFeatureValues.FeatureClusters',
                                                                          'data' : feature_clusters,
                                                                          'name' : (param['out_object_name'])}]})
        result = {'workspace_name' : param['workspace_name'], 'out_object_name' : param['out_object_name']}
        #END const_coex_net_clust

        # At some point might do deeper type checking...
        if not isinstance(result, dict):
            raise ValueError('Method const_coex_net_clust return value ' +
                             'result is not type dict as required.')
        # return the results
        return [result]
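
The clustering step above expects coex_cluster to write two small tab-separated files. A self-contained sketch of the statistics-file layout, inferred from the parsing loop above (the example contents are made up):

from StringIO import StringIO

# Hypothetical file contents mirroring what the parsing loops above expect:
# CSTAT_FN has one row per cluster, CLSTR_FN one row per gene.
cstat_example = 'cluster\tmcor\tmsec\n"1"\t"0.83"\t"0.61"\n'
clstr_example = 'gene\tcluster\n"kb|g.0.peg.1"\t"1"\n'

cid2stat = {}
handle = StringIO(cstat_example)
handle.readline()  # skip header
for line in handle:
    cluster, mcor, msec = line.rstrip().replace('"', '').split("\t")
    cid2stat[cluster] = [mcor, msec]
# cid2stat == {'1': ['0.83', '0.61']}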
Exemple #48
def transform(shock_service_url=None,
              handle_service_url=None,
              output_file_name=None,
              input_directory=None,
              working_directory=None,
              shock_id=None,
              handle_id=None,
              input_mapping=None,
              mzml_file_name=None,
              polarity=None,
              atlases=None,
              group=None,
              inclusion_order=None,
              normalization_factor=None,
              retention_correction=None,
              level=logging.INFO,
              logger=None):
    """
    Converts mzML file to MetaboliteAtlas2_MAFileInfo json string.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be
                          stored. If the output file name is not specified, the
                          name will default to the name of the input file
                          appended with '_finfo'.
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be
                           written to.
        shock_id: Shock id for the hdf file if it already exists in shock
        handle_id: Handle id for the hdf file if it already exists as a
                    handle
        input_mapping: JSON string mapping of input files to expected types.
                       If you don't get this you need to scan the input
                       directory and look for your files.
        level: Logging level, defaults to logging.INFO.
        atlases: List of MetaboliteAtlas atlas IDs.
        mzml_file_name: Name of the file, optional.  Defaults to the file name.
        polarity: Run polarity.
        group: Run group.
        inclusion_order: Run inclusion_order.
        retention_correction: Run retention_correction.
        normalization_factor: Run normalization factor.

    Returns:
        JSON files on disk that can be saved as a KBase workspace objects.

    Authors:
        Steven Silvester
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting conversion of mzML to MetaboliteAtlas2.MAFileInfo")
    token = os.environ.get('KB_AUTH_TOKEN')

    if not working_directory or not os.path.isdir(working_directory):
        raise Exception(
            "The working directory {0} is not a valid directory!".format(
                working_directory))

    logger.info("Scanning for mzML files.")

    valid_extensions = [".mzML"]

    files = os.listdir(input_directory)
    mzml_files = [
        x for x in files if os.path.splitext(x)[-1] in valid_extensions
    ]

    assert len(mzml_files) != 0

    logger.info("Found {0} files".format(len(mzml_files)))

    for fname in mzml_files:
        path = os.path.join(input_directory, fname)

        if not os.path.isfile(path):
            raise Exception(
                "The input file name {0} is not a file!".format(path))

        hdf_file = mzml_loader.mzml_to_hdf(path)

        if shock_service_url:
            shock_info = script_utils.upload_file_to_shock(logger,
                                                           shock_service_url,
                                                           hdf_file,
                                                           token=token)

        run_info = dict()
        run_info['mzml_file_name'] = (mzml_file_name
                                      or fname.replace('.mzML', ''))
        run_info['atlases'] = atlases or []
        if polarity is not None:
            run_info['polarity'] = polarity
        if group is not None:
            run_info['group'] = group
        if inclusion_order is not None:
            run_info['inclusion_order'] = inclusion_order
        if normalization_factor is not None:
            run_info['normalization_factor'] = normalization_factor
        if retention_correction is not None:
            run_info['retention_correction'] = retention_correction

        if shock_service_url:
            handle_id = script_utils.getHandles(logger,
                                                shock_service_url,
                                                handle_service_url,
                                                [shock_info["id"]],
                                                token=token)[0]
            run_info["run_file_id"] = handle_id
        else:
            run_info['run_file_id'] = hdf_file

        output_file_name = fname.replace('.mzML', '_finfo.json')

        # This generates the json for the object
        objectString = simplejson.dumps(run_info, sort_keys=True, indent=4)

        output_file_path = os.path.join(working_directory, output_file_name)
        with open(output_file_path, "w") as outFile:
            outFile.write(objectString)

    logger.info("Conversion completed.")
    parser.add_argument('--handle_id', 
                        help=script_details["Args"]["handle_id"], 
                        action='store', type=str, nargs='?', default=None, required=False)

    parser.add_argument('--input_mapping', 
                        help=script_details["Args"]["input_mapping"], 
                        action='store', type=unicode, nargs='?', default=None, required=False)

    # Example of a custom argument specific to this uploader
    parser.add_argument('--fasta_reference_only', 
                        help=script_details["Args"]["fasta_reference_only"], 
                        action='store', type=str, default="False", required=False)

    args, unknown = parser.parse_known_args()

    logger = script_utils.stderrlogger(__file__)

    logger.debug(args)
    try:
        if args.fasta_reference_only.lower() == "true":
            ref_only = True
        elif args.fasta_reference_only.lower() == "false":
            ref_only = False
        else:
            raise Exception("Expected true or false for fasta_reference_only.")
    
        transform(shock_service_url = args.shock_service_url, 
                  handle_service_url = args.handle_service_url, 
                  output_file_name = args.output_file_name, 
                  input_directory = args.input_directory, 
                  working_directory = args.working_directory, 
Exemple #50
def transform(shock_service_url=None,
              workspace_service_url=None,
              workspace_name=None,
              object_name=None,
              object_id=None,
              object_version=None,
              working_directory=None,
              output_file_name=None,
              level=logging.INFO,
              logger=None):
    """
    Transforms KBaseGenomes.Genome and KBaseGenomes.ContigSet objects to Genbank file.
    
    Args:
        shock_service_url: If you have shock references you need to make.
        workspace_service_url: KBase Workspace URL
        workspace_name: Name of the workspace to save the data to
        object_name: Name of the genome object to save
        object_id: Id of the genome object to save
        object_version: Version of the genome object to save
        working_directory: A directory where you can do work
        output_file_name: File name for Genbank output
    
    Returns:
        Genbank output file.
    
    Authors:
        Shinjae Yoo, Matt Henderson, Marcin Joachimiak
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)

    logger.info("Starting transformation of KBaseGenomes.Genome to Genbank")

    classpath = [
        "$KB_TOP/lib/jars/kbase/transform/GenBankTransform.jar",
        "$KB_TOP/lib/jars/kbase/genomes/kbase-genomes-20140411.jar",
        "$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.6.jar",
        "$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar",
        "$KB_TOP/lib/jars/kbase/auth/kbase-auth-1398468950-3552bb2.jar",
        "$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"
    ]

    argslist = [
        "--shock_service_url {0}".format(shock_service_url),
        "--workspace_service_url {0}".format(workspace_service_url),
        "--workspace_name {0}".format(workspace_name),
        "--working_directory {0}".format(working_directory)
    ]

    logger.debug(object_name)

    if object_id is not None and len(object_id) > 0:
        argslist.append("--object_id {0}".format(object_id))
    elif object_name is not None and len(object_name) > 0:
        object_name_print = object_name.replace("|", "\\|")
        argslist.append("--object_name {0}".format(object_name_print))
    else:
        logger.error(
            "Transformation from KBaseGenomes.Genome to Genbank.Genome failed: "
            "no object name or object id was provided"
        )
        sys.exit(1)

    if object_version is not None:
        try:
            int(object_version)
        except (ValueError, TypeError):
            logger.error(
                "Version number is not valid!  Expected an integer, but found {0}".
                format(type(object_version)))
            sys.exit(1)

        argslist.append("--object_version {0}".format(object_version))

    if output_file_name is not None and len(output_file_name) > 0:
        argslist.append("--output_file {0}".format(output_file_name))

    arguments = [
        "java", "-classpath", ":".join(classpath),
        "us.kbase.genbank.GenometoGbk", " ".join(argslist)
    ]
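    # Roughly, the command handed to the shell looks like (illustrative only):
    #
    #     java -classpath <jar1>:<jar2>:... us.kbase.genbank.GenometoGbk \
    #         --shock_service_url ... --workspace_service_url ... \
    #         --workspace_name ... --working_directory ... [--object_name ...]
    #
    # The whole argument string is joined into one element and the command is
    # run through the shell so the Java code can pick up KB_AUTH_TOKEN from the
    # environment.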

    logger.debug(arguments)

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    tool_process = subprocess.Popen(" ".join(arguments),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.error(
            "Transformation from KBaseGenomes.Genome to Genbank.Genome failed on {0}"
            .format(object_name))
        logger.error(stderr)
        sys.exit(1)

    logger.info(
        "Transformation from KBaseGenomes.Genome to Genbank.Genome completed.")
    sys.exit(0)
Exemple #51
def main():
    script_details = script_utils.parse_docs(transform.__doc__)

    import argparse

    parser = argparse.ArgumentParser(prog=__file__,
                                     description=script_details["Description"],
                                     epilog=script_details["Authors"])

    parser.add_argument('--shock_service_url',
                        help=script_details["Args"]["shock_service_url"],
                        action='store',
                        type=str,
                        nargs='?',
                        required=True)
    parser.add_argument('--handle_service_url',
                        help=script_details["Args"]["handle_service_url"],
                        action='store',
                        type=str,
                        nargs='?',
                        default=None,
                        required=False)
    parser.add_argument('--input_directory',
                        help=script_details["Args"]["input_directory"],
                        action='store',
                        type=str,
                        nargs='?',
                        required=True)
    parser.add_argument('--working_directory',
                        help=script_details["Args"]["working_directory"],
                        action='store',
                        type=str,
                        nargs='?',
                        required=True)
    parser.add_argument('--output_file_name',
                        help=script_details["Args"]["output_file_name"],
                        action='store',
                        type=str,
                        nargs='?',
                        default=None,
                        required=False)
    parser.add_argument('--shock_id',
                        help=script_details["Args"]["shock_id"],
                        action='store',
                        type=str,
                        nargs='?',
                        default=None,
                        required=False)
    parser.add_argument('--handle_id',
                        help=script_details["Args"]["handle_id"],
                        action='store',
                        type=str,
                        nargs='?',
                        default=None,
                        required=False)

    parser.add_argument('--input_mapping',
                        help=script_details["Args"]["input_mapping"],
                        action='store',
                        type=unicode,
                        nargs='?',
                        default=None,
                        required=False)

    # custom arguments specific to this uploader
    parser.add_argument('--polarity',
                        help=script_details["Args"]["polarity"],
                        action='store',
                        type=int,
                        required=False)
    parser.add_argument('--group',
                        help=script_details["Args"]["group"],
                        action='store',
                        type=str,
                        required=False)
    parser.add_argument('--inclusion_order',
                        help=script_details["Args"]["inclusion_order"],
                        action='store',
                        type=int,
                        required=False)
    parser.add_argument('--retention_correction',
                        help=script_details["Args"]["retention_correction"],
                        action='store',
                        type=float,
                        required=False)
    parser.add_argument('--atlases',
                        help=script_details["Args"]["atlases"],
                        action='store',
                        type=str,
                        nargs='?',
                        required=False)
    parser.add_argument('--mzml_file_name',
                        help=script_details["Args"]["mzml_file_name"],
                        action='store',
                        type=str,
                        required=False)
    parser.add_argument('--normalization_factor',
                        help=script_details["Args"]["normalization_factor"],
                        action='store',
                        type=float,
                        required=False)

    args, unknown = parser.parse_known_args()

    logger = script_utils.stderrlogger(__file__)

    logger.debug(args)
    try:
        transform(shock_service_url=args.shock_service_url,
                  handle_service_url=args.handle_service_url,
                  output_file_name=args.output_file_name,
                  input_directory=args.input_directory,
                  working_directory=args.working_directory,
                  shock_id=args.shock_id,
                  handle_id=args.handle_id,
                  input_mapping=args.input_mapping,
                  mzml_file_name=args.mzml_file_name,
                  polarity=args.polarity,
                  atlases=args.atlases,
                  group=args.group,
                  inclusion_order=args.inclusion_order,
                  normalization_factor=args.normalization_factor,
                  retention_correction=args.retention_correction,
                  logger=logger)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)
Exemple #52
def main():
    parser = script_utils.ArgumentParser(
        prog=SCRIPT_NAME,
        description='Converts KBaseFile.AssemblyFile to ' +
        'KBaseGenomes.ContigSet.',
        epilog='Authors: Jason Baumohl, Matt Henderson, Gavin Price')
    # working_directory is a standard argument for all uploaders
    parser.add_argument('--working_directory',
                        help='Directory for temporary files',
                        action='store',
                        type=str,
                        required=True)

    # Arguments specific to this conversion script
    parser.add_argument('--workspace_service_url',
                        help='workspace service url',
                        action='store',
                        type=str,
                        required=True)
    parser.add_argument('--source_workspace_name',
                        help='name of the source workspace',
                        action='store',
                        type=str,
                        required=True)
    parser.add_argument('--destination_workspace_name',
                        help='name of the target workspace',
                        action='store',
                        type=str,
                        required=True)
    parser.add_argument('--source_object_name',
                        help='name of the workspace object to convert',
                        action='store',
                        type=str,
                        required=True)
    parser.add_argument('--destination_object_name',
                        help='name for the produced ContigSet.',
                        action='store',
                        type=str,
                        required=True)

    parser.add_argument(
        '--fasta_reference_only',
        help='Creates a reference to the fasta file in Shock, but does not ' +
        'store the sequences in the workspace object.  Not recommended ' +
        'unless the fasta file is larger than 1GB. This is the default ' +
        'behavior for files that large.',
        action='store_true',
        required=False)
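    # A hypothetical command line for this converter (all values below are
    # placeholders, not taken from the original source):
    #
    #     python <this_script>.py \
    #         --workspace_service_url https://kbase.us/services/ws \
    #         --working_directory /tmp/work \
    #         --source_workspace_name my_ws \
    #         --source_object_name my_assembly \
    #         --destination_workspace_name my_ws \
    #         --destination_object_name my_contigset \
    #         --fasta_reference_only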

    # ignore unknown arguments for now
    args, _ = parser.parse_known_args()

    logger = script_utils.stderrlogger(__file__)
    try:
        # make sure there's at least something for a token
        if not TOKEN:
            raise Exception("Unable to retrieve KBase Authentication token!")

        shock_url, shock_id, ref, source = download_workspace_data(
            args.workspace_service_url, args.source_workspace_name,
            args.source_object_name, args.working_directory, logger)

        inputfile = os.path.join(args.working_directory,
                                 args.source_object_name)

        cs = convert_to_contigs(None,
                                None,
                                inputfile,
                                args.destination_object_name,
                                args.working_directory,
                                shock_id,
                                None,
                                args.fasta_reference_only,
                                source,
                                logger=logger)

        upload_workspace_data(cs, args.workspace_service_url, ref,
                              args.destination_workspace_name,
                              args.destination_object_name)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)
Exemple #53
    parser.add_argument("--output_file_name",
                        help=script_details["Args"]["output_file_name"],
                        action="store",
                        type=str,
                        nargs="?",
                        required=False)
    parser.add_argument("--working_directory",
                        help=script_details["Args"]["working_directory"],
                        action="store",
                        type=str,
                        nargs='?',
                        required=False)

    args, unknown = parser.parse_known_args()

    logger = script_utils.stderrlogger(__file__, level=logging.DEBUG)

    try:
        transform(shock_service_url=args.shock_service_url,
                  workspace_service_url=args.workspace_service_url,
                  workspace_name=args.workspace_name,
                  object_name=args.object_name,
                  object_version=args.object_version,
                  working_directory=args.working_directory,
                  output_file_name=args.output_file_name,
                  logger=logger)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)

    sys.exit(0)
def transform(workspace_service_url=None,
              workspace_name=None,
              object_name=None,
              output_file_name=None,
              input_directory=None,
              working_directory=None,
              has_replicates=None,
              input_mapping=None,
              format_type=None,
              level=logging.INFO,
              logger=None):
    """
    Converts Growth TSV file to json string of KBaseEnigmaMetals.GrowthMatrix type.

    Args:
        workspace_service_url: URL for a KBase Workspace service where KBase
                               objects are stored.
        workspace_name: The name of the destination workspace.
        object_name: The destination object name.
        output_file_name: A file name where the output JSON string should be stored.
                          If the output file name is not specified the name will
                          default to the name of the input file appended with
                          '_output.json'.
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be
                           written to.
        has_replicates: 0 if the input file contains marked series of replicates,
                        1 if the input file contains non-marked series of replicates,
                        2 if the input file contains no replicates.
        input_mapping: JSON string mapping of input files to expected types.
                       If this is not provided, the input directory is
                       scanned for the expected files.
        format_type: Manually defined type of the TSV file format.

    Returns:
        JSON files on disk that can be saved as KBase workspace objects.

    Authors:
        Roman Sutormin, Alexey Kazakov
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)


    # logger.info("Starting conversion of Growth TSV to KBaseEnigmaMetals.GrowthMatrix")
    # token = os.environ.get('KB_AUTH_TOKEN')

    if not working_directory or not os.path.isdir(working_directory):
        raise Exception(
            "The working directory {0} is not a valid directory!".format(
                working_directory))

    classpath = [
        "$KB_TOP/lib/jars/kbase/transform/kbase_transform_deps.jar",
        "$KB_TOP/lib/jars/apache_commons/commons-cli-1.2.jar",
        "$KB_TOP/lib/jars/apache_commons/commons-lang3-3.1.jar",
        "$KB_TOP/lib/jars/ini4j/ini4j-0.5.2.jar",
        "$KB_TOP/lib/jars/jackson/jackson-annotations-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-core-2.2.3.jar",
        "$KB_TOP/lib/jars/jackson/jackson-databind-2.2.3.jar",
        "$KB_TOP/lib/jars/jetty/jetty-all-7.0.0.jar",
        "$KB_TOP/lib/jars/jna/jna-3.4.0.jar",
        "$KB_TOP/lib/jars/kbase/auth/kbase-auth-0.3.1.jar",
        "$KB_TOP/lib/jars/kbase/common/kbase-common-0.0.10.jar",
        "$KB_TOP/lib/jars/servlet/servlet-api-2.5.jar",
        "$KB_TOP/lib/jars/syslog4j/syslog4j-0.9.46.jar",
        "$KB_TOP/lib/jars/kbase/workspace/WorkspaceClient-0.2.0.jar"
    ]

    mc = "us.kbase.kbaseenigmametals.GrowthMatrixUploader"

    argslist = [
        "--workspace_service_url {0}".format(workspace_service_url),
        "--workspace_name {0}".format(workspace_name),
        "--object_name {0}".format(object_name),
        "--input_directory {0}".format(input_directory),
        "--has_replicates {0}".format(has_replicates),
        "--working_directory {0}".format(working_directory)
    ]
    if output_file_name:
        argslist.append("--output_file_name {0}".format(output_file_name))
    if input_mapping:
        argslist.append("--input_mapping {0}".format(input_mapping))
    argslist.append("--format_type {0}".format(format_type))

    arguments = [
        "java", "-classpath", ":".join(classpath), mc, " ".join(argslist)
    ]

    logger.info(arguments)

    # need shell in this case because the java code is depending on finding the KBase token in the environment
    tool_process = subprocess.Popen(" ".join(arguments),
                                    stderr=subprocess.PIPE,
                                    shell=True)
    stdout, stderr = tool_process.communicate()

    if stdout is not None and len(stdout) > 0:
        logger.info(stdout)

    if stderr is not None and len(stderr) > 0:
        logger.error(stderr)
    if tool_process.returncode:
        logger.error(
            "Transformation from TSV.Growth to KBaseEnigmaMetals.GrowthMatrix failed on {0}"
            .format(input_directory))
        sys.exit(1)

    logger.info("Conversion completed.")
if __name__ == "__main__":
    script_details = script_utils.parse_docs(validate.__doc__)

    import argparse

    parser = argparse.ArgumentParser(prog=__file__,
                                     description=script_details["Description"],                                     
                                     epilog=script_details["Authors"])
    parser.add_argument("--input_directory", help=script_details["Args"]["input_directory"], type=str, nargs="?", required=True)
    parser.add_argument("--working_directory", help=script_details["Args"]["working_directory"], type=str, nargs="?", required=True)

    args, unknown = parser.parse_known_args()

    returncode = 0

    try:
        validate(input_directory = args.input_directory, 
                 working_directory = args.working_directory)
    except Exception as e:
        logger = script_utils.stderrlogger(__file__, logging.INFO)
        logger.exception(e)
        returncode = 1

    sys.stdout.flush()
    sys.stderr.flush()
    os.close(sys.stdout.fileno())
    os.close(sys.stderr.fileno())    
    sys.exit(returncode)

Exemple #56
def transform(shock_service_url=None, handle_service_url=None, 
              output_file_name=None, input_directory=None, 
              working_directory=None, shock_id=None, handle_id=None, 
              input_mapping=None, fasta_reference_only=False, 
              level=logging.INFO, logger=None):
    """
    Converts FASTA file to KBaseGenomes.ContigSet json string.  
    Note the MD5 for the contig is generated by uppercasing the sequence.
    The ContigSet MD5 is generated by taking the MD5 of joining the sorted 
    list of individual contig's MD5s with a comma separator.

    Args:
        shock_service_url: A url for the KBase SHOCK service.
        handle_service_url: A url for the KBase Handle Service.
        output_file_name: A file name where the output JSON string should be stored.  
                          If the output file name is not specified the name will default 
                          to the name of the input file appended with '_contig_set'
        input_directory: The directory where files will be read from.
        working_directory: The directory the resulting json file will be written to.
        shock_id: Shock id for the fasta file if it already exists in shock
        handle_id: Handle id for the fasta file if it already exists as a handle
        input_mapping: JSON string mapping of input files to expected types.
                       If this is not provided, the input directory is
                       scanned for the expected files.
        fasta_reference_only: Creates a reference to the fasta file in Shock, but
                              does not store the sequences in the workspace object.
                              Not recommended unless the fasta file is larger than
                              1GB. This is the default behavior for files that large.
        level: Logging level, defaults to logging.INFO.
        
    Returns:
        JSON file on disk that can be saved as a KBase workspace object.

    Authors:
        Jason Baumohl, Matt Henderson
    """

    if logger is None:
        logger = script_utils.stderrlogger(__file__)
    
    logger.info("Starting conversion of FASTA to KBaseGenomes.ContigSet")
    token = os.environ.get('KB_AUTH_TOKEN')
        
    if input_mapping is None:
        logger.info("Scanning for FASTA files.")
    
        valid_extensions = [".fa",".fasta",".fna",".fas"]
    
        files = os.listdir(input_directory)
        fasta_files = [x for x in files if os.path.splitext(x)[-1] in valid_extensions]
            
        if len(fasta_files) == 0:
            raise Exception("The input directory does not contain a file with one of the following extensions: .fa, .fasta, .fna or .fas")

    
        logger.info("Found {0}".format(str(fasta_files)))

        input_file_name = os.path.join(input_directory, fasta_files[0])
    
        if len(fasta_files) > 1:
            logger.warning("Not sure how to handle multiple FASTA files in this context. Using {0}".format(input_file_name))
    else:
        input_file_name = os.path.join(os.path.join(input_directory, "FASTA.DNA.Assembly"), simplejson.loads(input_mapping)["FASTA.DNA.Assembly"])
        
                
    logger.info("Building Object.")
 
    if not os.path.isfile(input_file_name):
        raise Exception("The input file name {0} is not a file!".format(input_file_name))        

    if not os.path.isdir(working_directory):
        raise Exception("The working directory {0} is not a valid directory!".format(working_directory))        

    logger.debug(fasta_reference_only)

    # default if not too large
    contig_set_has_sequences = True 
    if fasta_reference_only:
        contig_set_has_sequences = False 

    fasta_filesize = os.stat(input_file_name).st_size
    if fasta_filesize > 1000000000:
        # Fasta file too large to save sequences into the ContigSet object.
        contigset_warn = """The FASTA input file seems to be too large. A ContigSet
                            object will be created without sequences, but will
                            contain a reference to the file."""
        logger.warning(contigset_warn) 
        contig_set_has_sequences = False 

    input_file_handle = open(input_file_name, 'r')
    
    fasta_header = None
    sequence_list = []
    fasta_dict = dict()
    first_header_found = False
    contig_set_md5_list = []
    # Pattern for replacing white space
    pattern = re.compile(r'\s+')
    sequence_exists = False
    
    valid_chars = "-AaCcGgTtUuWwSsMmKkRrYyBbDdHhVvNn"
    amino_acid_specific_characters = "PpLlIiFfQqEe" 
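    # The loop below walks the FASTA file line by line.  Each time a new header
    # ('>') is seen, the previous record is finalized: its sequence lines are
    # joined, whitespace is stripped, the distinct characters are counted with
    # collections.Counter and checked against the nucleotide alphabet above
    # (amino-acid-only letters trigger a more specific error), and a contig
    # dictionary keyed by the header identifier is added to fasta_dict.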

    for current_line in input_file_handle:
        if (current_line[0] == ">"):
            # found a header line
            # Wrap up previous fasta sequence
            if (not sequence_exists) and first_header_found:
                logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
                raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
            if not first_header_found:
                first_header_found = True
            else:
                # build up sequence and remove all white space
                total_sequence = ''.join(sequence_list)
                total_sequence = re.sub(pattern, '', total_sequence)
                if not total_sequence :
                    logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
                    raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header))
#                for character in total_sequence:
                seq_count = collections.Counter(total_sequence)
                seq_dict = dict(seq_count)
                for character in seq_dict:
                    if character not in valid_chars:
                        if character in amino_acid_specific_characters:
                            raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                        raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))
#                fasta_key = fasta_header.strip()
                try:
                    fasta_key , fasta_description = fasta_header.strip().split(' ',1)
                except:
                    fasta_key = fasta_header.strip()
                    fasta_description = None

                if fasta_key == '':
                    raise Exception("One of the FASTA header ('>') lines does not have an identifier associated with it")
                contig_dict = dict() 
                contig_dict["id"] = fasta_key 
                contig_dict["length"] = len(total_sequence) 
                contig_dict["name"] = fasta_key
                if fasta_description is None:
                    contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
                else:
                    contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description)
                contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest() 
                contig_dict["md5"] = contig_md5 
                contig_set_md5_list.append(contig_md5)
                 
                if contig_set_has_sequences: 
                    contig_dict["sequence"]= total_sequence
                else: 
                    contig_dict["sequence"]= ""
                
                if fasta_key in fasta_dict:
                    raise Exception("The fasta header {0} appears more than once in the file ".format(fasta_key)) 
                else:
                    fasta_dict[fasta_key] = contig_dict                 
               
                # get set up for next fasta sequence
                sequence_list = []
                sequence_exists = False
            
            fasta_header = current_line.replace('>','')
        else:
            sequence_list.append(current_line)
            sequence_exists = True

    input_file_handle.close()

    # wrap up last fasta sequence
    if (not sequence_exists) and first_header_found: 
        logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header))        
        raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
    elif not first_header_found :
        logger.error("There are no contigs in this file") 
        raise Exception("There are no contigs in this file") 
    else: 
        # build up sequence and remove all white space      
        total_sequence = ''.join(sequence_list)
        total_sequence = re.sub(pattern, '', total_sequence)
        if not total_sequence :
            logger.error("There is no sequence related to FASTA record : {0}".format(fasta_header)) 
            raise Exception("There is no sequence related to FASTA record : {0}".format(fasta_header)) 

#        for character in total_sequence: 
        seq_count = collections.Counter(total_sequence)
        seq_dict = dict(seq_count)
        for character in seq_dict:
            if character not in valid_chars: 
                if character in amino_acid_specific_characters:
                    raise Exception("This fasta file may have amino acids in it instead of the required nucleotides.")
                raise Exception("This FASTA file has non nucleic acid characters : {0}".format(character))

#        fasta_key = fasta_header.strip()
        try: 
            fasta_key , fasta_description = fasta_header.strip().split(' ',1)
        except:
            fasta_key = fasta_header.strip()
            fasta_description = None
 
        if fasta_key == '':
            raise Exception("One of the FASTA header ('>') lines does not have an identifier associated with it")
        contig_dict = dict()
        contig_dict["id"] = fasta_key 
        contig_dict["length"] = len(total_sequence)
        contig_dict["name"] = fasta_key
 
        if fasta_description is None: 
            contig_dict["description"] = "Note MD5 is generated from uppercasing the sequence" 
        else: 
            contig_dict["description"] = "%s.  Note MD5 is generated from uppercasing the sequence" % (fasta_description) 
        contig_md5 = hashlib.md5(total_sequence.upper()).hexdigest()
        contig_dict["md5"]= contig_md5
        contig_set_md5_list.append(contig_md5)
        
        if contig_set_has_sequences: 
            contig_dict["sequence"] = total_sequence 
        else:
            contig_dict["sequence"]= ""
        if fasta_key in fasta_dict:
            raise Exception("The fasta header {0} appears more than once in the file ".format(fasta_key)) 
        else:
            fasta_dict[fasta_key] = contig_dict 


    if output_file_name is None:
        # default to the input file name minus its extension, with "_contig_set" appended
        base = os.path.basename(input_file_name)
        output_file_name = "{0}_contig_set.json".format(os.path.splitext(base)[0])
    
    contig_set_dict = dict()
    contig_set_dict["md5"] = hashlib.md5(",".join(sorted(contig_set_md5_list))).hexdigest()
    contig_set_dict["id"] = output_file_name
    contig_set_dict["name"] = output_file_name
    contig_set_dict["source"] = "KBase"
    contig_set_dict["source_id"] = os.path.basename(input_file_name) 
    contig_set_dict["contigs"] = [fasta_dict[x] for x in sorted(fasta_dict.keys())]

    if shock_id is None:
        shock_info = script_utils.upload_file_to_shock(logger, shock_service_url, input_file_name, token=token)
        shock_id = shock_info["id"]
    
    contig_set_dict["fasta_ref"] = shock_id

    # For future development if the type is updated to the handle_reference instead of a shock_reference

    # This generates the json for the object
    objectString = simplejson.dumps(contig_set_dict, sort_keys=True, indent=4)
    if len(contig_set_dict["contigs"]) == 0:
        raise Exception("There appears to be no FASTA DNA Sequences in the input file.") 
    #The workspace has a 1GB limit
    if sys.getsizeof(objectString) > 1E9 :
        contig_set_dict["contigs"] = []
        objectString = simplejson.dumps(contig_set_dict, sort_keys=True, indent=4)
        logger.warning("The fasta file has a very large number of contigs thus resulting in an object being too large if " 
                       "the contigs are to have metadata. The resulting contigset will not have individual metadata for the contigs.")

    logger.info("ContigSet data structure creation completed.  Writing out JSON.")

    output_file_path = os.path.join(working_directory,output_file_name) 
    with open(output_file_path, "w") as outFile:
        outFile.write(objectString)
    
    logger.info("Conversion completed.")