def validate(input_file_name, working_directory, level=logging.INFO, logger=None):
    """
    Validates an SBML file with the external validateSBML tool.

    Args:
        input_file_name: An input SBML file.
        working_directory: A directory where any output files produced by validation can be written.
        level: Logging level, defaults to logging.INFO.

    Returns:
        None on success.
        All statements passed to standard out via a logger, any errors throw an Exception and
        result in a non-zero exit status back to the caller.

    Authors:
        Srividya Ramikrishnan, Matt Henderson
    """
    # NOTE: this docstring is parsed at runtime (script_utils.parse_docs) to
    # build the command-line help, so keep the section layout above intact.

    if logger is None:
        # Respect the requested level when we have to create the logger ourselves.
        logger = script_utils.stdoutlogger(__file__, level)

    # KB_TOP locates the validator binary; a missing variable now fails with a
    # KeyError naming it instead of an opaque error from joining None.
    command = os.path.join(os.environ["KB_TOP"], "bin/validateSBML")

    fileName = os.path.split(input_file_name)[-1]

    if not os.path.isfile(input_file_name):
        raise Exception("Not a file {0}".format(fileName))

    logger.info("Starting SBML validation of {0}".format(fileName))

    # Only stderr is captured: any output there is treated as a validation failure.
    tool_process = subprocess.Popen([command, input_file_name], stderr=subprocess.PIPE)
    _, stderr = tool_process.communicate()

    if len(stderr) > 0:
        logger.error("Validation failed on {0}".format(fileName))
        # Surface what the tool reported so the failure can be diagnosed.
        logger.error(stderr.decode("utf-8", "replace").strip())
        raise Exception("Validation failed!")

    logger.info("Validation passed on {0}".format(fileName))
    logger.info("Validation passed.")
def transform(shock_service_url=None,
              workspace_service_url=None,
              workspace_name=None,
              object_name=None,
              contigset_object_name=None,
              input_directory=None,
              working_directory=None,
              level=logging.INFO,
              logger=None):
    """
    Transforms Genbank file to KBaseGenomes.Genome and KBaseGenomes.ContigSet objects.

    Args:
        shock_service_url: If you have shock references you need to make.
        workspace_service_url: KBase Workspace URL
        workspace_name: Name of the workspace to save the data to
        object_name: Name of the genome object to save
        contigset_object_name: Name of the ContigSet object that is created with this Genome
        input_directory: A directory of either a genbank file or a directory of partial genome files to merge
        working_directory: A directory where you can do work
        level: Logging level, defaults to logging.INFO.

    Returns:
        Workspace objects saved to the user's workspace.

    Authors:
        Shinjae Yoo, Marcin Joachimiak, Matt Henderson
    """
    # NOTE(review): the statements in this block only log the start of the
    # transformation and verify that the Java classpath jars exist; the
    # conversion/upload step described above is not performed here - confirm
    # whether it lives elsewhere.

    if logger is None:
        # Use the caller's requested level rather than a hard-coded INFO.
        logger = script_utils.stdoutlogger(__file__, level)

    logger.info("Starting transformation of Genbank to KBaseGenomes.Genome")

    # TODO get the classpath definition out into the config instead
    KB_TOP = os.environ["KB_TOP"]

    jars = ["kbase/transform/kbase_transform_deps.jar",
            "kbase/genomes/kbase-genomes-20140411.jar",
            "kbase/common/kbase-common-0.0.6.jar",
            "jackson/jackson-annotations-2.2.3.jar",
            "jackson/jackson-core-2.2.3.jar",
            "jackson/jackson-databind-2.2.3.jar",
            "kbase/auth/kbase-auth-1398468950-3552bb2.jar",
            "kbase/workspace/WorkspaceClient-0.2.0.jar"]
    classpath = ["{}/lib/jars/{}".format(KB_TOP, jar) for jar in jars]

    # Explicit check instead of assert: assert statements are removed when
    # Python runs with -O, which would silently skip this verification.
    for p in classpath:
        if not os.path.exists(p):
            raise IOError("Unable to find classpath library {}".format(p))
def validate(input_directory, working_directory, level=logging.INFO, logger=None):
    """
    Validates any file containing sequence data.

    Args:
        input_directory: A directory containing one or more SequenceRead files.
        working_directory: A directory where any output files produced by validation can be written.
        level: Logging level, defaults to logging.INFO.

    Returns:
        Currently writes to stderr with a Java Exception trace on error, otherwise no output.

    Authors:
        Srividya Ramikrishnan, Jason Baumohl, Matt Henderson
    """
    # Raises Exception("Validation failed!") as soon as one file fails, and
    # Exception("No files were found ...") when nothing was validated.

    if logger is None:
        logger = script_utils.stdoutlogger(__file__, level)

    # TODO get classpaths and binary paths into the config
    KB_TOP = os.environ["KB_TOP"]
    fasta_executable = "{}/lib/jars/FastaValidator/FastaValidator-1.0.jar".format(KB_TOP)
    fastq_executable = "fastQValidator"

    fasta_validator_present = os.path.isfile(fasta_executable)

    # Search PATH for the FASTQ validator; stop at the first file with that
    # name and remember whether it is executable.
    fastq_validator_present = False
    fastq_validator_runnable = False
    for path in os.environ["PATH"].split(os.pathsep):
        exe_file = os.path.join(path.strip('"'), fastq_executable)
        if os.path.isfile(exe_file):
            fastq_validator_present = True
            fastq_validator_runnable = os.access(exe_file, os.X_OK)
            break

    # Missing tools are only warned about here; the run itself fails later.
    if not fasta_validator_present:
        logger.warning("FASTA validator executable FastaValidator-1.0.jar could not be found.")

    if not fastq_validator_present:
        logger.warning("FASTQ validator executable fastQValidator could not be found.")
    elif not fastq_validator_runnable:
        logger.warning("FASTQ validator executable fastQValidator does not have execute permissions.")

    fasta_extensions = [".fa", ".fas", ".fasta", ".fna"]
    fastq_extensions = [".fq", ".fastq", ".fnq"]
    extensions = fasta_extensions + fastq_extensions

    checked = False
    validated = True
    for input_file_name in os.listdir(input_directory):
        logger.info("Checking for SequenceReads file : {0}".format(input_file_name))

        filePath = os.path.abspath(os.path.join(input_directory, input_file_name))
        extension = os.path.splitext(input_file_name)[-1]

        if not os.path.isfile(filePath):
            logger.warning("Skipping directory {0}".format(input_file_name))
            continue
        elif extension not in extensions:
            logger.warning("Unrecognized file type {}, skipping.".format(extension))
            continue

        logger.info("Starting SequenceReads validation of {0}".format(input_file_name))

        if extension in fasta_extensions:
            # TODO This needs to be changed, this is really just a demo program for this library and not a serious tool
            arguments = ["java", "-classpath", fasta_executable, "FVTester", filePath]
        else:
            logger.info("Checking FASTQ line count for errors.")
            with open(filePath, 'rb') as seqfile:
                line_number = sum(1 for _ in seqfile)
            logger.info("FASTQ line count check completed.")

            # FASTQ records are four lines each; a remainder suggests stray blank lines.
            if line_number % 4 > 0:
                logger.warning("Found extra lines, removing blank lines.")

                # Copy only non-blank lines; previously every line was copied,
                # so the "cleaned" file was identical to the input.
                with open(filePath, 'r') as seqfile, open(filePath + ".tmp", 'w') as out:
                    for line in seqfile:
                        if line.strip():
                            out.write(line)

                os.remove(filePath)
                os.rename(filePath + ".tmp", filePath)
                logger.warning("Blank lines removed from FASTQ.")

            arguments = [fastq_executable, "--file", filePath, "--maxErrors", "10"]

            if check_interleavedPE(filePath) == 1:
                arguments.append("--disableSeqIDCheck")

        # Log the command with the short file name instead of the full path.
        logger.info("Running {}".format(" ".join(arguments).replace(filePath, input_file_name)))

        # Pass an argument list without a shell so file names containing quotes
        # or other shell metacharacters reach the tool unmodified.
        try:
            return_code = subprocess.call(arguments)
        except OSError as e:
            # A shell would have reported a missing executable through a
            # non-zero exit status; keep treating it as a validation failure.
            logger.error("Unable to run {0}: {1}".format(arguments[0], e))
            return_code = 1

        if return_code != 0:
            logger.error("Validation failed on {0}".format(input_file_name))
            validated = False
            break
        else:
            logger.info("Validation passed on {0}".format(input_file_name))
            checked = True

    if not validated:
        raise Exception("Validation failed!")
    elif not checked:
        raise Exception("No files were found that had a valid fasta or fastq extension.")
    else:
        logger.info("Validation passed.")
raise Exception("Validation failed!") else: logger.info("Validation passed.") if __name__ == "__main__": script_details = script_utils.parse_docs(validate.__doc__) import argparse parser = argparse.ArgumentParser(prog=__file__, description=script_details["Description"], epilog=script_details["Authors"]) parser.add_argument("--input_file_name", help=script_details["Args"]["input_file_name"], type=str, nargs="?", required=True) parser.add_argument("--working_directory", help=script_details["Args"]["working_directory"], type=str, nargs="?", required=True) args = parser.parse_args() logger = script_utils.stdoutlogger(__file__) try: validate(input_file_name = args.input_file_name, working_directory = args.working_directory, level = logging.DEBUG, logger = logger) except Exception, e: logger.exception(e) sys.exit(1) sys.exit(0)
# Command-line entry point: build the argument parser from the parsed
# docstring details, run validate() and map the outcome to an exit status.
import argparse

parser = argparse.ArgumentParser(prog=__file__,
                                 description=script_details["Description"],
                                 epilog=script_details["Authors"])
parser.add_argument("--input_file_name",
                    help=script_details["Args"]["input_file_name"],
                    type=str,
                    nargs="?",
                    required=True)
parser.add_argument("--working_directory",
                    help=script_details["Args"]["working_directory"],
                    type=str,
                    nargs="?",
                    required=True)

args = parser.parse_args()

logger = script_utils.stdoutlogger(__file__)

# Any failure is logged with its traceback and reported to the caller
# through a non-zero exit status.
try:
    validate(input_file_name=args.input_file_name,
             working_directory=args.working_directory,
             level=logging.DEBUG,
             logger=logger)
except Exception as e:
    logger.exception(e)
    sys.exit(1)

sys.exit(0)
def validate(input_directory, working_directory, level=logging.INFO, logger=None):
    """
    Validates any file containing sequence data.

    Args:
        input_directory: A directory containing one or more SequenceRead files.
        working_directory: A directory where any output files produced by validation can be written.
        level: Logging level, defaults to logging.INFO.

    Returns:
        Currently writes to stderr with a Java Exception trace on error, otherwise no output.

    Authors:
        Srividya Ramikrishnan, Jason Baumohl, Matt Henderson
    """
    # Raises Exception("Validation failed!") as soon as one file fails, and
    # Exception("No files were found ...") when nothing was validated.

    if logger is None:
        logger = script_utils.stdoutlogger(__file__, level)

    # TODO get classpaths and binary paths into the config
    KB_TOP = os.environ["KB_TOP"]
    fasta_executable = "{}/lib/jars/FastaValidator/FastaValidator-1.0.jar".format(
        KB_TOP)
    fastq_executable = "fastQValidator"

    fasta_validator_present = os.path.isfile(fasta_executable)

    # Search PATH for the FASTQ validator; stop at the first file with that
    # name and remember whether it is executable.
    fastq_validator_present = False
    fastq_validator_runnable = False
    for path in os.environ["PATH"].split(os.pathsep):
        exe_file = os.path.join(path.strip('"'), fastq_executable)
        if os.path.isfile(exe_file):
            fastq_validator_present = True
            fastq_validator_runnable = os.access(exe_file, os.X_OK)
            break

    # Missing tools are only warned about here; the run itself fails later.
    if not fasta_validator_present:
        logger.warning(
            "FASTA validator executable FastaValidator-1.0.jar could not be found."
        )

    if not fastq_validator_present:
        logger.warning(
            "FASTQ validator executable fastQValidator could not be found.")
    elif not fastq_validator_runnable:
        logger.warning(
            "FASTQ validator executable fastQValidator does not have execute permissions."
        )

    fasta_extensions = [".fa", ".fas", ".fasta", ".fna"]
    fastq_extensions = [".fq", ".fastq", ".fnq"]
    extensions = fasta_extensions + fastq_extensions

    checked = False
    validated = True
    for input_file_name in os.listdir(input_directory):
        logger.info(
            "Checking for SequenceReads file : {0}".format(input_file_name))

        filePath = os.path.abspath(
            os.path.join(input_directory, input_file_name))
        extension = os.path.splitext(input_file_name)[-1]

        if not os.path.isfile(filePath):
            logger.warning("Skipping directory {0}".format(input_file_name))
            continue
        elif extension not in extensions:
            logger.warning(
                "Unrecognized file type {}, skipping.".format(extension))
            continue

        logger.info(
            "Starting SequenceReads validation of {0}".format(input_file_name))

        if extension in fasta_extensions:
            # TODO This needs to be changed, this is really just a demo program for this library and not a serious tool
            arguments = [
                "java", "-classpath", fasta_executable, "FVTester", filePath
            ]
        else:
            logger.info("Checking FASTQ line count for errors.")
            with open(filePath, 'rb') as seqfile:
                line_number = sum(1 for _ in seqfile)
            logger.info("FASTQ line count check completed.")

            # FASTQ records are four lines each; a remainder suggests stray blank lines.
            if line_number % 4 > 0:
                logger.warning("Found extra lines, removing blank lines.")

                # Copy only non-blank lines; previously every line was copied,
                # so the "cleaned" file was identical to the input.
                with open(filePath, 'r') as seqfile, \
                        open(filePath + ".tmp", 'w') as out:
                    for line in seqfile:
                        if line.strip():
                            out.write(line)

                os.remove(filePath)
                os.rename(filePath + ".tmp", filePath)
                logger.warning("Blank lines removed from FASTQ.")

            arguments = [
                fastq_executable, "--file", filePath, "--maxErrors", "10"
            ]

            if check_interleavedPE(filePath) == 1:
                arguments.append("--disableSeqIDCheck")

        # Log the command with the short file name instead of the full path.
        logger.info("Running {}".format(" ".join(arguments).replace(
            filePath, input_file_name)))

        # Pass an argument list without a shell so file names containing quotes
        # or other shell metacharacters reach the tool unmodified.
        try:
            return_code = subprocess.call(arguments)
        except OSError as e:
            # A shell would have reported a missing executable through a
            # non-zero exit status; keep treating it as a validation failure.
            logger.error("Unable to run {0}: {1}".format(arguments[0], e))
            return_code = 1

        if return_code != 0:
            logger.error("Validation failed on {0}".format(input_file_name))
            validated = False
            break
        else:
            logger.info("Validation passed on {0}".format(input_file_name))
            checked = True

    if not validated:
        raise Exception("Validation failed!")
    elif not checked:
        raise Exception(
            "No files were found that had a valid fasta or fastq extension.")
    else:
        logger.info("Validation passed.")