def set_up_sra_cache_folder(temp_folder):
    """Set up the fastq-dump cache folder within the temp folder."""
    logging.info("Setting up fastq-dump cache within {}".format(temp_folder))
    for path in [
        os.path.expanduser("~/ncbi"),
        os.path.expanduser("~/ncbi/public")
    ]:
        if not os.path.exists(path):
            os.mkdir(path)

    if os.path.exists(os.path.expanduser("~/ncbi/public/sra")):
        shutil.rmtree(os.path.expanduser("~/ncbi/public/sra"))

    # Now make a folder within the temp folder
    temp_cache = os.path.join(temp_folder, "sra")
    assert not os.path.exists(temp_cache)
    os.mkdir(temp_cache)

    # Symlink it to ~/ncbi/public/sra/
    run_cmds([
        "ln", "-s", "-f",
        temp_cache,
        os.path.expanduser("~/ncbi/public/sra")
    ])

    assert os.path.exists(os.path.expanduser("~/ncbi/public/sra"))
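
# Illustrative sketch (assumed paths): after set_up_sra_cache_folder runs
# with temp_folder="/scratch/abc123", the layout is:
#
#     /scratch/abc123/sra          <- real cache directory on scratch
#     ~/ncbi/public/sra -> /scratch/abc123/sra
#
# Any .sra files cached by prefetch/fastq-dump therefore land on the
# (typically larger) scratch volume rather than in the home directory.
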
def quality_trim(fp_in, folder_out, min_qual, min_len=30):
    """Trim a FASTQ to a minimum quality score."""
    # The input must be an existing, uncompressed FASTQ
    assert os.path.exists(fp_in)
    assert not fp_in.endswith(".gz")
    assert os.path.exists(folder_out)
    assert isinstance(min_qual, int)

    fp_out = os.path.join(folder_out, fp_in.split("/")[-1])

    run_cmds([
        "fastq_quality_trimmer",
        "-Q", "33",
        "-t", str(min_qual),
        "-i", fp_in,
        "-o", fp_out,
        "-l", str(min_len),
        "-v"
    ])

    assert os.path.exists(fp_out)
    return fp_out
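
# Hedged usage sketch for quality_trim; the paths below are illustrative
# placeholders, and fastq_quality_trimmer (FASTX-Toolkit) must be on the
# $PATH for run_cmds to succeed:
#
#     trimmed_fp = quality_trim(
#         "/scratch/cleaned_reads/sample.fastq",  # uncompressed FASTQ
#         "/scratch/trimmed_reads",               # existing output folder
#         min_qual=30,                            # Phred quality cutoff
#     )
#     # trimmed_fp == "/scratch/trimmed_reads/sample.fastq"
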
def get_sra(accession, temp_folder):
    """Get the FASTQ for an SRA accession."""
    logging.info("Downloading {} from SRA".format(accession))

    local_path = os.path.join(temp_folder, accession + ".fastq")
    logging.info("Local path: {}".format(local_path))

    # Download via fastq-dump
    logging.info("Downloading via fastq-dump")
    run_cmds(["prefetch", accession])
    run_cmds([
        "fastq-dump",
        "--split-files",
        "--outdir", temp_folder,
        accession
    ])

    # Make sure that some files were created
    msg = "File could not be downloaded from SRA: {}".format(accession)
    assert any([
        fp.startswith(accession) and fp.endswith("fastq")
        for fp in os.listdir(temp_folder)
    ]), msg

    # Combine any multiple files that were found
    logging.info("Concatenating output files")
    with open(local_path + ".temp", "wt") as fo:
        cmd = "cat {}/{}*fastq".format(temp_folder, accession)
        cat = subprocess.Popen(cmd, shell=True, stdout=fo)
        cat.wait()

    # Remove the temp files
    for fp in os.listdir(temp_folder):
        if fp.startswith(accession) and fp.endswith("fastq"):
            fp = os.path.join(temp_folder, fp)
            logging.info("Removing {}".format(fp))
            os.unlink(fp)

    # Remove the cache file, if any
    # (expand the ~ so that os.path.exists can resolve the path)
    cache_fp = os.path.expanduser(
        "~/ncbi/public/sra/{}.sra".format(accession))
    if os.path.exists(cache_fp):
        logging.info("Removing {}".format(cache_fp))
        os.unlink(cache_fp)

    # Rename the concatenated file to its final path
    run_cmds(["mv", local_path + ".temp", local_path])

    # Return the path to the file
    logging.info("Done fetching " + accession)
    return local_path
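
# Hedged sketch of the expected flow; the accession is an illustrative
# placeholder:
#
#     fastq_fp = get_sra("SRR1234567", "/scratch/fetched_reads")
#     # fastq-dump --split-files may emit SRR1234567_1.fastq and
#     # SRR1234567_2.fastq; both are concatenated into a single
#     # /scratch/fetched_reads/SRR1234567.fastq, which is returned.
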
def get_reads_from_url(
    input_str,
    temp_folder,
    random_string=str(uuid.uuid4())[:8],  # NB: evaluated once, at import time; currently unused
    min_qual=None
):
    """Get a set of reads from a URL -- return the downloaded filepath."""
    # Fetched reads go into $temp_folder/fetched_reads/
    fetched_reads_folder = os.path.join(temp_folder, "fetched_reads")
    # Reads with cleaned headers go into $temp_folder/cleaned_reads/
    cleaned_reads_folder = os.path.join(temp_folder, "cleaned_reads")
    # Quality trimmed reads go into $temp_folder/trimmed_reads/
    trimmed_reads_folder = os.path.join(temp_folder, "trimmed_reads")
    for folder in [
        fetched_reads_folder,
        cleaned_reads_folder,
        trimmed_reads_folder
    ]:
        if not os.path.exists(folder):
            logging.info("Making new folder {}".format(folder))
            os.mkdir(folder)

    logging.info("Getting reads from {}".format(input_str))

    filename = input_str.split('/')[-1]
    local_path = os.path.join(fetched_reads_folder, filename)

    logging.info("Filename: " + filename)
    logging.info("Local path: " + local_path)

    # Treat anything without a recognized prefix as a local path
    if not input_str.startswith(('s3://', 'sra://', 'ftp://')):
        logging.info("Treating as local path")
        msg = "Input file does not exist ({})".format(input_str)
        assert os.path.exists(input_str), msg
        logging.info("Making a symlink to temporary folder")
        os.symlink(input_str, local_path)

    # Get files from AWS S3
    elif input_str.startswith('s3://'):
        logging.info("Getting reads from S3")
        run_cmds([
            'aws', 's3', 'cp', '--quiet',
            '--sse', 'AES256',
            input_str, fetched_reads_folder
        ])

    # Get files from an FTP server
    elif input_str.startswith('ftp://'):
        logging.info("Getting reads from FTP")
        run_cmds(['wget', '-P', fetched_reads_folder, input_str])

    # Get files from SRA
    elif input_str.startswith('sra://'):
        accession = filename
        logging.info("Getting reads from SRA: " + accession)
        local_path = get_sra(accession, fetched_reads_folder)

    else:
        # Unreachable given the checks above; kept as a safeguard
        raise Exception("Did not recognize prefix for input: " + input_str)

    # Clean up the FASTQ headers
    logging.info("Cleaning up FASTQ headers")
    cleaned_path = clean_fastq_headers(
        local_path,
        cleaned_reads_folder
    )
    logging.info("Made new cleaned FASTQ file: {}".format(cleaned_path))
    logging.info("Deleting old file: {}".format(local_path))
    os.unlink(local_path)

    if min_qual is None:
        return cleaned_path
    else:
        # Quality trim the FASTQ
        logging.info("Quality trimming the FASTQ (Q{})".format(min_qual))
        trimmed_path = quality_trim(
            cleaned_path,
            trimmed_reads_folder,
            min_qual
        )
        logging.info("Made new quality trimmed FASTQ: {}".format(trimmed_path))
        logging.info("Deleting old file: {}".format(cleaned_path))
        os.unlink(cleaned_path)
        return trimmed_path
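
# Hedged usage sketch for get_reads_from_url; the URLs and folders are
# illustrative placeholders:
#
#     fp = get_reads_from_url("s3://bucket/reads.fastq", "/scratch/tmp")
#     fp = get_reads_from_url("sra://SRR1234567", "/scratch/tmp", min_qual=30)
#
# With min_qual=None the header-cleaned FASTQ path is returned;
# otherwise the quality-trimmed FASTQ path is returned.
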
def align(self):
    """Align a set of reads with DIAMOND and run FAMLI."""
    parser = argparse.ArgumentParser(
        description="""Align a set of reads with DIAMOND,
        filter alignments with FAMLI, and return the results""")

    parser.add_argument("--input",
                        type=str,
                        required=True,
                        help="""Location for input file(s). Combine multiple files with +.
                        (Supported: sra://, s3://, or ftp://).""")
    parser.add_argument("--sample-name",
                        type=str,
                        required=True,
                        help="""Name of sample, sets output filename.""")
    parser.add_argument("--ref-db",
                        type=str,
                        required=True,
                        help="""Folder containing reference database.
                        (Supported: s3://, ftp://, or local path).""")
    parser.add_argument("--output-folder",
                        type=str,
                        required=True,
                        help="""Folder to place results.
                        (Supported: s3://, or local path).""")
    parser.add_argument("--min-score",
                        type=float,
                        default=20,
                        help="Minimum alignment score to report.")
    parser.add_argument("--blocks",
                        type=int,
                        default=5,
                        help="""Number of blocks used when aligning.
                        Value relates to the amount of memory used.
                        Roughly 6Gb RAM used by DIAMOND per block.""")
    parser.add_argument("--query-gencode",
                        type=int,
                        default=11,
                        help="Genetic code used to translate nucleotides.")
    parser.add_argument("--threads",
                        type=int,
                        default=16,
                        help="Number of threads to use aligning.")
    parser.add_argument("--min-qual",
                        type=int,
                        default=None,
                        help="Trim reads to a minimum Q score.")
    parser.add_argument("--temp-folder",
                        type=str,
                        default='/share',
                        help="Folder used for temporary files.")
    parser.add_argument("--batchsize",
                        type=int,
                        help="""Number of reads to process at a time.""")
    parser.add_argument("--delete-all-files-in-temp-folder",
                        action="store_true",
                        help="""If flag is set, DELETE ALL OF THE FILES
                        IN THE TEMP FOLDER before starting.""")

    args = parser.parse_args(sys.argv[2:])

    # Make sure that there are no commas or whitespaces in the input
    input_str = args.input
    assert ' ' not in input_str, input_str
    assert ',' not in input_str, input_str

    # Make a temporary folder for all files to be placed in
    temp_folder = os.path.join(args.temp_folder, str(uuid.uuid4())[:8])
    assert not os.path.exists(temp_folder)
    os.mkdir(temp_folder)

    # Set up logging
    log_fp = os.path.join(temp_folder, "log.txt")
    logFormatter = logging.Formatter(
        '%(asctime)s %(levelname)-8s [FAMLI] %(message)s')
    rootLogger = logging.getLogger()
    rootLogger.setLevel(logging.INFO)

    # Write to file
    fileHandler = logging.FileHandler(log_fp)
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)
    # Also write to STDOUT
    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)

    # Delete the files in scratch, if specified
    if args.delete_all_files_in_temp_folder:
        logging.info("Deleting all files in temp folder " + args.temp_folder)
        for fp in os.listdir(args.temp_folder):
            fp = os.path.join(args.temp_folder, fp)
            # Skip the temp folder (and its log file) just created for this run
            if fp == temp_folder:
                continue
            logging.info("Deleting " + fp)
            # Use rmtree for directories and unlink for regular files
            if os.path.isdir(fp):
                shutil.rmtree(fp)
            else:
                os.unlink(fp)
        logging.info("Done deleting files in temporary folder")

    # Check to see if DIAMOND is available
    logging.info("Checking for a working copy of DIAMOND")
    run_cmds(["diamond", "--version"])

    # Get the reference database
    try:
        db_fp = get_reference_database(args.ref_db, temp_folder)
    except:
        exit_and_clean_up(temp_folder)

    # Set up the NCBI fastq-dump cache folder within the temp folder
    set_up_sra_cache_folder(temp_folder)

    logging.info("Reference database: " + db_fp)

    # Align the input data and calculate the overall abundance
    # Keep track of the time elapsed to process this sample
    start_time = time.time()

    logging.info("Processing input argument: " + input_str)

    # Multiple input reads may be separated with a '+'
    input_str = input_str.split("+")
    # Make sure that they are all unique arguments
    assert len(input_str) == len(set(input_str)), "Duplicate arguments"
    # Make sure that the filenames are also all unique
    assert len(input_str) == len(set([
        s.split('/')[-1] for s in input_str
    ])), "Duplicate filenames"

    # Capture each command in a try statement
    # Get the input reads
    read_fps = []
    for s in input_str:
        logging.info("Fetching {}".format(s))
        try:
            read_fps.append(
                get_reads_from_url(s, temp_folder, min_qual=args.min_qual))
        except:
            exit_and_clean_up(temp_folder)

    # Combine the files into a single FASTQ
    read_fp = os.path.join(temp_folder, "input.fastq")
    combine_fastqs(read_fps, read_fp)

    # Run the alignment
    try:
        align_fp = align_reads(
            read_fp,       # FASTQ file path
            db_fp,         # Local path to DB
            temp_folder,   # Folder for results
            query_gencode=args.query_gencode,
            threads=args.threads,
            min_score=args.min_score,
            blocks=args.blocks,
        )
    except:
        exit_and_clean_up(temp_folder)

    # Process the alignments, reassigning multi-mapped reads
    try:
        with open(align_fp, "rt") as align_handle:
            aligned_reads, abund, alignments = parse_alignment(
                align_handle,
                batchsize=args.batchsize,
            )
    except:
        exit_and_clean_up(temp_folder)

    # Calculate the number of deduplicated reads
    deduplicated_reads = sum([d["nreads"] for d in abund])

    # Name the output file based on the sample name, if specified,
    # otherwise fall back to the name of the first input file
    # (".json.gz" is ultimately appended to this prefix)
    if args.sample_name is not None:
        output_prefix = args.sample_name
    else:
        output_prefix = input_str[0].split("/")[-1]
    logging.info(
        "Using sample name {} for output prefix".format(output_prefix))

    # Count the total number of reads
    logging.info("Counting the total number of reads")
    n_reads = count_fastq_reads(read_fp)
    logging.info("Reads in input file: {:,}".format(n_reads))

    # Read in the logs
    logging.info("Reading in the logs")
    with open(log_fp, 'rt') as f:
        logs = f.readlines()

    # Wrap up all of the results into a single JSON
    # and write it to the output folder
    output = {
        "input_path": "+".join(input_str),
        "input": output_prefix,
        "sample": args.sample_name,
        "output_folder": args.output_folder,
        "logs": logs,
        "ref_db": db_fp,
        "ref_db_url": args.ref_db,
        "results": abund,
        "total_reads": n_reads,
        "aligned_reads": aligned_reads,
        "deduplicated_reads": deduplicated_reads,
        "time_elapsed": time.time() - start_time,
        "params": {
            "batchsize": args.batchsize,
            "min_score": args.min_score,
            "blocks": args.blocks,
            "query_gencode": args.query_gencode,
            "threads": args.threads,
            "min_qual": args.min_qual
        }
    }

    return_results(output, output_prefix, args.output_folder, temp_folder)

    # Delete any files that were created for this sample
    logging.info("Removing temporary folder: " + temp_folder)
    shutil.rmtree(temp_folder)

    # Stop logging
    logging.info("Done")
    logging.shutdown()