Example #1
def set_up_sra_cache_folder(temp_folder):
    """Set up the fastq-dump cache folder within the temp folder."""
    logging.info("Setting up fastq-dump cache within {}".format(temp_folder))
    for path in [
        os.path.expanduser("~/ncbi"),
        os.path.expanduser("~/ncbi/public")
    ]:
        if not os.path.exists(path):
            os.mkdir(path)

    if os.path.exists(os.path.expanduser("~/ncbi/public/sra")):
        shutil.rmtree(os.path.expanduser("~/ncbi/public/sra"))

    # Now make a folder within the temp folder
    temp_cache = os.path.join(temp_folder, "sra")
    assert not os.path.exists(temp_cache)
    os.mkdir(temp_cache)

    # Symlink it to ~/ncbi/public/sra/
    run_cmds(["ln", "-s", "-f", temp_cache, os.path.expanduser("~/ncbi/public/sra")])

    assert os.path.exists(os.path.expanduser("~/ncbi/public/sra"))
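For context, a minimal usage sketch. The snippets on this page omit their module-level imports (os, logging, shutil) and share a run_cmds() subprocess helper defined elsewhere in the source module; the module name famli_helpers below is a hypothetical stand-in.

import tempfile

from famli_helpers import set_up_sra_cache_folder  # hypothetical module name

# Redirect the fastq-dump cache onto the scratch volume so downloaded
# .sra files do not fill up the (often small) home partition
temp_folder = tempfile.mkdtemp()
set_up_sra_cache_folder(temp_folder)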
Example #2
def quality_trim(fp_in, folder_out, min_qual, min_len=30):
    """Trim a FASTQ to a minimum quality score."""
    assert os.path.exists(fp_in)
    assert not fp_in.endswith(".gz")
    assert os.path.exists(folder_out)
    assert isinstance(min_qual, int)

    fp_out = os.path.join(folder_out, os.path.basename(fp_in))

    run_cmds([
        "fastq_quality_trimmer",
        "-Q", "33",
        "-t", str(min_qual),
        "-i", fp_in,
        "-o", fp_out,
        "-l", str(min_len),
        "-v"
    ])

    assert os.path.exists(fp_out)

    return fp_out
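A hedged usage sketch: fastq_quality_trimmer comes from the FASTX-Toolkit and must be on the PATH, and the input path below is illustrative. As the assertions above require, the input must be an existing, uncompressed FASTQ.

import os
import tempfile

from famli_helpers import quality_trim  # hypothetical module name

out_folder = tempfile.mkdtemp()
# Trim to Q20, dropping reads shorter than the default 30 bp
trimmed_fp = quality_trim("/scratch/sample.fastq", out_folder, min_qual=20)
assert os.path.exists(trimmed_fp)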
Example #3
def get_sra(accession, temp_folder):
    """Get the FASTQ for an SRA accession."""
    logging.info("Downloading {} from SRA".format(accession))

    local_path = os.path.join(temp_folder, accession + ".fastq")
    logging.info("Local path: {}".format(local_path))

    # Download via fastq-dump
    logging.info("Downloading via fastq-dump")
    run_cmds([
        "prefetch", accession
    ])
    run_cmds([
        "fastq-dump",
        "--split-files",
        "--outdir",
        temp_folder, accession
    ])

    # Make sure that some files were created
    msg = "File could not be downloaded from SRA: {}".format(accession)
    assert any([
        fp.startswith(accession) and fp.endswith("fastq")
        for fp in os.listdir(temp_folder)
    ]), msg

    # Combine any multiple files that were found
    logging.info("Concatenating output files")
    with open(local_path + ".temp", "wt") as fo:
        cmd = "cat {}/{}*fastq".format(temp_folder, accession)
        cat = subprocess.Popen(cmd, shell=True, stdout=fo)
        cat.wait()

    # Remove the temp files
    for fp in os.listdir(temp_folder):
        if fp.startswith(accession) and fp.endswith("fastq"):
            fp = os.path.join(temp_folder, fp)
            logging.info("Removing {}".format(fp))
            os.unlink(fp)

    # Remove the cache file, if any (note: os.path.exists does not
    # expand "~", so expanduser is required here)
    cache_fp = os.path.expanduser("~/ncbi/public/sra/{}.sra".format(accession))
    if os.path.exists(cache_fp):
        logging.info("Removing {}".format(cache_fp))
        os.unlink(cache_fp)

    # Move the concatenated file into its final location
    run_cmds(["mv", local_path + ".temp", local_path])

    # Return the path to the file
    logging.info("Done fetching " + accession)
    return local_path
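A sketch of fetching one accession, assuming the SRA Toolkit (prefetch and fastq-dump) is installed and the cache folder has already been redirected as in Example #1; the accession is arbitrary.

import tempfile

from famli_helpers import get_sra, set_up_sra_cache_folder  # hypothetical module

temp_folder = tempfile.mkdtemp()
set_up_sra_cache_folder(temp_folder)

# Download, concatenate the per-mate files, and get back one FASTQ path
fastq_fp = get_sra("SRR000001", temp_folder)  # arbitrary example accession
print("Downloaded reads to", fastq_fp)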
Example #4
def get_reads_from_url(
    input_str,
    temp_folder,
    random_string=str(uuid.uuid4())[:8],  # note: default is evaluated once, at definition time
    min_qual=None
):
    """Get a set of reads from a URL -- return the downloaded filepath."""
    # Fetch reads into $temp_folder/fetched_reads/
    fetched_reads_folder = os.path.join(temp_folder, "fetched_reads")

    # Reads with cleaned headers go into $temp_folder/cleaned_reads/
    cleaned_reads_folder = os.path.join(temp_folder, "cleaned_reads")

    # Quality trimmed reads go into $temp_folder/trimmed_reads/
    trimmed_reads_folder = os.path.join(temp_folder, "trimmed_reads")

    for folder in [
        fetched_reads_folder, cleaned_reads_folder, trimmed_reads_folder
    ]:
        if not os.path.exists(folder):
            logging.info("Making new folder {}".format(folder))
            os.mkdir(folder)

    logging.info("Getting reads from {}".format(input_str))

    filename = input_str.split('/')[-1]
    local_path = os.path.join(fetched_reads_folder, filename)

    logging.info("Filename: " + filename)
    logging.info("Local path: " + local_path)

    if not input_str.startswith(('s3://', 'sra://', 'ftp://')):
        logging.info("Treating as local path")
        msg = "Input file does not exist ({})".format(input_str)
        assert os.path.exists(input_str), msg

        logging.info("Making a symlink to temporary folder")
        os.symlink(input_str, local_path)

    # Get files from AWS S3
    elif input_str.startswith('s3://'):
        logging.info("Getting reads from S3")
        run_cmds([
            'aws', 's3', 'cp', '--quiet', '--sse', 'AES256',
            input_str, fetched_reads_folder
        ])

    # Get files from an FTP server
    elif input_str.startswith('ftp://'):
        logging.info("Getting reads from FTP")
        run_cmds(['wget', '-P', fetched_reads_folder, input_str])

    # Get files from SRA
    elif input_str.startswith('sra://'):
        accession = filename
        logging.info("Getting reads from SRA: " + accession)
        local_path = get_sra(accession, fetched_reads_folder)

    else:
        raise Exception("Did not recognize prefix for input: " + input_str)

    # Clean up the FASTQ headers
    logging.info("Cleaning up FASTQ headers")
    cleaned_path = clean_fastq_headers(
        local_path,
        cleaned_reads_folder
    )
    logging.info("Made new cleaned FASTQ file: {}".format(cleaned_path))
    logging.info("Deleting old file: {}".format(local_path))
    os.unlink(local_path)

    if min_qual is None:
        return cleaned_path
    else:
        # Quality trim the FASTQ
        logging.info("Quality trimming the FASTQ (Q{})".format(min_qual))
        trimmed_path = quality_trim(
            cleaned_path,
            trimmed_reads_folder,
            min_qual
        )
        logging.info("Made new quality trimmed FASTQ: {}".format(trimmed_path))
        logging.info("Deleting old file: {}".format(cleaned_path))
        os.unlink(cleaned_path)
        return trimmed_path
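To illustrate the prefix-based routing above, a sketch under the same hypothetical module name; the bucket, paths, and accession are placeholders, and the S3 branch additionally requires the AWS CLI with credentials.

import tempfile

from famli_helpers import get_reads_from_url  # hypothetical module name

temp_folder = tempfile.mkdtemp()

# Local file: symlinked in, headers cleaned, then quality trimmed to Q20
fp = get_reads_from_url("/data/sample.fastq", temp_folder, min_qual=20)

# S3 object: fetched with `aws s3 cp`
fp = get_reads_from_url("s3://example-bucket/sample.fastq", temp_folder)

# SRA accession: fetched via prefetch / fastq-dump
fp = get_reads_from_url("sra://SRR000001", temp_folder)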
Example #5
    def align(self):
        """Align a set of reads with DIAMOND and run FAMLI."""
        parser = argparse.ArgumentParser(
            description="""Align a set of reads with DIAMOND, filter
                alignments with FAMLI, and return the results.""")

        parser.add_argument(
            "--input",
            type=str,
            required=True,
            help="""Location for input file(s). Combine multiple files with +.
                                    (Supported: sra://, s3://, or ftp://).""")
        parser.add_argument("--sample-name",
                            type=str,
                            required=True,
                            help="""Name of sample, sets output filename.""")
        parser.add_argument("--ref-db",
                            type=str,
                            required=True,
                            help="""Folder containing reference database.
                                    (Supported: s3://, ftp://, or local path).
                                    """)
        parser.add_argument("--output-folder",
                            type=str,
                            required=True,
                            help="""Folder to place results.
                                    (Supported: s3://, or local path).""")
        parser.add_argument("--min-score",
                            type=float,
                            default=20,
                            help="Minimum alignment score to report.")
        parser.add_argument("--blocks",
                            type=int,
                            default=5,
                            help="""Number of blocks used when aligning.
                                  Value relates to the amount of memory used.
                                  Roughly 6Gb RAM used by DIAMOND per block.
                                  """)
        parser.add_argument("--query-gencode",
                            type=int,
                            default=11,
                            help="Genetic code used to translate nucleotides.")
        parser.add_argument("--threads",
                            type=int,
                            default=16,
                            help="Number of threads to use aligning.")
        parser.add_argument("--min-qual",
                            type=int,
                            default=None,
                            help="Trim reads to a minimum Q score.")
        parser.add_argument("--temp-folder",
                            type=str,
                            default='/share',
                            help="Folder used for temporary files.")
        parser.add_argument("--batchsize",
                            type=int,
                            help="""Number of reads to process at a time.""")
        parser.add_argument(
            "--delete-all-files-in-temp-folder",
            action="store_true",
            help="""If this flag is set, DELETE ALL FILES IN THE TEMP FOLDER
            before starting.""")

        args = parser.parse_args(sys.argv[2:])

        # Make sure that there are no commas or whitespaces in the input
        input_str = args.input
        assert ' ' not in input_str, input_str
        assert ',' not in input_str, input_str

        # Make a temporary folder for all files to be placed in
        temp_folder = os.path.join(args.temp_folder, str(uuid.uuid4())[:8])
        assert not os.path.exists(temp_folder)
        os.mkdir(temp_folder)

        # Set up logging
        log_fp = os.path.join(temp_folder, "log.txt")
        logFormatter = logging.Formatter(
            '%(asctime)s %(levelname)-8s [FAMLI] %(message)s')
        rootLogger = logging.getLogger()
        rootLogger.setLevel(logging.INFO)

        # Write to file
        fileHandler = logging.FileHandler(log_fp)
        fileHandler.setFormatter(logFormatter)
        rootLogger.addHandler(fileHandler)
        # Also write to STDOUT
        consoleHandler = logging.StreamHandler()
        consoleHandler.setFormatter(logFormatter)
        rootLogger.addHandler(consoleHandler)

        # Delete the files in scratch, if specified
        if args.delete_all_files_in_temp_folder:
            logging.info("Deleting all files in temp folder " +
                         args.temp_folder)
            for fp in os.listdir(args.temp_folder):
                fp = os.path.join(args.temp_folder, fp)
                # Skip the working folder (and its log) just created above
                if fp == temp_folder:
                    continue
                logging.info("Deleting " + fp)
                # shutil.rmtree only handles directories; unlink plain files
                if os.path.isdir(fp):
                    shutil.rmtree(fp)
                else:
                    os.unlink(fp)
            logging.info("Done deleting files in temporary folder")

        # Check to see if DIAMOND is available
        logging.info("Checking for a working copy of DIAMOND")
        run_cmds(["diamond", "--version"])

        # Get the reference database
        try:
            db_fp = get_reference_database(args.ref_db, temp_folder)
        except Exception:
            exit_and_clean_up(temp_folder)

        # Set up the NCBI fastq-dump cache folder within the temp folder
        set_up_sra_cache_folder(temp_folder)

        logging.info("Reference database: " + db_fp)

        # Align the input data and calculate the overall abundance

        # Keep track of the time elapsed to process this sample
        start_time = time.time()

        logging.info("Processing input argument: " + input_str)

        # Multiple input reads may be separated with a '+'
        input_str = input_str.split("+")
        # Make sure that they are all unique arguments
        assert len(input_str) == len(set(input_str)), "Duplicate arguments"
        # Make sure that the filenames are also all unique
        filenames = [s.split('/')[-1] for s in input_str]
        assert len(filenames) == len(set(filenames)), "Duplicate filenames"

        # Capture each command in a try statement
        # Get the input reads
        read_fps = []
        for s in input_str:
            logging.info("Fetching {}".format(s))
            try:
                read_fps.append(
                    get_reads_from_url(s, temp_folder, min_qual=args.min_qual))
            except Exception:
                exit_and_clean_up(temp_folder)

        # Combine the files into a single FASTQ
        read_fp = os.path.join(temp_folder, "input.fastq")
        combine_fastqs(read_fps, read_fp)

        # Run the alignment
        try:
            align_fp = align_reads(
                read_fp,  # FASTQ file path
                db_fp,  # Local path to DB
                temp_folder,  # Folder for results
                query_gencode=args.query_gencode,
                threads=args.threads,
                min_score=args.min_score,
                blocks=args.blocks,
            )
        except Exception:
            exit_and_clean_up(temp_folder)

        # Process the alignments, reassigning multi-mapped reads
        try:
            with open(align_fp, "rt") as align_handle:
                aligned_reads, abund, alignments = parse_alignment(
                    align_handle,
                    batchsize=args.batchsize,
                )
        except Exception:
            exit_and_clean_up(temp_folder)

        # Calculate the number of deduplicated reads
        deduplicated_reads = sum([d["nreads"] for d in abund])

        # Name the output prefix from the sample name if provided,
        # falling back to the input filename
        # (".json.gz" is ultimately appended to this prefix)
        if args.sample_name is not None:
            output_prefix = args.sample_name
        else:
            output_prefix = input_str[0].split("/")[-1]
        logging.info(
            "Using sample name {} for output prefix".format(output_prefix))

        # Count the total number of reads
        logging.info("Counting the total number of reads")
        n_reads = count_fastq_reads(read_fp)
        logging.info("Reads in input file: {:,}".format(n_reads))

        # Read in the logs
        logging.info("Reading in the logs")
        with open(log_fp, 'rt') as f:
            logs = f.readlines()

        # Wrap up all of the results into a single JSON
        # and write it to the output folder
        output = {
            "input_path": "+".join(input_str),
            "input": output_prefix,
            "sample": args.sample_name,
            "output_folder": args.output_folder,
            "logs": logs,
            "ref_db": db_fp,
            "ref_db_url": args.ref_db,
            "results": abund,
            "total_reads": n_reads,
            "aligned_reads": aligned_reads,
            "deduplicated_reads": deduplicated_reads,
            "time_elapsed": time.time() - start_time,
            "params": {
                "batchsize": args.batchsize,
                "min_score": args.min_score,
                "blocks": args.blocks,
                "query_gencode": args.query_gencode,
                "threads": args.threads,
                "min_qual": args.min_qual
            }
        }
        return_results(output, output_prefix, args.output_folder, temp_folder)

        # Delete any files that were created for this sample
        logging.info("Removing temporary folder: " + temp_folder)
        shutil.rmtree(temp_folder)

        # Stop logging
        logging.info("Done")
        logging.shutdown()
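Since align() parses sys.argv[2:], it is evidently a method on a git-style subcommand dispatcher. A minimal sketch of that pattern, with the class name FAMLI assumed rather than taken from the source:

import sys

class FAMLI:
    """Dispatch sys.argv[1] to a method; align() above would live here."""

    def __init__(self):
        # The subcommand's own parser consumes sys.argv[2:], which is
        # exactly what align() does with parse_args(sys.argv[2:])
        if len(sys.argv) < 2 or not hasattr(self, sys.argv[1]):
            sys.exit("Usage: famli <command> [<args>], e.g. famli align --input ...")
        getattr(self, sys.argv[1])()

    # def align(self): ...  # as defined in Example #5

if __name__ == "__main__":
    FAMLI()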