Code Example #1
import argparse
import multiprocessing as mp
from shutil import copyfile

from Bio import SeqIO

# Project helpers such as make_directory, write_fasta, and run_tasr are defined
# elsewhere in the pipeline (not shown in this excerpt).


def main():

    parser = argparse.ArgumentParser(
        description=
        'Script to run TASR in a multithreaded fashion on individual sequences.'
    )
    parser.add_argument('-t',
                        type=str,
                        required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument('-d',
                        type=int,
                        required=True,
                        help='Number of threads to use.')
    parser.add_argument(
        '-r',
        type=str,
        required=True,
        help=
        'Path to a file with paths to reads to be used in assembly on each line.'
    )
    parser.add_argument(
        '-s',
        type=str,
        required=True,
        help=
        'Path to a file with the sequences which should be the targets for assembly.'
    )
    parser.add_argument('-o',
                        type=str,
                        required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    manager = mp.Manager()
    pool = mp.Pool(args.d - 1)
    jobs = []

    for id in seqs:

        cur_dir = "{}/{}".format(args.o, id)
        cur_fsa = "{}/seq.fasta".format(cur_dir)
        cur_rds = "{}/reads.txt".format(cur_dir)

        make_directory(cur_dir)
        write_fasta(cur_fsa, id, str(seqs[id].seq))
        copyfile(args.r, cur_rds)

        jobs.append(pool.apply_async(run_tasr, (args.t, cur_fsa, cur_rds)))

    for job in jobs:  # Get all the returns from the apply_async function
        job.get()

    pool.close()  # Tell the pool it's done getting new jobs
    pool.join()  # Make sure these new jobs are all finished
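
The helpers called above (make_directory, write_fasta, run_tasr) come from elsewhere in the pipeline and are not shown in these excerpts. A minimal sketch of what they might look like, assuming make_directory wraps os.makedirs, write_fasta writes a single FASTA record, and run_tasr shells out to the TASR executable with the same flags used in Code Example #4:

import errno
import os
import subprocess


def make_directory(path):
    # Create the directory, ignoring the error if it already exists
    # (same errno.EEXIST pattern used in Code Example #6).
    try:
        os.makedirs(path)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise


def write_fasta(outfile, header, sequence):
    # Write a single FASTA record to outfile.
    with open(outfile, 'w') as out:
        out.write(">{0}\n{1}\n".format(header, sequence))


def run_tasr(tasr_exe, seq_file, reads_file):
    # Run TASR on one target sequence (flags mirror Code Example #4).
    command = "{0} -s {1} -f {2} -w 1 -u 1 -c 1".format(tasr_exe, seq_file, reads_file)
    subprocess.call(command.split())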
Code Example #2
import argparse

# Project helpers such as make_directory and parse_gff3 are defined elsewhere
# in the pipeline (not shown in this excerpt).


def main():

    parser = argparse.ArgumentParser(
        description=
        'Script to map alleles across GFF3 file. Read the top of the file for more details.'
    )
    parser.add_argument('--ea_input',
                        '-eai',
                        type=str,
                        required=True,
                        help='Path to a TSV list for references and isolates.')
    parser.add_argument(
        '--insert',
        '-i',
        type=int,
        required=False,
        default=0,
        help='Insert size from SRA for the reads that will be used as input.')
    parser.add_argument(
        '--gene_or_exon',
        '-ge',
        type=str,
        required=True,
        help='Either "gene" or "exon" for which level of sequences to pull.')
    parser.add_argument('--out_dir',
                        '-o',
                        type=str,
                        required=False,
                        default='.',
                        help='Directory for where the output should go.')
    args = parser.parse_args()

    make_directory(args.out_dir)

    # dictionary where the key is the ID and the value is a list for ref/loc/coords
    allele_map = {}

    # Iterate over each reference/isolate
    with open(args.ea_input, 'r') as i:
        for entry in i:
            entry = entry.rstrip()
            vals = entry.split('\t')
            type = vals[0]
            gff3 = vals[1]
            name = vals[3]

            # Regardless of reference or isolate, all should be mapping to the same name
            # designated by the reference.
            allele_map = parse_gff3(gff3, allele_map, type, name, args.insert,
                                    args.gene_or_exon, args.out_dir)

    # Iterate over the final hash of lists and print out a TSV
    out = "ea_map.tsv"
    with open(out, 'w') as o:
        for key, value in allele_map.items():
            vals = '\t'.join(value)
            line = "{0}\t{1}\n".format(key, vals)
            o.write(line)
Code Example #3
import argparse
import os

from Bio import SeqIO

# Project helpers such as make_directory, write_fasta, build_sequence_map, and
# run_needle are defined elsewhere in the pipeline (not shown in this excerpt).


def main():

    parser = argparse.ArgumentParser(description='Script to perform needle alignment on like sequences across FASTA files.')
    parser.add_argument('-f', type=str, required=True, help='Two paths to FASTA files split by a comma.')
    parser.add_argument('-n', type=str, required=True, help='Path to install directory of EMBOSS needle executable (e.g. /path/to/packages/emboss/bin/needle).')
    parser.add_argument('-o', type=str, required=True, help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    first_seqs = SeqIO.to_dict(SeqIO.parse(args.f.split(',')[0],"fasta"))
    second_seqs = SeqIO.to_dict(SeqIO.parse(args.f.split(',')[1],"fasta"))

    first_map = build_sequence_map(first_seqs.keys())
    second_map = build_sequence_map(second_seqs.keys())

    for key in first_map:
        if key in second_map: 

            cur_key_dir = "{}/{}".format(args.o,key)

            make_directory(cur_key_dir)

            # one of these lists should be of size one
            for entry1_id in first_map[key]:
                for entry2_id in second_map[key]:

                    entry1 = first_seqs[entry1_id] # get the Seq object
                    entry2 = second_seqs[entry2_id]

                    entry2.id = entry2.id.replace('|','.')

                    entry1_file = "{}/{}.fsa".format(cur_key_dir,entry1.id)
                    entry2_file = "{}/{}.fsa".format(cur_key_dir,entry2.id)

                    if not os.path.isfile(entry1_file):
                        write_fasta(entry1_file,entry1.id,str(entry1.seq))
                    if not os.path.isfile(entry2_file):
                        write_fasta(entry2_file,entry2.id,str(entry2.seq))

                    run_needle(
                        args.n,
                        entry1_file,
                        entry2_file,
                        "{}/{}_WITH_{}.align.txt".format(cur_key_dir,entry1.id,entry2.id)
                    )

                    os.remove(entry1_file) # sequences already stored in inputs
                    os.remove(entry2_file)
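
run_needle is likewise a project helper. A minimal sketch, assuming it simply shells out to the EMBOSS needle executable with its standard command-line options and default gap penalties:

import subprocess


def run_needle(needle_exe, a_file, b_file, out_file):
    # Global pairwise alignment of the two FASTA files with EMBOSS needle;
    # the gap penalties below are the EMBOSS defaults.
    subprocess.call([
        needle_exe,
        "-asequence", a_file,
        "-bsequence", b_file,
        "-gapopen", "10.0",
        "-gapextend", "0.5",
        "-outfile", out_file,
    ])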
Code Example #4
import argparse
import subprocess
from shutil import copyfile

from Bio import SeqIO

# Project helpers such as make_directory and write_fasta are defined elsewhere
# in the pipeline (not shown in this excerpt).


def main():

    parser = argparse.ArgumentParser(
        description=
        'Script to run TASR in an iterative fashion on individual sequences.')
    parser.add_argument('-t',
                        type=str,
                        required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument(
        '-r',
        type=str,
        required=True,
        help=
        'Path to a file with paths to reads to be used in assembly on each line.'
    )
    parser.add_argument(
        '-s',
        type=str,
        required=True,
        help=
        'Path to a file with the sequences which should be the targets for assembly.'
    )
    parser.add_argument('-o',
                        type=str,
                        required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    for id in seqs:

        cur_dir = "{}/{}".format(args.o, id)
        cur_fsa = "{}/seq.fasta".format(cur_dir)
        cur_rds = "{}/reads.txt".format(cur_dir)

        make_directory(cur_dir)
        write_fasta(cur_fsa, id, str(seqs[id].seq))
        copyfile(args.r, cur_rds)

        command = ("{} -s {} -f {} -w 1 -u 1 -c 1".format(
            args.t, cur_fsa, cur_rds))

        subprocess.call(command.split())
Code Example #5
import argparse
import multiprocessing as mp
from collections import defaultdict

from Bio import SeqIO

# Project helpers such as make_directory, listener, and worker are defined
# elsewhere in the pipeline (not shown in this excerpt).


def main():

    parser = argparse.ArgumentParser(
        description=
        'Script to generate EMBOSS Needle alignments given output from format_for_assembly.py.'
    )
    parser.add_argument(
        '--ea_map',
        '-eam',
        type=str,
        required=True,
        help='Path to ea_map.tsv output from extract_alleles.py.')
    parser.add_argument(
        '--assmb_map',
        '-am',
        type=str,
        required=True,
        help=
        'Path to *map.tsv output from format_for_assembly.py or assembly_verdict.py.'
    )
    parser.add_argument('--cpus',
                        '-c',
                        type=int,
                        required=True,
                        help='Number of cores to use.')
    parser.add_argument(
        '--original_fsa',
        '-of',
        type=str,
        required=True,
        help='Path to where the unbuffered FASTA from extract_sequences.py is.'
    )
    parser.add_argument(
        '--min_align_len',
        '-minl',
        type=float,
        required=False,
        default=1.0,
        help=
        'Optional minimum length ratio of an assembled sequence that should be aligned to. For instance, enter .1 to skip constructed sequences shorter than 10%% of the original sequence length. Default 1.0.'
    )
    parser.add_argument(
        '--max_align_len',
        '-maxl',
        type=int,
        required=False,
        default=75000,
        help=
        'Optional maximum length of an assembled sequence that should be aligned to. This is an integer, not a ratio like the min length. Useful to prevent OOM.'
    )
    parser.add_argument(
        '--assmb_path',
        '-asp',
        type=str,
        required=True,
        help=
        'Path to the directory preceding all the ref directories (e.g. for "/path/to/ref123" put "/path/to" as the input).'
    )
    parser.add_argument(
        '--assmb_type',
        '-at',
        type=str,
        required=True,
        help=
        'Either "SPAdes" or "HGA". Determines how many assembled sequences are aligned to.'
    )
    parser.add_argument(
        '--priority',
        '-p',
        type=str,
        required=False,
        default="",
        help=
        'If given, align solely to sequences with this prefix (e.g. for XYZ.11203981.1, enter "XYZ"). Useful when trying to reconstruct a particular sequence.'
    )
    parser.add_argument(
        '--align_path',
        '-alp',
        type=str,
        required=True,
        help='Path to output directory for all these alignments.')
    parser.add_argument(
        '--emboss_tool',
        '-e',
        type=str,
        required=True,
        help=
        'Path to install directory of EMBOSS needle/water executable (e.g. /path/to/packages/emboss/bin/[needle|water]).'
    )
    args = parser.parse_args()

    # First, extract the sequences from the reference file and
    # *STORE THEM IN MEMORY* (be mindful of how large the reference genome is).
    # We need this to generate small FASTA files for Needle alignment.
    seq_dict = SeqIO.to_dict(SeqIO.parse(args.original_fsa, "fasta"))

    # In order to access these seqs efficiently, rebuild the DS created
    # in extract_alleles.py. This is a dictionary where the key is the
    # shared locus and the value is a list of all the mapped alleles.
    ref_dict = defaultdict(list)
    with open(args.ea_map, 'r') as loc_map:

        for line in loc_map:
            line = line.rstrip()
            ele = line.split('\t')
            locus = ele[0]

            # Need to handle the case where the reference locus is
            # split into multiple like ABC123.1,ABC123.2,etc.
            if '.' in locus:
                split_locus = locus.split('.')
                locus = split_locus[0]

            for j in range(1, len(ele)):
                allele_info = ele[j].split('|')
                allele = allele_info[4]
                ref_dict[locus].append(allele)

    manager = mp.Manager()
    q = manager.Queue()
    pool = mp.Pool(args.cpus)

    pool.apply_async(listener, (q, args.align_path))

    min_len = args.min_align_len
    max_len = args.max_align_len

    # Build a jobs array to make sure these all finish.
    jobs = []

    # Now that we can easily extract the sequences for alignment, iterate over
    # the directory name map file and perform alignments.
    with open(args.assmb_map, 'r') as dir_map:

        for line in dir_map:
            line = line.rstrip()
            ele = line.split('\t')
            locus = ele[0]  # reference/locus that maps to directory number
            loc_dir = ele[1]  # the directory number from assembly for grid submission
            out_dir = "{0}/{1}".format(args.align_path, locus)  # alignment output goes here
            make_directory(out_dir)

            # Split out the contigs if more than one is present and have to do
            # alignment of all refs to all contigs.
            contigs = ""
            if args.assmb_type == "SPAdes":
                contigs = "{0}/{1}/contigs.fasta".format(
                    args.assmb_path, loc_dir)
                jobs.append(
                    pool.apply_async(
                        worker, (locus, contigs, ref_dict[locus], seq_dict,
                                 out_dir, min_len, max_len, q, args.assmb_type,
                                 args.priority, args.emboss_tool)))
            else:
                contigs = "{0}/{1}/f_Scaffold.fasta".format(
                    args.assmb_path, loc_dir)
                jobs.append(
                    pool.apply_async(
                        worker, (locus, contigs, ref_dict[locus], seq_dict,
                                 out_dir, min_len, max_len, q, args.assmb_type,
                                 args.priority, args.emboss_tool)))
                contigs = "{0}/{1}/r_Scaffold.fasta".format(
                    args.assmb_path, loc_dir)
                jobs.append(
                    pool.apply_async(
                        worker, (locus, contigs, ref_dict[locus], seq_dict,
                                 out_dir, min_len, max_len, q, args.assmb_type,
                                 args.priority, args.emboss_tool)))

    # Get all the returns from the apply_async function.
    for job in jobs:
        job.get()

    q.put('stop')  # should be no more messages
    pool.close()  # Tell the pool it's done getting new jobs
    pool.join()  # Make sure these new jobs are all finished
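
The listener handed to pool.apply_async above follows a common multiprocessing pattern: a single process drains the shared queue and does all the file writing, so the workers never write to the same file concurrently. A minimal sketch, assuming the workers put preformatted lines on the queue and main() sends the 'stop' sentinel once every job has returned (the summary file name here is hypothetical):

def listener(q, align_path):
    # Single writer process: drain the queue until the 'stop' sentinel arrives.
    # The output file name is a placeholder, not the pipeline's actual file.
    with open("{0}/alignment_summary.tsv".format(align_path), 'w') as out:
        while True:
            msg = q.get()
            if msg == 'stop':
                break
            out.write(msg)
            out.flush()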
Code Example #6
import argparse
import errno
import os

# The make_directory helper is defined elsewhere in the pipeline (not shown in
# this excerpt).


def main():

    parser = argparse.ArgumentParser(
        description='Script to set up for SPAdes alignment on a grid.')
    parser.add_argument(
        '--ref_map',
        '-rm',
        type=str,
        required=True,
        help='Path to *_ref_map.tsv output from analyze_bam.py.')
    parser.add_argument(
        '--reads_dir',
        '-rd',
        type=str,
        required=True,
        help=
        'Path to the output directory where the FASTQs went; same as what was used for fastq_reads_to_fastq_alleles.py.'
    )
    parser.add_argument(
        '--assmb_path',
        '-ap',
        type=str,
        required=True,
        help=
        'Path to the directory in which to initialize directories for all the assembly output.'
    )
    parser.add_argument(
        '--outfile',
        '-o',
        type=str,
        required=True,
        help='Path to the output map (maps the ref to the SGE ID).')
    args = parser.parse_args()

    ref_map = {}  # dict to hold the ref and its arbitrary ID starting at 1
    id = 1

    # First, rename the directories
    with open(args.ref_map, 'r') as infile:
        for line in infile:

            line = line.rstrip()
            ref = line.split('\t')  # really just want the first column which is the ref ID

            # Now rename the original directory so that it can be iterated over in a grid
            # job.
            old_dir = "{0}/{1}".format(args.reads_dir, ref[0])
            new_dir = "{0}/{1}".format(args.reads_dir, id)

            try:
                os.rename(old_dir, new_dir)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise
            else:
                ref_map[ref[0]] = id

                # Make a new output directory for all the SPAdes assembly files
                spades_out_dir = "{0}/{1}".format(args.assmb_path, id)
                make_directory(spades_out_dir)

                id += 1  # if no exception, it was renamed and need a new ID

    # Now generate a map to know which directories correlate to what IDs
    with open(args.outfile, 'w') as outfile:
        for k, v in ref_map.items():
            outfile.write("{0}\t{1}\n".format(k, v))
Code Example #7
import argparse
from collections import defaultdict

# Project helpers such as make_directory, verify_alignment, and filter_fastq
# are defined elsewhere in the pipeline (not shown in this excerpt).


def main():

    parser = argparse.ArgumentParser(
        description=
        'Script to generate stats given output from analyze_bam.py and filter a set of paired-end FASTQ reads.'
    )
    parser.add_argument(
        '--ab_read_map',
        '-ab',
        type=str,
        required=True,
        help='Path to *_read_map.tsv output from analyze_bam.py.')
    parser.add_argument('--fastq1',
                        '-1',
                        type=str,
                        required=True,
                        help='Path to the first paired fastq.gz file.')
    parser.add_argument('--fastq2',
                        '-2',
                        type=str,
                        required=True,
                        help='Path to the second paired fastq.gz file.')
    parser.add_argument(
        '--filter',
        '-f',
        type=str,
        required=True,
        help=
        'Either "yes" or "no" for removing discrepancies + multi-locus mapping reads.'
    )
    parser.add_argument(
        '--paired_suffixes',
        '-ps',
        type=str,
        required=True,
        help=
        'Either "yes" or "no" for whether the reads are mapped to one another with suffixes like .1 and .2 and one wants to assess for concordancy. This is dependent on the aligner. Check the *read_map.tsv file and see if the first elements are by read pair (so no suffix) or individual read (each read has a suffix) and answer accordingly.'
    )
    parser.add_argument(
        '--reads_dir',
        '-rd',
        type=str,
        required=True,
        help='Path to the output directory where the FASTQs should go.')
    args = parser.parse_args()

    filter = args.filter
    output = args.reads_dir

    counts = {
        'single_map': 0,
        'multi_map': 0,
        'discrepancy': 0
    }  # count these stats as they are processed.

    # Establish three dicts:
    # first two dicts consist of one for each mate
    # third dict is the IDs that need to be mapped (checking based on if the user wants to filter)
    r1, r2, ids_to_keep = (defaultdict(list) for j in range(3))  # each maps an ID to a list of loci

    unique_refs = set()  # make directories now for where all the reads will go

    if args.paired_suffixes == 'yes':
        # This first iteration only cares about grabbing all mates and their reference alignment info
        with open(args.ab_read_map, 'r') as reads:
            for line in reads:

                line = line.rstrip()
                ele = line.split('\t')

                if ele[0][-1] == "1":  # read mate 1
                    for j in range(1, len(ele)):
                        alignment = ele[j].split('|')  # split the alignment data
                        ref = alignment[2].split('.')  # split the reference name
                        ref_loc = ref[1]  # grab just the base reference locus
                        # don't double up on references (possible if mapping to same locus from different samples)
                        if ref_loc not in r1[ele[0]]:
                            r1[ele[0]].append(ref_loc)
                else:  # read mate 2
                    for j in range(1, len(ele)):
                        alignment = ele[j].split('|')
                        ref = alignment[2].split('.')
                        ref_loc = ref[1]
                        if ref_loc not in r2[ele[0]]:
                            r2[ele[0]].append(ref_loc)

        shared_id = ""  # id in the format of ABC.123 for pairs ABC.123.1 + ABC.123.2
        checked_ids, ref_dirs = (set() for j in range(2))  # set to speed up processing of R2 if already covered by R1

        # Now, iterate over each dict of mates and filter if required
        for read in r1:  # mate 1
            shared_id = read[:-2]
            mate_id = shared_id + ".2"

            # Generate stats regardless of filtering or not, can help the user decide if they should
            count_val = verify_alignment(r1[read], r2[mate_id])
            counts[count_val] += 1

            if filter == "yes" and count_val == "single_map":  # need to isolate reads that only map once

                # If a single map value, know that both reads share the same locus
                if not r1[read]:  # if R1 didn't map, means R2 did
                    ids_to_keep[shared_id].append(r2[mate_id][0])
                elif not r2[mate_id]:  # same as above, if R2 didn't map, means R1 did
                    ids_to_keep[shared_id].append(r1[read][0])
                else:  # else, they both mapped to the same locus and can use either value
                    ids_to_keep[shared_id].append(r1[read][0])

                unique_refs.add(ids_to_keep[shared_id][0])  # add the single locus

            else:  # no filter needed, add all distinct loci found per read

                for ref in r1[read]:
                    ids_to_keep[shared_id].append(ref)
                    unique_refs.add(ref)  # add all loci

                for ref in r2[mate_id]:
                    if ref not in ids_to_keep[shared_id]:  # make sure not to double up on loci across mates
                        ids_to_keep[shared_id].append(ref)
                        unique_refs.add(ref)  # add all loci

            checked_ids.add(shared_id)  # identify these as looked at before going into r2 dict

        for read in r2:  # mate 2, only check if the mate wasn't caught by the r1 dict
            shared_id = read[:-2]

            if shared_id not in checked_ids:  # if not checked using mate 1, verify now.

                mate_id = shared_id + ".1"
                count_val = verify_alignment(r1[mate_id], r2[read])
                counts[count_val] += 1

                # If we are here, the read was not found in R1. Thus, get loci strictly from R2.
                if filter == "yes" and count_val == "single_map":
                    ids_to_keep[shared_id].append(r2[read][0])
                    unique_refs.add(ids_to_keep[shared_id][0])  # add the single locus

                else:  # Again, was not found in R1 so we know all loci are from R2.
                    for ref in r2[read]:
                        ids_to_keep[shared_id].append(ref)
                        unique_refs.add(ref)  # add all loci

        # At this point, ids_to_keep now has a dictionary mapping all read IDs to loci that
        # they aligned to. This is all that's needed to build a set of directories that house
        # reads just mapping to those loci for use in assembly.

        r1, r2, checked_ids = (None for j in range(3))  # done with these, free up some memory

        # give the user some idea of how much they are potentially filtering out
        out_stats = output + ".stats"
        with open(out_stats, 'w') as stats_file:
            for k, v in counts.items():
                stats_file.write("{0} read-pairs have a {1}.\n".format(v, k))

    elif args.paired_suffixes == 'no':
        with open(args.ab_read_map, 'r') as reads:
            for line in reads:

                line = line.rstrip()
                ele = line.split('\t')

                for j in range(1, len(ele)):
                    alignment = ele[j].split('|')  # split the alignment data
                    ref = alignment[2].split('.')  # split the reference name
                    ref_loc = ref[1]  # grab just the base reference locus
                    # don't double up on references (possible if mapping to same locus from different samples)
                    # just one dict here since gsnap doesn't capture read suffix
                    if ref_loc not in ids_to_keep[ele[0]]:
                        ids_to_keep[ele[0]].append(ref_loc)
                        unique_refs.add(ref_loc)

    # Write out all the directories
    for ref in unique_refs:
        dir = "{0}/{1}".format(output, ref)
        make_directory(dir)

    # Regardless of filtering based on alignment single/multiple/discrepancies or not, still
    # need to filter all the FASTQ reads to just those that aligned to a gene region.
    filter_fastq(ids_to_keep, args.fastq1, args.fastq2, output)
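
verify_alignment drives the single_map/multi_map/discrepancy counts above, but its implementation is not shown. A minimal sketch of one plausible classification, inferred from how the counts and the filtering branch are used (not taken from the pipeline itself):

def verify_alignment(r1_loci, r2_loci):
    # Classify a read pair by the loci its two mates aligned to (inferred behavior).
    loci = set(r1_loci) | set(r2_loci)
    if len(loci) == 1:
        return 'single_map'  # both mates agree on one locus, or only one mate mapped
    elif not set(r1_loci) & set(r2_loci):
        return 'discrepancy'  # the mates mapped to entirely different loci
    else:
        return 'multi_map'  # multiple loci involved across the pair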
Code Example #8
import argparse
import urllib.request

# The make_directory helper is defined elsewhere in the pipeline (not shown in
# this excerpt).


def main():

    parser = argparse.ArgumentParser(
        description=
        'Script to establish the necessary directory structures for the pipeline\'s output.'
    )
    parser.add_argument('--workspace_location',
                        '-wl',
                        type=str,
                        default='.',
                        help='Path to build directories at.')
    args = parser.parse_args()

    # directories for the grid to output logs and errors
    make_directory("{0}/grid_out".format(args.workspace_location))
    make_directory("{0}/grid_err".format(args.workspace_location))

    make_directory("{0}/gsnap_idx".format(
        args.workspace_location))  # stores GSNAP index
    make_directory("{0}/smalt_idx".format(
        args.workspace_location))  # stores SMALT index

    # Aligner 1
    make_directory("{0}/first_reads".format(
        args.workspace_location))  # individual read sets per locus
    make_directory("{0}/first_spades_assemblies".format(
        args.workspace_location))  # assembly method 1 results
    make_directory("{0}/first_hga_assemblies".format(
        args.workspace_location))  # assembly method 2 results
    make_directory("{0}/first_alignments".format(
        args.workspace_location))  # alignment results

    # Aligner 2
    make_directory("{0}/second_reads".format(args.workspace_location))
    make_directory("{0}/second_spades_assemblies".format(
        args.workspace_location))
    make_directory("{0}/second_hga_assemblies".format(args.workspace_location))
    make_directory("{0}/second_alignments".format(args.workspace_location))

    # Pull HGA and SB from their own repos; note these are modified from their original implementations for this pipeline.
    hga_url = 'https://raw.githubusercontent.com/jmatsumura/Hierarchical-Genome-Assembly-HGA/master/HGA.py'
    with urllib.request.urlopen(hga_url) as response, open(
            "{0}/HGA.py".format(args.workspace_location), 'wb') as out_file:
        data = response.read()  # a `bytes` object
        out_file.write(data)

    sb_url = 'https://raw.githubusercontent.com/jmatsumura/Scaffold_builder/master/scaffold_builder.py'
    with urllib.request.urlopen(sb_url) as response, open(
            "{0}/scaffold_builder.py".format(args.workspace_location),
            'wb') as out_file:
        data = response.read()  # a `bytes` object
        out_file.write(data)

    # "touch" these files
    open("{0}/first_ids_v_cov.tsv".format(args.workspace_location),
         'w').close()
    open("{0}/second_ids_v_cov.tsv".format(args.workspace_location),
         'w').close()