Ejemplo n.º 1
0
def extract_sequences(file,genes,buffer,prefix):
    """Write a buffered and an unbuffered FASTA record for every gene.

    Args:
        file: FASTA file of contigs; it is parsed completely into memory.
        genes: Iterable of '|'-delimited strings. Fields used, by index:
            0 contig ID, 1 start, 2 stop, 3 strand, 4 gene ID.
        buffer: Number of extra bases added on each side of the gene for
            the buffered record.
        prefix: Output prefix; records are handed to ``write_fasta`` for
            ``<prefix>_buffered.fsa`` and ``<prefix>_unbuffered.fsa``.
    """
    # The whole FASTA is held in memory, keyed by contig ID.
    contigs = SeqIO.to_dict(SeqIO.parse(file, "fasta"))

    buffered_out = "{0}_buffered.fsa".format(prefix)
    unbuffered_out = "{0}_unbuffered.fsa".format(prefix)

    for gene in genes:
        fields = gene.split('|')
        contig_id = fields[0]
        gene_id = fields[4]
        on_reverse_strand = fields[3] == "-"

        # Start is shifted down by one to become a 0-based slice start.
        gene_start = int(fields[1]) - 1
        gene_stop = int(fields[2])

        contig_seq = contigs[contig_id].seq

        # Widen by the buffer, then clamp the window to the contig bounds.
        window_start = max(gene_start - buffer, 0)
        window_stop = min(gene_stop + buffer, len(contig_seq))

        # The two records are identical whenever buffer == 0.
        buffered = str(contig_seq[window_start:window_stop].upper())
        unbuffered = str(contig_seq[gene_start:gene_stop].upper())

        if on_reverse_strand:
            buffered = rev_comp(buffered)
            unbuffered = rev_comp(unbuffered)

        write_fasta(buffered_out, gene_id, buffered)
        write_fasta(unbuffered_out, gene_id, unbuffered)
def main():
    """Run TASR on every target sequence using a pool of worker processes.

    Command-line arguments:
        -t  Location of the TASR executable.
        -d  Number of threads requested.
        -r  File listing, one per line, the read files used for assembly.
        -s  FASTA file of the target sequences.
        -o  Directory under which one sub-directory per sequence is made.

    For each sequence ID a directory ``<o>/<id>`` is created holding
    ``seq.fasta`` (that single sequence) and ``reads.txt`` (a copy of the
    ``-r`` file); ``run_tasr`` is then scheduled on those two files.
    """
    parser = argparse.ArgumentParser(
        description=
        'Script to run TASR in a multithreaded fashion on individual sequences.'
    )
    parser.add_argument('-t',
                        type=str,
                        required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument('-d',
                        type=int,
                        required=True,
                        help='Number of threads to use.')
    parser.add_argument(
        '-r',
        type=str,
        required=True,
        help=
        'Path to a file with paths to reads to be used in assembly on each line.'
    )
    parser.add_argument(
        '-s',
        type=str,
        required=True,
        help=
        'Path to a file with the sequences which should be the targets for assembly.'
    )
    parser.add_argument('-o',
                        type=str,
                        required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    # One of the requested threads is held back from the pool (presumably
    # for this parent process). mp.Pool rejects a size below one, so clamp
    # to keep "-d 1" working instead of raising ValueError.
    pool = mp.Pool(max(1, args.d - 1))
    jobs = []

    try:
        for seq_id in seqs:

            cur_dir = "{}/{}".format(args.o, seq_id)
            cur_fsa = "{}/seq.fasta".format(cur_dir)
            cur_rds = "{}/reads.txt".format(cur_dir)

            make_directory(cur_dir)
            write_fasta(cur_fsa, seq_id, str(seqs[seq_id].seq))
            copyfile(args.r, cur_rds)

            jobs.append(
                pool.apply_async(run_tasr, (args.t, cur_fsa, cur_rds)))

        # get() re-raises any exception that occurred inside a worker.
        for job in jobs:
            job.get()
    finally:
        # Always release the workers, even when setup or a job failed.
        pool.close()  # no further jobs will be submitted
        pool.join()  # wait for the workers to exit
Ejemplo n.º 3
0
def main():
    """Copy a FASTA file while dropping entries from the excluded strains.

    Command-line arguments:
        -remove        One strain prefix, or several separated by commas.
        -original_fsa  FASTA file to read.
        -new_fsa       Path handed to ``write_fasta`` for every kept entry.

    An entry is dropped when the header text between '>' and the first '.'
    equals one of the ``-remove`` prefixes.

    Raises:
        ValueError: If sequence data appears before the first header, or a
            header yields no usable ID.
    """
    parser = argparse.ArgumentParser(
        description=
        'Script to assess the results of the base Targeted Assembly pipeline.')
    parser.add_argument(
        '-remove',
        type=str,
        required=True,
        help=
        'Prefix of a strain (3D7) to remove or strains, must be CSV: (3D7,7G8).'
    )
    parser.add_argument(
        '-original_fsa',
        type=str,
        required=True,
        help=
        'Path to where the initial FASTA file generated from the pipeline is.')
    parser.add_argument(
        '-new_fsa',
        type=str,
        required=True,
        help='Path to where the output for this filtered FASTA file should go.'
    )
    args = parser.parse_args()

    # split() yields a one-element list when there is no comma, so a single
    # prefix and a CSV list are handled the same way.
    remove_us = set(args.remove.split(','))

    # Raw string: "\." is an invalid escape in an ordinary string literal.
    contig_id_pattern = re.compile(r">([a-zA-Z0-9_\.]+)")

    nonrelevant_alleles = set()  # IDs that must not be written out
    contigs = {}  # ID -> full sequence, in file order
    current_id = None

    with open(args.original_fsa, 'r') as fasta_in:
        for line in fasta_in:

            line = line.rstrip()
            if not line:
                continue  # blank lines carry no sequence data

            if line.startswith('>'):
                match = contig_id_pattern.search(line)
                if match is None:
                    raise ValueError(
                        "Cannot parse an ID from FASTA header: {0!r}".format(
                            line))

                current_id = match.group(1)
                contigs[current_id] = ""

                prefix = line[1:].split('.')[0]
                if prefix in remove_us:
                    nonrelevant_alleles.add(current_id)

            elif current_id is None:
                raise ValueError(
                    "Sequence data found before the first FASTA header in "
                    "{0}".format(args.original_fsa))

            else:
                contigs[current_id] += line  # accumulate the bases

    for allele, bases in contigs.items():
        if allele not in nonrelevant_alleles:
            write_fasta(args.new_fsa, allele, bases)
Ejemplo n.º 4
0
def main():
    """Write a FASTA file in which every distinct sequence appears once.

    Command-line arguments:
        -input      FASTA file to read.
        -output     Path handed to ``write_fasta`` for each kept record.
        -conflicts  Text file receiving the ID of every dropped record.

    The first record seen with a given sequence is kept. A later record
    with the same sequence is dropped and counted as a duplicate when its
    locus matches the kept record's locus, otherwise as a conflict.
    """
    parser = argparse.ArgumentParser(
        description='Script to refine a FASTA file to not have duplicate sequences.'
    )
    parser.add_argument(
        '-input', type=str, required=True, help='Path to input FASTA.')
    parser.add_argument(
        '-output', type=str, required=True, help='Path to output FASTA.')
    parser.add_argument(
        '-conflicts',
        type=str,
        required=True,
        help='Output of out where conflicts were found.')
    args = parser.parse_args()

    locus_by_digest = {}  # MD5 of a sequence -> locus of its first record
    dropped_ids = []
    duplicates = 0
    conflicts = 0

    # A single pass suffices: whichever record comes first in the input is
    # the one that is kept.
    for record in SeqIO.parse(args.input, "fasta"):
        record_id = record.id
        locus = re.search(r'\.([A-Za-z0-9_]+)\.?\d?', record_id).group(1)
        bases = str(record.seq)
        digest = hashlib.md5(bases.encode('utf-8')).hexdigest()

        if digest not in locus_by_digest:
            # First time this sequence is seen: remember it and keep it.
            locus_by_digest[digest] = locus
            write_fasta(args.output, record_id, bases)
            continue

        if locus_by_digest[digest] == locus:
            duplicates += 1  # same sequence, same locus
        else:
            conflicts += 1  # same sequence, different locus
        dropped_ids.append(record_id)

    print("Number of duplicates removed: {0}".format(duplicates))
    print("Number of conflicts sequences removed: {0}".format(conflicts))

    with open(args.conflicts, 'w') as outfile:
        outfile.writelines(
            "{0}\n".format(dropped) for dropped in dropped_ids)
Ejemplo n.º 5
0
def main():
    """Run EMBOSS needle on matching sequences from two FASTA files.

    Command-line arguments:
        -f  Two FASTA paths separated by a comma.
        -n  Path to the needle executable.
        -o  Directory under which one sub-directory per shared key is made.

    ``build_sequence_map`` turns each file's IDs into a key -> IDs mapping
    (grouping rule defined elsewhere). For every key present in both maps,
    each pairing of first-file and second-file IDs is aligned; the result
    goes to ``<o>/<key>/<id1>_WITH_<id2>.align.txt``.
    """
    parser = argparse.ArgumentParser(
        description='Script to perform needle alignment on like sequences across FASTA files.'
    )
    parser.add_argument(
        '-f', type=str, required=True,
        help='Two paths to FASTA files split by a comma.')
    parser.add_argument(
        '-n', type=str, required=True,
        help='Path to install directory of EMBOSS needle executable (e.g. /path/to/packages/emboss/bin/needle).')
    parser.add_argument(
        '-o', type=str, required=True,
        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    fasta_paths = args.f.split(',')
    first_seqs = SeqIO.to_dict(SeqIO.parse(fasta_paths[0], "fasta"))
    second_seqs = SeqIO.to_dict(SeqIO.parse(fasta_paths[1], "fasta"))

    first_map = build_sequence_map(first_seqs.keys())
    second_map = build_sequence_map(second_seqs.keys())

    for key in first_map:
        if key not in second_map:
            continue

        key_dir = "{}/{}".format(args.o, key)
        make_directory(key_dir)

        # One of the two ID lists is expected to hold a single entry.
        for first_id in first_map[key]:
            for second_id in second_map[key]:

                first_rec = first_seqs[first_id]
                second_rec = second_seqs[second_id]

                # '|' is swapped for '.' in the second record's ID before
                # it is used in file names.
                second_rec.id = second_rec.id.replace('|', '.')

                inputs = (
                    ("{}/{}.fsa".format(key_dir, first_rec.id), first_rec),
                    ("{}/{}.fsa".format(key_dir, second_rec.id), second_rec),
                )

                for fsa_path, rec in inputs:
                    if not os.path.isfile(fsa_path):
                        write_fasta(fsa_path, rec.id, str(rec.seq))

                run_needle(
                    args.n,
                    inputs[0][0],
                    inputs[1][0],
                    "{}/{}_WITH_{}.align.txt".format(
                        key_dir, first_rec.id, second_rec.id),
                )

                # The per-pair inputs are disposable; the sequences still
                # live in the original FASTA files.
                for fsa_path, _ in inputs:
                    os.remove(fsa_path)
Ejemplo n.º 6
0
def main():
    """Run TASR on each target sequence, one after another.

    Command-line arguments:
        -t  Location of the TASR executable.
        -r  File listing, one per line, the read files used for assembly.
        -s  FASTA file of the target sequences.
        -o  Directory under which one sub-directory per sequence is made.

    For each sequence ID a directory ``<o>/<id>`` is created holding
    ``seq.fasta`` (that single sequence) and ``reads.txt`` (a copy of the
    ``-r`` file). TASR is then invoked on those two files with
    ``-w 1 -u 1 -c 1``; its exit status is not checked.
    """
    parser = argparse.ArgumentParser(
        description=
        'Script to run TASR in an iterative fashion on individual sequences.')
    parser.add_argument('-t',
                        type=str,
                        required=True,
                        help='Location of the TASR exe (~/tasr_v1.6.2/TASR).')
    parser.add_argument(
        '-r',
        type=str,
        required=True,
        help=
        'Path to a file with paths to reads to be used in assembly on each line.'
    )
    parser.add_argument(
        '-s',
        type=str,
        required=True,
        help=
        'Path to a file with the sequences which should be the targets for assembly.'
    )
    parser.add_argument('-o',
                        type=str,
                        required=True,
                        help='Location to generate output directories.')
    args = parser.parse_args()

    make_directory(args.o)

    seqs = SeqIO.to_dict(SeqIO.parse(args.s, "fasta"))

    # The -t value keeps its whitespace tokenisation so that an invocation
    # such as "<interpreter> <script>" continues to work.
    tasr_cmd = args.t.split()

    for seq_id in seqs:

        cur_dir = "{}/{}".format(args.o, seq_id)
        cur_fsa = "{}/seq.fasta".format(cur_dir)
        cur_rds = "{}/reads.txt".format(cur_dir)

        make_directory(cur_dir)
        write_fasta(cur_fsa, seq_id, str(seqs[seq_id].seq))
        copyfile(args.r, cur_rds)

        # Each generated path is passed as its own argv element, so an
        # output directory containing whitespace is not split apart.
        subprocess.call(tasr_cmd + [
            "-s", cur_fsa,
            "-f", cur_rds,
            "-w", "1",
            "-u", "1",
            "-c", "1",
        ])
Ejemplo n.º 7
0
def align(out, allele, contig, aseq, bseq, f_or_r, assmb_type, emboss_tool):
    """Align two sequences with an EMBOSS tool and produce a trimmed result.

    Args:
        out: Directory in which all alignment and FASTA files are written.
        allele: Allele name, used in the output file names.
        contig: Contig name, used in the output file names.
        aseq: First input passed to ``call_emboss`` (the reference, per the
            original author's notes).
        bseq: Second input passed to ``call_emboss`` (the assembled
            sequence, per the original author's notes).
        f_or_r: Label placed in the trimmed file names and FASTA headers.
        assmb_type: Unused; kept so existing callers keep working.
        emboss_tool: Path/name of the EMBOSS program. If it contains
            'needle' the trimmed sequences are re-aligned; otherwise if it
            contains 'water' the initial alignment is renamed to be the
            refined one.

    Returns:
        Whatever ``trim_extensions`` returns for the two aligned sequences
        (indexed here with 'a' and 'b'), or None when the initial alignment
        holds fewer than two sequences.
    """
    initial_align = "{0}/{1}.WITH.{2}.align.txt".format(out, allele, contig)

    call_emboss(emboss_tool, aseq, bseq, initial_align)

    records = list(AlignIO.read(initial_align, "emboss"))
    if len(records) < 2:
        return None

    # Only the first two aligned sequences are used.
    a = records[0].seq
    b = records[1].seq

    stem = "{0}/{1}.WITH.{2}.{3}".format(out, allele, contig, f_or_r)
    refined_align = stem + ".trimmed_align.txt"

    # Both branches below write these files, so the names are built up
    # front; defining them only in the needle branch left the water branch
    # failing with an unbound local.
    a_fsa = stem + ".a.fsa"
    b_fsa = stem + ".b.fsa"

    # FASTA headers; the file name is what tells the two sequences apart.
    a_trim = "{0}.a.trimmed".format(f_or_r)
    b_trim = "{0}.b.trimmed".format(f_or_r)

    # Trim the extended blank sequence on the outside of the alignment.
    seqs = trim_extensions(a, b)

    if 'needle' in emboss_tool:
        write_fasta(a_fsa, a_trim, seqs['a'])
        write_fasta(b_fsa, b_trim, seqs['b'])

        call_emboss(emboss_tool, a_fsa, b_fsa, refined_align)

        # The trimmed alignment supersedes the initial one; it can be
        # regenerated from the untrimmed inputs if ever needed.
        os.remove(initial_align)

    elif 'water' in emboss_tool:
        # Convert to str before stripping gaps so this does not depend on
        # the Seq class offering a replace() method.
        write_fasta(a_fsa, a_trim, str(a).replace('-', ''))
        write_fasta(b_fsa, b_trim, str(b).replace('-', ''))

        os.rename(initial_align, refined_align)

    return seqs
Ejemplo n.º 8
0
def extract_sequences(file, assembled, aligned, outfile):
    """Write out the FASTA entries whose locus did not assemble or align.

    Args:
        file: FASTA file to read; all entries are held in memory.
        assembled: Container of locus names that assembled.
        aligned: Container of locus names that aligned.
        outfile: Path handed to ``write_fasta`` for every re-added entry.

    The locus of an entry is the text between the first and second '.' of
    its header line. Entries whose locus is in ``aligned`` but not in
    ``assembled`` are written first, followed by entries whose locus is not
    in ``aligned``.

    Raises:
        ValueError: If sequence data appears before the first header, or a
            header yields no usable ID.
    """
    # Raw string: "\." is an invalid escape in an ordinary string literal.
    contig_id_pattern = re.compile(r">([a-zA-Z0-9_\.]+)")

    contigs = {}  # ID -> full sequence
    not_assembled = []  # IDs to re-add: aligned but never assembled
    not_aligned = []  # IDs to re-add: never aligned
    current_id = None  # header that the following bases belong to

    with open(file, 'r') as fasta:
        for line in fasta:

            line = line.rstrip()
            if not line:
                continue  # blank lines carry no sequence data

            if line.startswith('>'):
                match = contig_id_pattern.search(line)
                if match is None:
                    raise ValueError(
                        "Cannot parse an ID from FASTA header: {0!r}".format(
                            line))

                current_id = match.group(1)
                contigs[current_id] = ""

                locus = line.split('.')[1]
                # Anything that failed to align cannot have assembled, so
                # alignment is checked first.
                if locus not in aligned:
                    not_aligned.append(current_id)
                elif locus not in assembled:
                    not_assembled.append(current_id)

            elif current_id is None:
                raise ValueError(
                    "Sequence data found before the first FASTA header in "
                    "{0}".format(file))

            else:
                contigs[current_id] += line  # accumulate the bases

    for allele in not_assembled + not_aligned:
        write_fasta(outfile, allele, contigs[allele])