Example #1
0
 def set_read_fasta(self, read_fasta_file):
     """Load all reads from a FASTA file into ``self.reads``.

     Marks the reads as set (``self.reads_set = True``) and replaces
     ``self.reads`` with a dict mapping each entry's ``'name'`` to its
     ``'seq'`` converted to upper case.

     If a name occurs more than once a warning is written to stderr and
     the later entry overwrites the earlier one.

     Args:
         read_fasta_file: passed unchanged to
             ``SequenceBasics.GenericFastaFileReader``.
     """
     self.reads_set = True
     gfr = SequenceBasics.GenericFastaFileReader(read_fasta_file)
     self.reads = {}
     try:
         while True:
             e = gfr.read_entry()
             # The reader signals end of input with a falsy entry.
             if not e:
                 break
             name = e['name']
             if name in self.reads:
                 sys.stderr.write(
                     "Warning duplicate name in fasta file, could be big problems on sequence assignment.\n"
                 )
             self.reads[name] = e['seq'].upper()
     finally:
         # Release the reader even when reading an entry fails part-way.
         gfr.close()
Example #2
0
def main():
    """Convert a BWA BAM/SAM alignment into a sorted, stat-corrected PSL.

    Command-line driven (arguments are read from ``sys.argv``). The work is
    done by external programs chained together in three steps:

    1. ``samtools view`` (unless input is ``-``, i.e. SAM on stdin) is fed
       through ``sam_to_psl.py | sort | uniq | sort_psl.py |
       defragment_PSL_alignments.py | sort_psl.py`` producing ``1.psl`` in
       the working directory.
    2. The query FASTA/FASTQ is streamed as FASTA into ``sort_fasta.py``
       producing ``2.query.fa`` in the working directory.
    3. ``fix_psl_stats.py | sort_psl.py`` writes the final result to
       ``--output`` or stdout.

    The working directory is a freshly created ``weirathe.*`` directory
    under ``--tempdir`` (removed afterwards), or exactly
    ``--specific_tempdir`` (never removed).
    """
    # Imported locally so this function does not rely on extra module-level
    # imports being present.
    import tempfile
    try:
        from shlex import quote
    except ImportError:  # Python 2 keeps the same helper in ``pipes``
        from pipes import quote

    parser = argparse.ArgumentParser()
    parser.add_argument("bwa_bam",
                        help="BAMFILE or - for sam streamed to stdin")
    parser.add_argument('-o',
                        '--output',
                        help="OUTFILE or if not set use STDOUT")
    group1 = parser.add_mutually_exclusive_group(required=True)
    group1.add_argument("--query_fasta", help="FASTA for query sequences")
    group1.add_argument("--query_fastq", help="FASTQ for query sequences")
    parser.add_argument("--ref",
                        required=True,
                        help="FASTA for reference genome")
    parser.add_argument(
        "-S",
        "--size",
        help="linux sort option S, in kb if units not specified")
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument("--tempdir",
                        default='/tmp',
                        help="DIR to store a temp directory")
    group2.add_argument("--specific_tempdir",
                        help="Exact DIR to work in, is not deleted")
    parser.add_argument(
        "--get_all_alignments",
        action='store_true',
        help="Be exhaustive in retrieving secondary and chimeric alignments")
    args = parser.parse_args()

    # Resolve the working directory and store it in args.tempdir.
    # ``or '/'`` keeps a bare "/" from collapsing to an empty path.
    if args.specific_tempdir:
        args.tempdir = args.specific_tempdir.rstrip('/') or '/'
        if not os.path.exists(args.tempdir):
            os.makedirs(args.tempdir)
    else:
        base_dir = args.tempdir.rstrip('/') or '/'
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        # mkdtemp guarantees a unique, newly created directory, unlike a
        # name built from a random integer.
        args.tempdir = tempfile.mkdtemp(prefix='weirathe.', dir=base_dir)

    # Shell-safe fragments shared by the pipelines below.
    tdir = quote(args.tempdir)
    size_opt = (" -S " + quote(args.size)) if args.size else ""
    psl_file = os.path.join(args.tempdir, '1.psl')
    query_file = os.path.join(args.tempdir, '2.query.fa')

    of = sys.stdout
    try:
        # 1. Convert the sam to a psl.
        p1 = None
        inf = sys.stdin
        if args.bwa_bam != '-':
            # An argument list keeps file names containing spaces intact.
            p1 = subprocess.Popen(["samtools", "view", args.bwa_bam],
                                  stdout=subprocess.PIPE)
            inf = p1.stdout
        cmd = "sam_to_psl.py -"
        if args.get_all_alignments:
            cmd += " --get_all_alignments"

        # No matter what we don't need to see the exact same alignment
        # more than once.
        cmd += " | sort -T " + tdir + size_opt + " | uniq"

        # Sort the alignment based on query name.
        cmd += " | sort_psl.py - --tempdir " + tdir + size_opt

        # Put the fragmented alignments together.
        cmd += (" | defragment_PSL_alignments.py - | sort_psl.py - --tempdir "
                + tdir + " -o " + quote(psl_file) + size_opt)
        p2 = subprocess.Popen(cmd, shell=True, stdin=subprocess.PIPE)
        for line in inf:
            p2.stdin.write(line)
        # communicate() closes the pipeline's stdin and waits for it.
        p2.communicate()
        if p1 is not None:
            p1.communicate()

        # 2. Get a sorted fasta file.
        sort_cmd = ["sort_fasta.py", "-", "--tempdir", args.tempdir,
                    "-o", query_file]
        if args.size:
            sort_cmd += ["-S", args.size]
        if args.query_fasta:
            gfr = SequenceBasics.GenericFastaFileReader(args.query_fasta)
        else:
            gfr = SequenceBasics.GenericFastqFileReader(args.query_fastq)
        try:
            p3 = subprocess.Popen(sort_cmd, stdin=subprocess.PIPE)
            while True:
                e = gfr.read_entry()
                if not e:
                    break
                # NOTE(review): a str is written to the pipe here, as in the
                # original; under Python 3 a default Popen pipe is binary --
                # confirm the target interpreter.
                p3.stdin.write('>' + e['name'] + "\n" + e['seq'] + "\n")
            p3.communicate()
        finally:
            gfr.close()

        # 3. Fix the psl and output the results.
        if args.output:
            of = open(args.output, 'w')
        cmd = ("fix_psl_stats.py " + quote(psl_file) + ' ' + quote(args.ref)
               + ' ' + quote(query_file)
               + " | sort_psl.py - --tempdir " + tdir + size_opt)
        p4 = subprocess.Popen(cmd, shell=True, stdout=of)
        p4.communicate()
    finally:
        # Close the output file if we opened one.
        if of is not sys.stdout:
            of.close()
        # Clean up the temporary directory unless the user chose a specific
        # one; done in ``finally`` so a failed step leaves nothing behind.
        if not args.specific_tempdir:
            rmtree(args.tempdir)