Example #1
0
    def __generate_test_data__(self, bam1, sam2, contig_positions, min_mapq):
        """Extract up to ~10k read pairs into two small test FASTQ files.

        Scans *bam1* line by line (SAM text), looks up each read's mate in an
        on-disk FileIndex of *sam2* (built on demand), and writes both reads of
        every located pair to test_fq1.10k.fq / test_fq2.10k.fq.

        bam1             -- query SAM path, read as tab-delimited text
        sam2             -- mate SAM path; indexed by qname via FileIndex
        contig_positions -- text file with one integer start position per line
        min_mapq         -- minimum mapping quality recorded in run metadata

        NOTE(review): run_results, contig_2_contig_dict, contig_starts and
        q_mapq are built but never used in the visible body — the original
        appears truncated; confirm against the full source.
        """
        run_results = {
            "total_queries": 0,
            "passed_filter": 0,
            "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
            "min_mapq": min_mapq,
            "query_sam": bam1,          # was `sam1` (undefined name) in the original
            "searched_sam": sam2,
            "contig_positions": contig_positions,
            "pickle": None,
            }

        contig_2_contig_dict = defaultdict(list)

        # Create the SAM index if it doesn't already exist.
        if not os.path.exists(sam2 + FileIndex.ext):
            FileIndex.create(sam2, lambda fh: SamLine(fh).qname, allow_multiple=True)

        fi = FileIndex(sam2, SamLine, allow_multiple=True)
        ma = MergeAssemblies()

        contig_starts = [int(l.strip()) for l in open(contig_positions, 'rU')]

        fq1_out = open("test_fq1.10k.fq", 'w')
        fq2_out = open("test_fq2.10k.fq", 'w')

        try:
            with open(bam1, 'rU') as qs:        # was `open(sam1, ...)`: NameError
                for count, q in enumerate(qs):

                    if q.startswith("@"):              # skip header lines
                        continue

                    # Split line and identify query position.
                    q = q.strip().split("\t")
                    query_pos = int(q[3])
                    query_pos = ma.round_bp_pos(query_pos)     # this could be more sophisticated.

                    q_id = q[0][:-1] + "2"             # mate id (R2 read names end with "2")
                    q_mapq = int(q[4])                 # and, mapping quality.

                    if count > 10000:
                        break

                    # Search for the mate and emit both reads as FASTQ records.
                    for s in fi[q_id]:
                        fq1_out.write("@{}\n{}\n+\n{}\n".format(q[0], q[9], q[10]))
                        fq2_out.write("@{}\n{}\n+\n{}\n".format(s.qname, s.seq, s.qual))
        finally:
            # The original leaked both output handles.
            fq1_out.close()
            fq2_out.close()
Example #2
0
def unique_b(sam_a, sam_b):

    aidx = FileIndex(sam_a, fast_sam, fast_sam)
    out_b = open(sam_b.replace(".sam", ".better.sam"), "w")
    print "writing to %s" % out_b.name
    unique = 0
    for line in open(sam_b):
        b_id = line.split("\t")[0]
        if not b_id in aidx:
            unique += 1
            print >> out_b, line,
    print "unique to %s: %i" % (sam_b, unique)
Example #3
0
import sys  # was missing: `sys.path` is used below (NameError in the original)
import os.path as op

# Make the parent directory importable so the local ``fileindex`` module is found.
sys.path.insert(0, op.join(op.dirname(__file__), ".."))
from fileindex import FileIndex


class FastQEntry(object):
    """One four-line FASTQ record consumed sequentially from a file handle.

    Fields are the raw lines (header, sequence, separator, quality) with
    trailing CR/LF stripped.  The `fpos` slot is reserved but not set here.
    """

    __slots__ = ("name", "seq", "l3", "qual", "fpos")

    def __init__(self, fh):
        # Consume exactly four lines, in FASTQ order.
        for field in ("name", "seq", "l3", "qual"):
            setattr(self, field, fh.readline().rstrip("\r\n"))


if __name__ == "__main__":
    # Demo: index a FASTQ file by read name, then stream sequences back out.
    f = "/usr/local/src/bowtie/bowtie-0.12.1/work/reads/s_1_sequence.txt"
    N = 100

    # Index creation is unconditional: the existence guard below was
    # commented out, so the index is rebuilt on every run.
    # if not op.exists(f + FileIndex.ext):
    FileIndex.create(f, lambda fh: FastQEntry(fh).name)

    fi = FileIndex(f, FastQEntry)
    print "getting %i keys..." % N

    # NOTE(review): the break fires after index N has printed, so this emits
    # N+1 sequences, not N — confirm whether that is intended.
    for i, k in enumerate(fi.db.iterkeys(str)):
        print fi[k].seq
        if i == N:
            break
Example #4
0
    def associate_contigs(self, bam1, sam2, min_mapq, run_ID, R1_starts=None):
        """Associate R1 contig start positions with R2 contig hit positions.

        For each start position listed in *R1_starts*, fetch the R1 reads
        aligned within +/-5 bp of it, look up each read's mate in an indexed
        *sam2*, and record the (rounded) mate hit position whenever both ends
        exceed *min_mapq*.  The association map is pickled and a logfile is
        written via self.__associate_contigs_log__.

        bam1      -- coordinate-sorted, indexed BAM of R1 alignments
        sam2      -- SAM of R2 alignments; a FileIndex is built on demand
        min_mapq  -- strict lower bound on mapping quality for BOTH ends
        run_ID    -- reference-name prefix ("<run_ID>.R1") and output prefix
        R1_starts -- optional path to the start-position list; defaults to
                     "<dir(bam1)>/<run_ID>.R1.contig_start_pos.txt"

        Returns 1 on completion (legacy convention).
        """
        run_results = {
            "total_queries": 0,
            "passed_filter": 0,
            "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),
            "min_mapq": min_mapq,
            "query_sam": bam1,
            "searched_sam": sam2,
            "pickle": None,
            "failed_filter": 0,
            }

        contig_2_contig_dict = defaultdict(list)

        # Create the SAM index if it doesn't already exist.
        if not os.path.exists(sam2 + FileIndex.ext):
            FileIndex.create(sam2, lambda fh: SamLine(fh).qname, allow_multiple=True)

        fi = FileIndex(sam2, SamLine, allow_multiple=True)
        ma = MergeAssemblies()

        R1_sam = pysam.Samfile(bam1, 'rb')
        path = os.path.split(bam1)[0]

        if R1_starts is None:                      # was `== None`
            R1_starts = os.path.join(path, "{}.R1.contig_start_pos.txt".format(run_ID))

        low_depth_path = os.path.join(path, "{}.R1.contig_start_pos.no_pass.txt".format(run_ID))
        low_depth_R1_starts = open(low_depth_path, 'w')

        all_R1s = set()
        count = -1      # stays -1 if the starts file is empty (was unbound below)

        # Iterate over start positions.
        with open(R1_starts, 'rU') as starts_fh:
            for count, start in enumerate(starts_fh):
                query_pos = int(start.strip())
                all_R1s.add(query_pos)

                if query_pos == 0:                 # was `is 0` (identity test on an int)
                    continue

                # e.g. samtools view alignments/<run>.sorted.bam <run_ID>.R1:590-600
                reads = list(R1_sam.fetch("{}.R1".format(run_ID), query_pos - 5, query_pos + 5))

                for q in reads:
                    q_id = q.qname[:-1] + "2"      # mate id (R2 read names end with "2")
                    q_mapq = q.mapq                # and, mapping quality.

                    # Search for the mate; keep pairs where both ends pass.
                    for s in fi[q_id]:
                        if (q_mapq > min_mapq) and (s.mapq > min_mapq):
                            hit_pos = ma.round_bp_pos(ma.get_hit_pos(s))
                            contig_2_contig_dict[query_pos].append(hit_pos)

        R1_sam.close()

        run_results["passed_filter"] = len(contig_2_contig_dict)

        # Start positions that never produced a passing pair (position 0 and
        # skipped entries included, matching the original behavior).
        unassociated_R1s = all_R1s.difference(contig_2_contig_dict)
        run_results["failed_filter"] = len(unassociated_R1s)

        for p in unassociated_R1s:
            low_depth_R1_starts.write("{}\n".format(p))
        low_depth_R1_starts.close()

        # Pickle the association map (handle was leaked in the original).
        pkl_output_file_name = os.path.join(path, '{}.R1_to_R2_contig_associations.pkl'.format(run_ID))
        with open(pkl_output_file_name, 'wb') as pkl_out:
            pickle.dump(contig_2_contig_dict, pkl_out)

        # Update run results and generate logfile.
        # NOTE(review): `count` is the last 0-based line index, not a count —
        # preserved as-is; confirm whether total_queries should be count + 1.
        run_results["total_queries"] = count
        run_results['pickle'] = pkl_output_file_name
        self.__associate_contigs_log__(run_results, path)        # format logging results
        return 1
Example #5
0
import sys
import os.path as op
sys.path.insert(0, op.join(op.dirname(__file__), ".."))
from fileindex import FileIndex

class SamLine(object):
    """Minimal SAM alignment record: read name, reference id, 1-based position.

    Reads exactly one line from the supplied file handle; all SAM columns
    beyond POS are intentionally ignored.
    """

    __slots__ = ('name', 'ref_loc', 'ref_seqid')

    def __init__(self, fh):
        # A SAM line is tab-delimited: QNAME, FLAG, RNAME, POS, ...
        fields = fh.readline().split("\t") or [None]
        self.name = fields[0]
        self.ref_seqid = fields[2]
        self.ref_loc = int(fields[3])

if __name__ == "__main__":
    # Demo: index a SAM file by read name (allowing multiple alignments per
    # name) and look up every alignment for a single query id.
    f = '/usr/local/src/methylcode/emen/en-data/out/methylcoded.sam'
    if not op.exists(f + FileIndex.ext):
        FileIndex.create(f, lambda fh: SamLine(fh).name, allow_multiple=True)

    fi = FileIndex(f, SamLine, allow_multiple=True)
    # Print (name, reference, position) for every alignment of read '23351265'.
    print [(s.name, s.ref_seqid, s.ref_loc) for s in fi['23351265']]

Example #6
0
import sys
import os.path as op
sys.path.insert(0, op.join(op.dirname(__file__), ".."))
from fileindex import FileIndex


class FastQEntry(object):
    """A single FASTQ record: four consecutive lines read from a file handle.

    Trailing CR/LF is stripped from each line.  `fpos` is a reserved slot
    that this constructor does not populate.
    """

    __slots__ = ('name', 'seq', 'l3', 'qual', 'fpos')

    def __init__(self, fh):
        # Pull the four record lines (header, sequence, '+', quality) in order.
        lines = [fh.readline().rstrip('\r\n') for _ in range(4)]
        self.name, self.seq, self.l3, self.qual = lines


if __name__ == "__main__":
    # Demo: index a FASTQ file by read name, then stream sequences back out.
    f = '/usr/local/src/bowtie/bowtie-0.12.1/work/reads/s_1_sequence.txt'
    N = 100

    # Index creation is unconditional: the existence guard below was
    # commented out, so the index is rebuilt on every run.
    #if not op.exists(f + FileIndex.ext):
    FileIndex.create(f, lambda fh: FastQEntry(fh).name)

    fi = FileIndex(f, FastQEntry)
    print "getting %i keys..." % N

    # NOTE(review): the break fires after index N has printed, so this emits
    # N+1 sequences, not N — confirm whether that is intended.
    for i, k in enumerate(fi.db.iterkeys(str)):
        print fi[k].seq
        if i == N: break