Exemple #1
0
def screen_maf(qa_file, maf_file):
    """
    Screen the .maf file based on the cluster info in the qa_file.

    Anchors whose last field (the score) is non-zero are collected from the
    clusters returned by ``read_clusters(qa_file)``.  Each record of
    ``maf_file`` is converted with ``alignment_to_cluster`` and copied to
    ``maf_file + ".filtered"`` when the first element of the converted
    cluster is one of the collected anchors.
    """
    clusters = read_clusters(qa_file)
    filtered_maf = maf_file + ".filtered"

    screened_alignments = set()
    for cluster in clusters:
        for anchor in cluster:
            # the score is stored as the last field of the anchor
            if anchor[-1] != 0:
                screened_alignments.add(anchor)

    # Both handles are closed on every exit path so the filtered output is
    # always flushed to disk.
    with open(maf_file) as fp, open(filtered_maf, "w") as fw:
        reader = maf.Reader(fp)
        writer = maf.Writer(fw)

        for rec in reader:
            alignment = [
                (c.src, c.forward_strand_start, c.forward_strand_end,
                 c.strand, rec.score)
                for c in rec.components
            ]

            cluster = alignment_to_cluster(alignment)
            if cluster[0] in screened_alignments:
                writer.write(rec)

    # NOTE(review): the reported number is the size of the screened anchor
    # set, not a count of the records actually written.
    sys.stderr.write("write (%d) alignments to '%s'\n" %
                     (len(screened_alignments), filtered_maf))
Exemple #2
0
    def build_index(self, filename, indexfile):
        """
        Build an interval index for the MAF file ``filename`` and write it
        to ``indexfile``.

        For every component of every record, the component's source name and
        forward-strand interval are registered together with the file offset
        at which the record starts.

        Recipe from Brad Chapman's blog
        <http://bcbio.wordpress.com/2009/07/26/sorting-genomic-alignments-using-python/>
        """
        indexes = interval_index_file.Indexes()

        with open(filename) as in_handle:
            reader = maf.Reader(in_handle)
            while True:
                # offset of the record that is about to be read
                pos = reader.file.tell()
                rec = next(reader)
                if rec is None:
                    break
                for c in rec.components:
                    indexes.add(
                        c.src,
                        c.forward_strand_start,
                        c.forward_strand_end,
                        pos,
                        max=c.src_size,
                    )

        # NOTE(review): bx-python serialises the index as packed binary data,
        # so the handle is opened in binary mode -- confirm against the
        # installed bx-python version.
        with open(indexfile, "wb") as index_handle:
            indexes.write(index_handle)
Exemple #3
0
def maf_to_blast8(f):
    """
    Print one tab-separated line per record of the MAF file ``f``.

    Each record must hold exactly two components (query and subject).  The
    printed fields are: query, subject, percent identity, hit length,
    mismatches, gaps, query start, query stop, subject start, subject stop,
    e-value and bit score.  The last two are derived from the record score
    with ``blastz_score_to_ncbi_expectation`` and
    ``blastz_score_to_ncbi_bits``.

    If the file cannot be opened or parsed as MAF, a warning is written to
    stderr and the function returns without printing anything.
    """
    sys.stderr.write("reading %s\n" % f)
    fp = None
    try:
        fp = open(f)
        reader = maf.Reader(fp)
    except Exception:
        # best effort: an unreadable input is reported and skipped, but
        # KeyboardInterrupt / SystemExit are no longer swallowed
        if fp is not None:
            fp.close()
        sys.stderr.write("[warning] %s not readable\n" % f)
        return

    try:
        for rec in reader:
            a, b = rec.components
            query = a.src
            subject = b.src
            qstart = a.forward_strand_start
            qstop = a.forward_strand_end
            sstart = b.forward_strand_start
            sstop = b.forward_strand_end
            score = rec.score

            evalue = blastz_score_to_ncbi_expectation(score)
            score = blastz_score_to_ncbi_bits(score)
            evalue, score = "%.2g" % evalue, "%.1f" % score
            hitlen = len(a.text)

            pctid, nmismatch, ngaps = alignment_details(a.text, b.text)
            fields = (query, subject, pctid, hitlen, nmismatch, ngaps,
                      qstart, qstop, sstart, sstop, evalue, score)
            print("\t".join(str(x) for x in fields))
    finally:
        fp.close()
Exemple #4
0
def main(in_file):
    """
    Write the records of the MAF file ``in_file`` to ``<base>-sorted<ext>``,
    ordered by decreasing ``text_size``.

    An index file ``in_file + ".index"`` is built with ``build_index`` when
    it does not exist yet; records are then fetched through that index by
    their file offset.
    """
    stem, suffix = os.path.splitext(in_file)
    out_file = "%s-sorted%s" % (stem, suffix)
    index_file = in_file + ".index"
    if not os.path.exists(index_file):
        build_index(in_file, index_file)

    # first pass: remember (size, offset) for every record
    sizes_and_offsets = []
    with open(in_file) as in_handle:
        reader = maf.Reader(in_handle)
        while True:
            offset = reader.file.tell()
            record = reader.next()
            if record is None:
                break
            sizes_and_offsets.append((record.text_size, offset))
    largest_first = sorted(sizes_and_offsets, reverse=True)

    # second pass: emit the records in sorted order via the index
    index = maf.Indexed(in_file, index_file)
    with open(out_file, "w") as out_handle:
        writer = maf.Writer(out_handle)
        for _size, offset in largest_first:
            writer.write(index.get_at_offset(offset))
Exemple #5
0
def __main__():
    """
    Write the components of a MAF file as FASTA-like entries.

    Command-line arguments (``sys.argv``):
      1. input MAF filename
      2. output filename
      3. comma-separated species list; if it contains ``"None"`` the blocks
         are not restricted to the listed species
      4. partial flag; with ``"partial_disallowed"``, blocks holding fewer
         components than the number of listed species are skipped

    Each component produces a header line
    ``>src(strand):start-end|species_blocknumber`` followed by its text;
    every emitted block is terminated by an empty line.
    """
    print("Restricted to species: %s" % sys.argv[3])

    input_filename = sys.argv[1]
    output_filename = sys.argv[2]
    species = sys.argv[3].split(',')
    partial = sys.argv[4]
    num_species = len(species)

    # both files are closed even if a block fails to parse or write
    with open(input_filename, 'r') as file_in, \
            open(output_filename, 'w') as file_out:
        # block_num counts every block read, including skipped ones
        for block_num, block in enumerate(maf.Reader(file_in)):
            if "None" not in species:
                block = block.limit_to_species(species)
            components = block.components
            if len(components) < num_species and partial == "partial_disallowed":
                continue
            for c in components:
                spec = maf.src_split(c.src)[0]
                file_out.write(">" + c.src + "(" + c.strand + "):" +
                               str(c.start) + "-" + str(c.end) + "|" +
                               spec + "_" + str(block_num) + "\n")
                file_out.write(c.text + "\n")
            file_out.write("\n")
def main(options, args):
    """
    Filter the MAF file ``args[0]`` and write the kept records to
    ``<base>-filtered<ext>``.

    Every component of every record becomes a ``Weighted_interval`` weighted
    by the record score, and its two endpoints are stored per chromosome.
    ``interval_chain`` is run on each chromosome's endpoints; the union of
    its results decides which records are written.  Records are fetched by
    file offset through an index (``in_file + ".index"``), which is built
    with ``build_index`` if missing.

    ``options`` is not used.
    """
    in_file = args[0]
    base, ext = os.path.splitext(in_file)
    out_file = "%s-filtered%s" % (base, ext)
    index_file = in_file + ".index"
    if not os.path.exists(index_file):
        build_index(in_file, index_file)
    index = maf.Indexed(in_file, index_file)

    intervals = []  # give each interval a unique id
    endpoints = collections.defaultdict(list)  # chromosome => endpoints
    filtered_rec = set()
    rec_info = []
    j = 0  # running interval id, one per component

    with open(in_file) as fp:
        reader = maf.Reader(fp)
        while True:
            pos = reader.file.tell()
            rec = next(reader)
            if rec is None:
                break
            # Recorded only for real records: appending before the EOF check
            # left a phantom entry pointing at the end of the file.
            # NOTE(review): j // 2 as record id assumes two components per
            # record -- confirm against interval_chain's return values.
            rec_info.append((j // 2, pos))
            for c in rec.components:
                chromosome, left, right, weight = (
                    c.src, c.forward_strand_start,
                    c.forward_strand_end, rec.score)

                intervals.append(
                    Weighted_interval(chromosome, left, right, weight))
                endpoints[chromosome].append((left, j, -weight))  # left end
                endpoints[chromosome].append((right, j, weight))  # right end
                j += 1

    for chromosome in sorted(endpoints):
        v = endpoints[chromosome]
        # two endpoints were stored per interval
        print("%s : start with %d intervals" % (chromosome, len(v) // 2))
        filtered_rec |= interval_chain(intervals, v)

    print("filtered alignment size %d" % len(filtered_rec))

    with open(out_file, "w") as fw:
        writer = maf.Writer(fw)
        for rec_id, pos in rec_info:
            if rec_id in filtered_rec:
                writer.write(index.get_at_offset(pos))
def __main__():
    """
    Convert a MAF file into FASTA-style output, one entry per component.

    Command-line arguments (``sys.argv``): input MAF, output file, species
    option (parsed by ``maf_utilities.parse_species_option``) and the
    keep-partial flag.  Any failure while reading these is reported through
    ``maf_utilities.tool_fail``.

    When a species restriction is active, each block is limited to those
    species, and with the flag ``"partial_disallowed"`` blocks containing
    fewer species than requested are skipped.  For each component a header
    from ``maf_utilities.get_fasta_header`` and the component text are
    written; each block is followed by an empty line.
    """
    try:
        maf_reader = maf.Reader(open(sys.argv[1]))
    except Exception as err:
        maf_utilities.tool_fail("Error opening input MAF: %s" % err)
    try:
        file_out = open(sys.argv[2], 'w')
    except Exception as err:
        maf_utilities.tool_fail("Error opening file for output: %s" % err)
    try:
        species = maf_utilities.parse_species_option(sys.argv[3])
        num_species = len(species) if species else 0
    except Exception as err:
        maf_utilities.tool_fail("Error determining species value: %s" % err)
    try:
        partial = sys.argv[4]
    except Exception as err:
        maf_utilities.tool_fail("Error determining keep partial value: %s" % err)

    if not species:
        print("Not restricted to species.")
    else:
        print("Restricted to species: %s" % ', '.join(species))

    for block_num, block in enumerate(maf_reader):
        if species:
            block = block.limit_to_species(species)
            species_found = len(maf_utilities.get_species_in_block(block))
            if species_found < num_species and partial == "partial_disallowed":
                continue

        # per-block, zero-based occurrence counter for each species
        spec_counts = {}
        for component in block.components:
            spec, chrom = maf_utilities.src_split(component.src)
            spec_counts[spec] = spec_counts.get(spec, -1) + 1
            seq_index = spec_counts[spec]

            header_fields = {
                'block_index': block_num,
                'species': spec,
                'sequence_index': seq_index
            }
            header = maf_utilities.get_fasta_header(
                component,
                header_fields,
                suffix="%s_%i_%i" % (spec, block_num, seq_index))
            file_out.write("%s\n" % header)
            file_out.write("%s\n" % component.text)
        file_out.write("\n")
    file_out.close()
def main(options, args):
    """
    Copy the records of a MAF file that align ``chr1`` with ``chr2``.

    ``args`` must unpack to ``(in_file, chr1, chr2)``.  A record is kept
    when the sources of its first two components equal ``chr1`` and
    ``chr2`` in either order.  Output goes to
    ``<base>.<chr1>_vs_<chr2>_filtered<ext>``.

    ``options`` is not used.
    """
    in_file, chr1, chr2 = args
    base, ext = os.path.splitext(in_file)
    out_file = "%s.%s_vs_%s_filtered%s" % (base, chr1, chr2, ext)

    # closing both handles guarantees the filtered output is flushed
    with open(in_file) as fp, open(out_file, "w") as fw:
        reader = maf.Reader(fp)
        writer = maf.Writer(fw)
        for rec in reader:
            c1, c2 = rec.components[0].src, rec.components[1].src
            if (chr1, chr2) == (c1, c2) or (chr1, chr2) == (c2, c1):
                writer.write(rec)
Exemple #9
0
def main():
    """
    Scan a MAF file for a motif and print the positions scoring above zero.

    Command-line arguments (``sys.argv``): the target motif, the input MAF
    filename and a comma-separated species list.  With fewer arguments a
    usage line is written to stderr and the program exits with status 0.

    For every scoring result yielded by ``MafMotifScorer`` and every offset
    in it, a line is printed as soon as at least one species scores above
    the threshold.  The line holds the chromosome, the start and end
    relative to the reference text with gaps discounted, the quoted motif,
    and the scores of all species at that offset.
    """
    if len(sys.argv) < 4:
        print("%s motif inmaf spec1,spec2,... " % sys.argv[0], file=sys.stderr)
        sys.exit(0)

    targmotif = sys.argv[1]
    threshold = 0
    species = sys.argv[3].split(',')
    species_idx = range(len(species))

    with open(sys.argv[2]) as inmaf:
        for block in align_maf.Reader(inmaf):
            reftext = block.components[0].text

            # maf block scores for each matrix
            for scoremax, width, headers in MafMotifScorer(species, block,
                                                           targmotif):
                mafsrc, mafstart, _mafend = headers[0]
                mafchrom = mafsrc.split('.')[1]

                for offset in range(width):
                    # report the offset once if any species passes
                    if not any(scoremax[i][offset] > threshold
                               for i in species_idx):
                        continue
                    # gaps before the offset do not consume reference bases
                    refstart = mafstart + offset - reftext.count(
                        '-', 0, offset)
                    refend = refstart + len(targmotif)
                    data = " ".join(
                        "%.2f" % scoremax[i][offset] for i in species_idx)
                    # quote the motif
                    print(mafchrom, refstart, refend,
                          "'" + targmotif + "'", data)