Esempio n. 1
0
def _main(args):

    if len(args) != 3:
        print("usage: xls_get_region_from_fasta.py <fasta> <xls> <window>")
        sys.exit(1)

    win = int(args[2])

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[0])

    seqs = []

    for ln in open(args[1]):
        sp = ln[:-1].split()
        print sp
        pk = int(sp[1]) + int(sp[4])
        seq = fasta[sp[0]]['sequence'][(pk - win):(pk + win)]
        get_in = sp[
            -1]  #raw_input(">%s:%d..%d\n\'k\'=keep; \'r\' = reverse comp; \'<anything else>\' = discard: " % (sp[0],pk-win,pk+win))
        if get_in == 'k':
            pass
        elif get_in == 'r':
            seq = fasta_subseq_2.revcomp(seq)
        else:
            continue

        seqs.append(">%s:%d..%d\n%s" % (sp[0], pk - win, pk + win, seq))

    outfile = raw_input("name of output file: ")
    outfh = open(outfile, "w")
    for s in seqs:
        print >> outfh, s
def get_enrich(xls_regs, sgr, winsize, fasta):
    """Annotate motif hits with their sequence and sgr signal statistics.

    Args:
        xls_regs: list of region dicts.  Each must carry ``'mtx1_hits'`` and
            ``'mtx2_hits'``: iterables of hit dicts with the keys ``'chr'``,
            ``'start'``, ``'end'`` and ``'strand'``.
        sgr: path to a text file whose lines hold three whitespace-separated
            fields: chromosome, integer position, integer value.
        winsize: a data point contributes to a hit's window when its
            distance to the hit location is below ``winsize / 2``.
        fasta: path of the FASTA file the hit sequences are cut from.

    Returns:
        ``xls_regs`` itself.  Every region gains ``'mtx1_hits_info'`` and
        ``'mtx2_hits_info'``: lists with one dict per hit holding
        ``'hit_obj'``, ``'loc'``, ``'seq'``, ``'nearest'`` (position, value
        of the closest data point on the hit's chromosome, ``(0, 0)`` when
        there is none), ``'vals'`` (values inside the window),
        ``'win_mean'``, ``'win_median'``, ``'enrich_md'`` and
        ``'enrich_mn'``.  The statistics are NaN (numpy warning) for hits
        with an empty window.

    Progress/debug dumps of the hit dicts are written to stderr.
    """
    fasta_db = fasta_subseq_2.FastaDB()
    fasta_db.openFastaFile(fasta)

    info_keys = ('mtx1_hits_info', 'mtx2_hits_info')

    # chromosome -> list of [hit_info, best_distance_so_far]; lets each sgr
    # line visit only the hits that live on its chromosome.
    hits_by_chrom = {}

    for reg in xls_regs:
        for x in ('mtx1_hits', 'mtx2_hits'):
            hit_info = []
            for h in reg[x]:
                width = abs(h['start'] - h['end'])
                chrom_seq = fasta_db[h['chr']]['sequence']
                if h['strand'] == "+":
                    hit_loc = h['start']
                    seq = chrom_seq[hit_loc:(hit_loc + width)]
                else:
                    # Non-'+' hits are anchored at 'end' in the hit
                    # database; change this if the hit database is fixed.
                    hit_loc = h['end']
                    seq = fasta_subseq_2.revcomp(
                        chrom_seq[hit_loc:(hit_loc + width)])
                # 'loc' now follows the strand branch.  Previously a
                # mis-indented assignment forced it to h['end'] for every
                # hit, discarding the '+' strand value.
                hit = {
                    'hit_obj': h,
                    'loc': hit_loc,
                    'nearest': (0, 0),
                    'vals': [],
                    'seq': seq,
                }
                hit_info.append(hit)
                hits_by_chrom.setdefault(h['chr'], []).append([hit, None])
            reg[x + '_info'] = hit_info

    half_win = winsize / 2
    with open(sgr) as sgr_fh:
        for line in sgr_fh:
            fields = line.split()
            if not fields:
                continue
            (chrom, loc, val) = fields
            loc = int(loc)
            val = int(val)
            for entry in hits_by_chrom.get(chrom, ()):
                d, best_dist = entry
                dist = abs(loc - d['loc'])
                # Compare against the distance of the current nearest point
                # to the hit, not against the gap between the two data
                # points, so the closest point really wins.
                if best_dist is None or dist < best_dist:
                    d['nearest'] = (loc, val)
                    entry[1] = dist
                if dist < half_win:
                    d['vals'].append(val)
                    sys.stderr.write("%s\n" % (d, ))

    for x in xls_regs:
        for hit_info in info_keys:
            for h in x[hit_info]:
                h['win_mean'] = np.mean(h['vals'])
                h['win_median'] = np.median(h['vals'])
                h['enrich_md'] = h['nearest'][1] / h['win_median']
                h['enrich_mn'] = h['nearest'][1] / h['win_mean']
                sys.stderr.write("%s\n" % (h, ))

    return xls_regs
Esempio n. 3
0
def _main(args):

    usage = "pecan_WGA_runner.py <genome_fastas_file> <treefile> <mercator_map> <mercator_genome_order> <outdir>"
    if len(args) != 5:
        print usage
        sys.exit(0)

    genome_dict = {}
    genome_order = []
    mercator_genome_order = []
    map_lines = []

    mercator_order_file = open(args[3])
    mercator_genome_order = mercator_order_file.readline().split()

    for ln in open(args[0]):
        (sp, fasta) = ln[:-1].split()
        genome_order.append(sp)
        genome_dict[sp] = fasta_subseq_2.FastaDB()
        genome_dict[sp].openFastaFile(fasta)

    tree_dict = generate_trees(genome_dict, args[1])

    for ln in open(args[2]):
        map_ln = ln[:-1].split()
        (species, map_dict) = make_map_dict(map_ln[1:], genome_order,
                                            mercator_genome_order)
        tree_obj = tree_dict[species]
        tree_strIO = StringIO()
        Phylo.write(tree_obj, tree_strIO, "newick")
        fastas = [(sp, genome_dict[sp]) for sp in genome_order
                  if (sp in species)]
        map_entry = {
            'map_dict': map_dict,
            'tree': tree_strIO.getvalue(),
            'map_idx': int(map_ln[0]),
            'fastas': fastas
        }
        map_lines.append(map_entry)

    os.chdir(args[4])

    pool = mp.Pool(2)

    #pool.map(run_aln_mapline, map_lines)
    pool.map(run_aln_mapline, map_lines)

    print "ALL DONE!"
Esempio n. 4
0
 def _buildAlnTables(self, subAlnObj, alnScore):
     """Create the per-chromosome tables and the empty alignment table.

     For every species key of ``subAlnObj.getSeqDict()`` the FASTA file
     named by ``SPP_FASTAS[species]`` is opened, a group is created under
     ``self.h5.root`` and one ``(base, aln_map)`` table per chromosome is
     filled with the chromosome's bases and a zeroed ``aln_map`` column.
     The tables are recorded in ``self.species_chrs`` and, in sorted
     chromosome order, as ``[name, table]`` pairs in
     ``self.chr_key_arrays``.  Finally ``self.aln_table`` is created with
     one ``(base, chr_key, position)`` column per species and
     ``self.built_chr_tabs`` is set.

     ``alnScore`` is accepted but not used here.
     """
     spp_names = subAlnObj.getSeqDict().keys()
     self.species = spp_names

     # Row layout shared by every chromosome table.
     chr_row_dtype = np.dtype([('base', 'a1'), ('aln_map', 'u8')])

     aln_columns = []  # alignment-table description, one entry per species
     total_len = 0  # summed chromosome lengths -> expectedrows hint

     for sp in spp_names:
         # open fasta to get unaligned chromosome lengths & seqs
         sp_fasta = fs2.FastaDB()
         sp_fasta.openFastaFile(SPP_FASTAS[sp])
         chr_names = sorted(sp_fasta.keys())

         # group for each species' chromosome arrays
         sp_grp = self.h5.createGroup(self.h5.root, sp,
                                      "%s Chromosomes" % (sp, ))
         self.species_chrs[sp] = {}
         # position in the sorted chromosome list serves as chr identifier
         self.chr_key_arrays[sp] = [None] * len(chr_names)

         for (idx, ch) in enumerate(chr_names):
             chrom = sp_fasta[ch]
             base_col = np.matrix(list(chrom.getFullSeq()),
                                  dtype=np.dtype("a1")).T
             chrom_len = len(chrom)
             # NOTE(review): the aln_map column is built one element
             # shorter than len(chrom) -- confirm what len() counts.
             map_col = np.matrix(np.zeros(chrom_len - 1),
                                 dtype=np.dtype("u8")).T

             rows = np.ndarray(shape=(len(map_col), 1), dtype=chr_row_dtype)
             rows['base'] = base_col
             rows['aln_map'] = map_col

             chr_table = self.h5.createTable(sp_grp, "chr" + ch,
                                             chr_row_dtype)
             self.species_chrs[sp][ch] = chr_table
             chr_table.append(rows)
             chr_table.flush()
             self.chr_key_arrays[sp][idx] = [ch, chr_table]

             print("%s %s length = %d added to %s" % (sp, ch, len(base_col),
                                                      sp_grp))
             total_len += chrom_len

         # add a column to the align table description
         aln_columns.append((sp, [('base', 'a1'), ('chr_key', 'u4'),
                                  ('position', 'u8')]))

     self.aln_tbl_dtype = np.dtype(aln_columns)
     # make the alignment table
     self.aln_table = self.h5.createTable(self.h5.root,
                                          'aln_table',
                                          self.aln_tbl_dtype,
                                          expectedrows=total_len)
     self.aln_table.flush()
     self.built_chr_tabs = True
def _main(args):

    if len(args) < 4:
        print >> sys.stderr, "usage: xls_motif_window.py <xls> <fasta> <matrix_file> <window>"
        sys.exit(1)

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[1])
    xls_regions = []
    for x in open(args[0]):
        spl = x[:-1].split()
        region = {
            'chr': spl[0],
            'start': int(spl[1]),
            'end': int(spl[2]),
            'enrich': spl[7]
        }
        region['seq'] = fasta[
            region['chr']]['sequence'][region['start']:region['end']]
        xls_regions.append(region)

    for r in xls_regions:
        try:
            annot = patser_tools.makePatserAnnotation(sequence=r['seq'],
                                                      matrix=args[2])
        except IOError:
            print >> sys.stderr, "Error in seq %s:%d..%d:" % (
                r['chr'], r['start'], r['end'])
            continue
        if len(annot.getAllFeatures()) < 1:
            continue
        maxhit = annot.getMaxFeature("score")
        winstart = None
        winend = None
        winseq = None
        if maxhit.tags["strand"] == '+':
            winstart = r['start'] + (maxhit.start - int(args[3]) / 2)
            winend = r['start'] + (maxhit.start + int(args[3]) / 2)
            win_seq = fasta[r['chr']]['sequence'][winstart:winend]
        else:
            winstart = r['start'] + ((maxhit.end - 3) - int(args[3]) / 2)
            winend = r['start'] + ((maxhit.end - 3) + int(args[3]) / 2)
            win_seq = fasta_subseq_2.revcomp(
                fasta[r['chr']]['sequence'][winstart:winend])
        print ">%s:%d..%d:%s enr=%s mtx=%s" % (
            r['chr'], winstart, winend, maxhit.tags['strand'], r['enrich'],
            maxhit.tags['score'])
        print win_seq
Esempio n. 6
0
def _main(args):

    if len(args) != 3:
        print "usage: <bed_file> <seq_file> <matrix>"
        sys.exit(0)

    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[1])

    bed_annots = []
    bed_in = open(args[0])

    for line in bed_in:

        spl = line[:-1].split()
        fseq = fasta[spl[0]]["sequence"][int(spl[1]):int(spl[2])]
        if spl[5] == "-":
            fseq = fasta_subseq_2.revcomp(fseq)
        #print spl
        try:
            patannot = patser_tools.makePatserAnnotation(sequence=fseq,
                                                         matrix=args[2])
        except:
            continue
        #print "-" * 30
        #print spl
        #print pp(patannot.getAllFeatures())
        bed_annots.append({
            "seq": spl[0] + "_" + spl[1] + "_" + spl[2],
            "annotation": patannot
        })

    for ann in bed_annots:
        for feat in ann["annotation"].getAllFeatures():
            print "%s\t%i\t%i\t%f\t%f\t%s" % (
                ann["seq"], feat.st, feat.en, feat.tagset["score"],
                feat.tagset["pval"], feat.tagset["strand"])
Esempio n. 7
0
def _main(args):

    if len(args) != 3:
        print "usage: patser_annotate_genome_noxgrid.py <genome_seq> <matrix_file> <matrix_name>"
        sys.exit(0)

    # open fasta
    fasta = fasta_subseq_2.FastaDB()
    fasta.openFastaFile(args[0])

    jobs = []

    for (name, chr) in fasta.items():
        srch = searchObj(chrObj=chr,
                         seq_name=name,
                         matrix=args[1],
                         matrix_name=args[2])
        print srch
        jobs.append(srch)
    print jobs

    pool = mp.Pool()

    results = pool.map(search, jobs)
Esempio n. 8
0
 def _fillWGADBFromFile(self):
     """Re-attach the tables of an already populated HDF5 file.

     Sets ``self.aln_table`` from ``self.h5.root.aln_table``.  For every
     group under the root, the group name is appended to ``self.species``
     and each child node is stored in ``self.species_chrs[species]`` under
     its name with the leading ``"chr"`` removed.  ``self.chr_key_arrays``
     is then filled in the sorted chromosome order of the FASTA file named
     by ``SPP_FASTAS[species]``; chromosomes without a stored table keep
     ``None`` and a warning is written to stderr.  Finally
     ``self.built_chr_tabs`` is set.
     """
     self.aln_table = self.h5.root.aln_table
     groups = self.h5.root._v_groups
     for sp in groups.keys():
         self.species_chrs[sp] = {}
         self.species.append(sp)
         sp_fasta = fs2.FastaDB()
         sp_fasta.openFastaFile(SPP_FASTAS[sp])
         chrs = sorted(sp_fasta.keys())
         self.chr_key_arrays[sp] = [None] * len(chrs)

         children = groups[sp]._v_children
         for node_name in children.keys():
             # Strip only the leading prefix: replace("chr", "") would
             # also mangle chromosome names that contain "chr" themselves.
             if node_name.startswith("chr"):
                 ch = node_name[len("chr"):]
             else:
                 ch = node_name
             self.species_chrs[sp][ch] = children[node_name]

         # NOTE(review): entries here are bare tables; confirm that the
         # readers of chr_key_arrays expect this shape.
         for (i, chrom) in enumerate(chrs):
             try:
                 self.chr_key_arrays[sp][i] = self.species_chrs[sp][chrom]
             except KeyError:
                 sys.stderr.write("WARNING: chromsome %s not found\n" %
                                  (chrom, ))
     self.built_chr_tabs = True