Example #1
0
def revcomp(par):
    """Write reversed and/or complemented copies of FASTA records.

    Keys read from ``par``:
        inp_f: input FASTA, opened with ``utils.openr``.
        out_f: output FASTA, opened with ``utils.openw``.
        reverse / complement: which transformation(s) to apply.  Both set
            gives the reverse complement (description "RC"); only one set
            gives the reversed ("R") or complemented ("C") sequence.  With
            neither set no record is written.

    Every output record keeps the id of its input record.
    """
    # SeqRecord only offers reverse_complement(); the plain reverse and the
    # plain complement have to be built from the underlying Seq object.
    from Bio.SeqRecord import SeqRecord

    do_rev, do_comp = par["reverse"], par["complement"]

    def transform(rec):
        if do_comp and do_rev:
            return rec.reverse_complement(id=rec.id, description="RC")
        if do_rev:
            return SeqRecord(rec.seq[::-1], id=rec.id, description="R")
        return SeqRecord(rec.seq.complement(), id=rec.id, description="C")

    with utils.openw(par["out_f"]) as outf:
        if not (do_rev or do_comp):
            # Nothing requested: keep producing an (empty) output file.
            SeqIO.write([], outf, "fasta")
            return

        # Open the input in a with-block so the handle is closed when done.
        with utils.openr(par["inp_f"]) as inpf:
            SeqIO.write((transform(r) for r in SeqIO.parse(inpf, "fasta")), outf, "fasta")
Example #2
0
def revcomp(par):
    """Reverse and/or complement every record of a FASTA file.

    ``par['inp_f']`` is read through ``utils.openr`` and the result is
    written to ``par['out_f']`` through ``utils.openw``.  ``par['reverse']``
    and ``par['complement']`` select the operation: both -> reverse
    complement (description "RC"), reverse only -> "R", complement only ->
    "C".  If neither is set the output is left empty.  Record ids are kept.
    """
    # Local import: SeqRecord has no reverse()/complement() methods, so the
    # single-operation records are rebuilt from the transformed Seq.
    from Bio.SeqRecord import SeqRecord

    reverse, complement = bool(par['reverse']), bool(par['complement'])

    def converted(handle):
        for rec in SeqIO.parse(handle, "fasta"):
            if reverse and complement:
                yield rec.reverse_complement(id=rec.id, description="RC")
            elif reverse:
                yield SeqRecord(rec.seq[::-1], id=rec.id, description="R")
            else:
                yield SeqRecord(rec.seq.complement(), id=rec.id,
                                description="C")

    with utils.openw(par['out_f']) as outf:
        if reverse or complement:
            # The with-block closes the input handle once it is consumed.
            with utils.openr(par['inp_f']) as inpf:
                SeqIO.write(converted(inpf), outf, "fasta")
        else:
            SeqIO.write([], outf, "fasta")
def parse_primersearch( fn ):
    """Collect the amplimers listed in a primersearch-style report.

    The file is read line by line (stripped).  Lines are recognised by
    their text: an "Amplimer ..." header starts a record, "Sequence:" gives
    the sequence name, the "hits forward/reverse strand at" lines give the
    primer (first token) and its position, and the "Amplimer length" line
    closes the record.

    Returns:
        dict mapping each header line to a dict with the keys 'seq', 'fs',
        'rs', 'al', 'rseq' and 'fseq'.  A field is None when its line did
        not appear before the closing "Amplimer length" line.
    """
    fields = ('seq', 'fs', 'rs', 'al', 'rseq', 'fseq')
    amplimers = {}
    header = None
    current = dict.fromkeys(fields)
    with utils.openr( fn, "U" ) as inpf:
        for raw in inpf:
            line = raw.strip()
            is_length = 'Amplimer length' in line
            if line.startswith("Amplimer") and not is_length:
                header = line
            elif line.startswith("Sequence"):
                current['seq'] = line.split("Sequence:")[1].strip()
            elif 'hits forward strand at ' in line:
                current['fseq'] = line.split()[0]
                position = line.split("hits forward strand at ")[1].split("with")[0]
                current['fs'] = int(position)
            elif 'hits reverse strand at ' in line:
                current['rseq'] = line.split()[0]
                position = line.split("hits reverse strand at ")[1].split("with")[0].strip()
                # The reverse position is wrapped in one leading and one
                # trailing character, which are dropped before parsing.
                current['rs'] = int(position[1:-1])
            elif is_length:
                current['al'] = int(line.split("Amplimer length: ")[1].split("bp")[0])
                amplimers[header] = current
                # Start from a clean slate for the next amplimer.
                header = None
                current = dict.fromkeys(fields)
    return amplimers
Example #4
0
def blast_ncbi_outfmt6_screen(par):
    """Filter, optionally sort and truncate, tab-separated BLAST hits.

    Rows are read from ``par['inp_f']`` (stdin when empty) and written to
    ``par['out_f']`` (stdout when empty).  A row is kept when its identity,
    length and bit score are >= ``par['pid']``, ``par['length']`` and
    ``par['bitscore']`` and its e-value is <= ``par['evalue']``.  The
    1-based column holding each value is given by the matching ``*_col``
    key.

    Optional keys:
        s: 'pid', 'evalue', 'length' or 'bitscore'; sort the kept rows in
           ascending order of that column.
        n: with ``s`` set and ``n > -1``, keep only the first ``n`` rows.
        t: when ``> -1``, write at most ``t`` rows per query (first column).

    Raises:
        ValueError: if ``par['s']`` is not one of the sortable fields.
    """
    finp,fout = bool(par['inp_f']), bool(par['out_f'])

    inp_file = utils.openr(par['inp_f']) if finp else sys.stdin
    try:
        inp_mat = (l.rstrip('\n').split("\t") for l in inp_file)

        out_mat =(l for l in inp_mat
                        if float(l[par['pid_col']-1]) >= par['pid'] and
                           float(l[par['length_col']-1]) >= par['length'] and
                           float(l[par['evalue_col']-1]) <= par['evalue'] and
                           float(l[par['bitscore_col']-1]) >= par['bitscore']  )

        if 's' in par and par['s']:
            if par['s'] not in ('pid', 'evalue', 'length', 'bitscore'):
                raise ValueError("unknown sort field: %r" % (par['s'],))
            # All *_col options are 1-based; convert exactly once, and use
            # the column option rather than the threshold value.
            col = par[par['s'] + '_col'] - 1

            out_mat = sorted( out_mat,
                              key=lambda x: float(x[col]) )

            if 'n' in par and par['n'] is not None and par['n'] > -1:
                out_mat = out_mat[:par['n']]

        per_query = par['t'] if 't' in par else None
        if per_query is not None and per_query <= -1:
            per_query = None

        unique_queries = collections.defaultdict( int )
        out_file = utils.openw(par['out_f']) if fout else sys.stdout
        try:
            for l in out_mat:
                if per_query is not None:
                    unique_queries[l[0]] += 1
                    if unique_queries[l[0]] > per_query:
                        continue
                out_file.write("\t".join(l)+"\n")
        finally:
            # Only close what was opened here; stdout stays usable.
            if fout:
                out_file.close()
    finally:
        if finp:
            inp_file.close()
Example #5
0
def blast_ncbi_outfmt6_screen(par):
    """Screen tab-separated BLAST hits by identity, length, e-value, score.

    Input comes from ``par['inp_f']`` (stdin if empty), output goes to
    ``par['out_f']`` (stdout if empty).  ``par['pid_col']``,
    ``par['length_col']``, ``par['evalue_col']`` and ``par['bitscore_col']``
    are the 1-based columns compared against ``par['pid']``,
    ``par['length']``, ``par['bitscore']`` (minimums) and ``par['evalue']``
    (maximum).

    Optional keys: ``s`` ('pid', 'evalue', 'length' or 'bitscore') sorts the
    surviving rows ascending by that column; ``n`` (> -1, only with ``s``)
    keeps the first ``n`` sorted rows; ``t`` (> -1) caps the number of rows
    written per query id (first column).

    Raises:
        ValueError: when ``par['s']`` names an unsupported field.
    """
    finp, fout = bool(par['inp_f']), bool(par['out_f'])
    sort_fields = ('pid', 'evalue', 'length', 'bitscore')

    def keep(row):
        return (float(row[par['pid_col'] - 1]) >= par['pid']
                and float(row[par['length_col'] - 1]) >= par['length']
                and float(row[par['evalue_col'] - 1]) <= par['evalue']
                and float(row[par['bitscore_col'] - 1]) >= par['bitscore'])

    source = utils.openr(par['inp_f']) if finp else sys.stdin
    try:
        rows = (row for row in (l.rstrip('\n').split("\t") for l in source)
                if keep(row))

        sort_by = par['s'] if 's' in par else None
        if sort_by:
            if sort_by not in sort_fields:
                raise ValueError("unknown sort field: %r" % (sort_by,))
            # Look up the column option of the chosen field (not its
            # threshold) and convert the 1-based index a single time.
            col = par[sort_by + '_col'] - 1
            rows = sorted(rows, key=lambda x: float(x[col]))

            if 'n' in par and par['n'] is not None and par['n'] > -1:
                rows = rows[:par['n']]

        limit = None
        if 't' in par and par['t'] is not None and par['t'] > -1:
            limit = par['t']

        sink = utils.openw(par['out_f']) if fout else sys.stdout
        try:
            seen = collections.defaultdict(int)
            for row in rows:
                if limit is not None:
                    seen[row[0]] += 1
                    if seen[row[0]] > limit:
                        continue
                sink.write("\t".join(row) + "\n")
        finally:
            # Leave sys.stdout open; close only a file opened here.
            if fout:
                sink.close()
    finally:
        if finp:
            source.close()
Example #6
0
import utils

try:
    import argparse as ap
    import bz2 
except ImportError:
    sys.stderr.write( "argparse not found" )
    sys.exit(-1)

def read_params( args ):
    """Parse the command line of the txt-to-libsvm converter.

    Note: ``args`` is accepted but not used; argparse reads ``sys.argv``
    itself.

    Returns:
        dict with the optional positionals 'txt' (input) and 'ls' (output),
        each None when not given.
    """
    parser = ap.ArgumentParser(description='Convert txt files to libsvm\n')

    positionals = (
        ('txt', "the input txt file [stdin if not present]"),
        ('ls', "the output ilibsvm file compressed if fiven with bz2 extension\n"
               "[stdout if not present]"),
    )
    for dest, text in positionals:
        parser.add_argument(dest, nargs='?', default=None, type=str, help=text)

    namespace = parser.parse_args()
    return vars(namespace)

if __name__ == "__main__":
    # Transpose a tab-separated table and write one line per original
    # column (skipping the first): the column's first cell followed by
    # "<1-based index>:<value>" for each remaining cell, tab-joined.
    args = read_params( sys.argv )

    with utils.openr(args['txt']) as inp:
        # zip() is lazy on Python 3; materialise it so it can be sliced.
        data = list(zip(*[l.strip().split('\t') for l in inp]))

    outd = [[d[0]]+[str(i+1)+":"+dd for i,dd in enumerate(d[1:])] for d in data[1:]]
    with utils.openw(args['ls']) as out:
        for o in outd:
            out.write( "\t".join(o) +"\n" )
    arg("-a", default=None, type=int, help="number of char after the match to report")
    arg("-n", default=None, type=int, help="number of matching primers")

    parser.add_argument("-s", metavar="Subsequene to look for", required=True, type=str)

    return vars(parser.parse_args())


if __name__ == "__main__":
    par = read_params(sys.argv)

    ss = par["s"].lower()
    ssr = Seq(par["s"]).reverse_complement().lower()
    f = os.path.basename(par["inp_f"]).split(".")[0]
    with utils.openw(par["out_f"]) as outf:
        for r in SeqIO.parse(utils.openr(par["inp_f"]), "fasta"):
            rl = r.seq.lower()
            if ss in rl or ssr in rl:
                if par["a"]:
                    if ss in rl:
                        i = str(rl).index(str(ss))
                        subs = rl[i : i + len(ss) + par["a"]] if i + len(ss) + par["a"] < len(rl) else rl[i:]
                    else:
                        i = str(rl).index(str(ssr))
                        subs = rl[i : i + len(ssr) + par["a"]] if i + len(ssr) + par["a"] < len(rl) else rl[i:]
                    outf.write(f + "\t" + str(r.id) + "\t" + str(subs) + "\n")
                else:
                    if par["n"]:
                        n = str(rl).count(str(ss)) + str(rl).count(str(ssr))
                        outf.write(f + "\t" + str(r.id) + "\t" + str(n) + "\n")
                    else:
Example #8
0
    uc2cl = collections.defaultdict( set )
   
    if not args['g2t'] and not args['t2g']:
        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
        sys.exit(0)
    g2t = {}
    if args['g2t']:
        with open( args['g2t'] ) as inp:
            g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
    elif args['t2g']:
        with open( args['t2g'] ) as inp:
            for ll in (l.strip().split('\t') for l in inp):
                for g in ll[1:]:
                    g2t[int(g)] = int(ll[0])
    
    with utils.openr( args['ctxt'] ) as inp:
        valin = (l.strip().split('\t') for l in inp)

        g2c = collections.defaultdict( set )
        
        if args['b6o']:
            inp_mat = ((int(a),int(b)) for a,b in (l.rstrip('\n').split("\t")[:2] for l in utils.openr(args['b6o'])))
    
            #all_targets = set()
            for fr,to in inp_mat:
                #all_targets.add( to )
                if fr != to:
                    g2c[fr].add( to )

        n = args['n'] # if args['n'] else len(all_targets)
        n = float(n)
Example #9
0
        '--txt',
        required=True,
        default=None,
        type=str,
        help=
        "the table of the samples to profiles [tab-delimited, columns ID are profileName]"
    )
    p.add_argument('--nmiss', default=0, type=int)

    return vars(p.parse_args())


if __name__ == "__main__":
    args = read_params(sys.argv)

    fna = SeqIO.to_dict(SeqIO.parse(utils.openr(args['fna']), "fasta"))
    fna_out = []

    profiles = {}
    mlst_names = []
    with utils.openr(args['txt']) as inp:
        for i, line in enumerate(inp):
            if i == 0:
                mlst_names = line.strip().split('\t')[1:]
                continue
            l = line.strip().split('\t')
            profiles[l[0]] = dict([(na, l[n + 1])
                                   for n, na in enumerate(mlst_names)])
    for s, p in profiles.items():
        seq = ""
        skip = 0
Example #10
0
def sss(par):
    """Select, subsample, shuffle and/or split the records of a FASTA file.

    Keys read from ``par``:
        inp_f: input FASTA, opened with ``utils.openr``.
        out_f: output path; when empty all records go to stdout.
        split: number of output files (read only when ``out_f`` is set).
            For values other than 1 the files are named
            ``out_f + <zero-padded index> + ".fna"`` (plus ".bz2" when
            ``out_f`` ends with ".bz2").
        min_len / max_len: when truthy, drop shorter / longer records.
        select, ids, reverse: when ``select`` is truthy only records whose
            id is listed are kept (or dropped, if ``reverse`` is truthy).
            ``ids`` is either an existing file (first tab-separated field
            of each line) or a ":::"-separated string whose items may
            carry a "$"-delimited prefix.
        subsample: when truthy, the probability of keeping each record.
        randomize: when truthy, shuffle the kept records before writing.

    Kept records are dealt to the output streams round-robin.
    """
    subsample = bool(par['subsample'])
    select = bool(par['select'])
    randomize = bool(par['randomize'])

    owns_streams = bool(par['out_f'])
    if owns_streams:
        n = par['split']
        if n == 1:
            out_stream = [utils.openw(par['out_f'])]
        else:
            suffix = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
            width = len(str(n))
            out_stream = [
                utils.openw(par['out_f'] + str(r).zfill(width) + suffix)
                for r in range(n)
            ]
    else:
        out_stream = [sys.stdout]

    try:
        if select:
            if os.path.exists(par['ids']):
                with utils.openr(par['ids']) as ids_f:
                    es = set(s.strip().split('\t')[0] for s in ids_f)
            else:
                es = set((s.split("$")[1] if s.count("$") else s)
                         for s in par['ids'].split(":::"))

        all_reads = []
        nstreams = len(out_stream)

        p = par['subsample']
        cind = 0
        lmin, lmax = par['min_len'], par['max_len']
        with utils.openr(par['inp_f']) as inpf:
            for r in SeqIO.parse(inpf, "fasta"):
                if lmin and len(r.seq) < lmin:
                    continue
                if lmax and len(r.seq) > lmax:
                    continue
                if select:
                    if par['reverse']:
                        if r.id in es:
                            continue
                    elif r.id not in es:
                        continue
                if subsample and rnd.random() > p:
                    continue
                if randomize:
                    # Shuffling needs the full set; write after the loop.
                    all_reads.append(r)
                    continue
                SeqIO.write(r, out_stream[cind], "fasta")
                cind = (cind + 1) % nstreams

        if randomize:
            rnd.shuffle(all_reads)
            # Round-robin over the shuffled list gives an even random split
            # and cannot divide by zero when there are few reads.
            for i, r in enumerate(all_reads):
                SeqIO.write(r, out_stream[i % nstreams], "fasta")
    finally:
        # Close the files opened here even on error; leave stdout alone.
        if owns_streams:
            for o in out_stream:
                o.close()
Example #11
0
    p = ap.ArgumentParser(description='Convert core gene files to core gene summaries\n')

    p.add_argument( 'cg', nargs='?', default=None, type=str,
            help=   "the input cg file [stdin if not present]")
    p.add_argument('cgs', nargs='?', default=None, type=str,
            help=   "the output summary file\n"
                    "[stdout if not present]")

    return vars( p.parse_args() )

if __name__ == "__main__":
    args = read_params( sys.argv )

    gid2cores = collections.defaultdict( set )
    #with (open(args['uc']) if args['uc'] else sys.stdin) as inp:
    with utils.openr(args['cg']) as inp:
        for line in (l.split('\t') for l in inp):
            if int(line[0]) > 0:
                gid,clade,ncore,ngenomes,pv =  line[:5]
            else:
                gid,clade,ncore,ngenomes,pv =  line[1:6]
            gid2cores[gid].add( (clade,ncore,ngenomes,pv) )

    clades2cores = collections.defaultdict( set )
    for k,v in gid2cores.items():
        if len(v) > 1:
            continue
        clades2cores[list(v)[0][0]].add( k )

    #openw = bz2.BZ2File if args['txt'].endswith(".bz2") else open
    with utils.openw(args['cgs']) as out:
Example #12
0
    p.add_argument( 'fna', nargs='?', default=None, type=str,
            help=   "the input uc file [stdin if not present]")
    p.add_argument('rxl', nargs='?', default=None, type=str,
            help=   "the output txt file compresse if fiven with bz2 extension\n"
                    "[stdout if not present]")
    """
    p.add_argument('--subsample', metavar="Subsampling rate",
            default=1.0, type=float )
    p.add_argument('-n', metavar="Minimum number of matching taxa",
            default=0, type=int )
    p.add_argument('-p', metavar="Prefix for taxon names",
            default="", type=str )
    """
    return vars( p.parse_args() )

if __name__ == "__main__":
    # Write the FASTA records as "<name><padding><sequence>" lines below a
    # "<number of records> <length of the first record>" header.
    # NOTE(review): this looks like a PHYLIP-style layout -- confirm.
    args = read_params( sys.argv )

    with utils.openr(args['fna']) as inp:
        fna = SeqIO.to_dict(SeqIO.parse( inp, "fasta"))

    if not fna:
        sys.stderr.write("Error: no sequences found in the input\n")
        sys.exit(1)

    with utils.openw(args['rxl']) as out:
        # dict views cannot be indexed on Python 3; take the first record.
        n = len(next(iter(fna.values())))
        out.write( str(len(fna))+" "+str(n)+"\n" )

        for k,v in fna.items():
            # Names are cut to at most 14 characters before padding.
            if len(k) > 14:
                k = k[:14]
            out.write( str(k)+" "*(15-len(str(k)[1:]))+str(v.seq) +"\n" )
Example #13
0
def sss( par ):
    """Filter a FASTA file and distribute the kept records over the outputs.

    ``par`` supplies: ``inp_f`` (input), ``out_f`` and ``split`` (output
    path and number of files; stdout when ``out_f`` is empty), ``min_len``
    / ``max_len`` (length bounds, ignored when falsy), ``select`` / ``ids``
    / ``reverse`` (id-based selection, inverted by ``reverse``),
    ``subsample`` (keep probability, ignored when falsy) and ``randomize``
    (shuffle the kept records before writing).

    Records are written round-robin to the output streams.  Files opened
    here are always closed, even when an error occurs.
    """
    select = bool(par['select'])
    randomize = bool(par['randomize'])

    out_stream, opened_here = _sss_open_outputs( par )
    try:
        es = _sss_wanted_ids( par['ids'] ) if select else None
        nstreams = len( out_stream )

        with utils.openr(par['inp_f']) as inpf:
            reads = _sss_filtered( SeqIO.parse( inpf, "fasta"), par, es )
            if randomize:
                # Shuffling requires all kept records in memory.
                reads = list(reads)
                rnd.shuffle(reads)
            for i,r in enumerate(reads):
                SeqIO.write(r, out_stream[i % nstreams], "fasta")
    finally:
        if opened_here:
            for o in out_stream:
                o.close()


def _sss_open_outputs( par ):
    """Return (list of output streams, whether this code opened them)."""
    if not par['out_f']:
        return [sys.stdout], False
    n = par['split']
    if n == 1:
        return [utils.openw( par['out_f'])], True
    suffix = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
    width = len(str(n))
    return [utils.openw( par['out_f']+str(r).zfill(width)+suffix) for r in range(n)], True


def _sss_wanted_ids( ids ):
    """Build the id set from a file (first tab field) or a ':::' list."""
    if os.path.exists(ids):
        with utils.openr(ids) as ids_f:
            return set(s.strip().split('\t')[0] for s in ids_f)
    return set((s.split("$")[1] if s.count("$") else s) for s in ids.split(":::"))


def _sss_filtered( records, par, es ):
    """Yield the records passing the length, selection and sampling filters."""
    subsample = bool(par['subsample'])
    p = par['subsample']
    lmin,lmax = par['min_len'], par['max_len']
    for r in records:
        if lmin and len(r.seq) < lmin:
            continue
        if lmax and len(r.seq) > lmax:
            continue
        # With par['reverse'] listed ids are dropped instead of kept.
        if es is not None and (r.id in es) == bool(par['reverse']):
            continue
        if subsample and rnd.random() > p:
            continue
        yield r
Example #14
0
 def __init__( self, fn, min_len = None, max_len = None ):
     """Store the length bounds and open the input.

     fn: path handed to ``utils.openr``; stdin is used when it is falsy.
     min_len / max_len: stored as given for use by other methods.
     """
     self.ret = False
     self.min_len = min_len
     self.max_len = max_len
     if bool(fn):
         self.inp = utils.openr(fn)
     else:
         self.inp = sys.stdin
     # `read` is defined elsewhere in the file (not visible here);
     # presumably an empty record object -- confirm.
     self.cr = read( )
Example #15
0
    if par['r'].count(":"):
        rn, par['r'] = par['r'].split(":")
        rn += ":"
    if par['l'].count(":"):
        ln, par['l'] = par['l'].split(":")
        ln += ":"

    ne = str(par['e'])
    c_r = par['r'].lower()
    c_r_rev = Seq(par['r']).reverse_complement().lower()
    c_l = par['l'].lower()
    c_l_rev = Seq(par['l']).reverse_complement().lower()
    f = os.path.basename(par['inp_f']).split(".")[0]

    with utils.openw(par['out_f']) as outf:
        for seq in SeqIO.parse(utils.openr(par['inp_f']), "fasta"):
            seql = str(seq.seq.lower())
            r_rev, l_rev, r, l = c_r_rev, c_l_rev, c_r, c_l

            rr = regex.findall("(" + r + "){e<=" + ne + "}", seql)
            lr = regex.findall("(" + l + "){e<=" + ne + "}", seql)
            r_revr = regex.findall("(" + str(r_rev) + "){e<=" + ne + "}", seql)
            l_revr = regex.findall("(" + str(l_rev) + "){e<=" + ne + "}", seql)

            if len(rr) > 1:
                outf.write(str(rr) + " unspecific 1\n")
            if len(lr) > 1:
                outf.write(str(lr) + " unspecific 2\n")
            if len(r_revr) > 1:
                outf.write(str(r_revr) + " unspecific 3\n")
            if len(l_revr) > 1:
Example #16
0
        lt,last = a[0],(a[3:] if "?" not in a else last)
    v += arr[-2:] 
        
    return v 

if __name__ == "__main__":
    args = read_params( sys.argv )
    uc2cl = collections.defaultdict( set )

    tax_lev = "dpcofgs"
    tax_lev_exp = ['Domain','Phylum','Class','Order','Family','Genus','Species']

    fp = tempfile.TemporaryFile()

    if args['corrections']:
        with utils.openr(args['corrections']) as inp:
            frto = {}
            frtoid = {}
            for pat in (l.split('\t') for l in inp):
                if len(pat) == 2:
                    frto[pat[0].strip()] = pat[1].strip()
                else:
                    frtoid[pat[0].strip()] = (pat[1].strip(),pat[2].strip())
            with utils.openr(args['img'],"rU") as inpf:
                nfa = []
                for l in inpf:
                    nf = l
                    for f,t in frto.items():
                        nf = nf.replace(f,t)
                    for i,(f,t) in frtoid.items():
                        if l.startswith(i+"\t"):
Example #17
0
    return vars(p.parse_args())


if __name__ == "__main__":
    args = read_params(sys.argv)
    uc2cl = collections.defaultdict(set)

    gint = str if args['sk'] else int

    if not args['g2t'] and not args['t2g']:
        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
        sys.exit(0)
    g2t = {}

    if args['g2t']:
        with utils.openr(args['g2t']) as inp:
            #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
            for l in inp:
                f, t = l.strip().split('\t')
                g2t[gint(f)] = gint(t)
    elif args['t2g']:
        with utils.openr(args['t2g']) as inp:
            for ll in (l.strip().split('\t') for l in inp):
                for g in ll[1:]:
                    g2t[gint(g)] = gint(ll[0])

    with utils.openw(args['txt']) as out:
        with utils.openr(args['ctxt']) as inp:
            for l in inp:
                valin = [gint(a) for a in l.strip().split('\t')]
Example #18
0
    uc2cl = collections.defaultdict(set)

    if not args['g2t'] and not args['t2g']:
        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
        sys.exit(0)
    g2t = {}
    if args['g2t']:
        with open(args['g2t']) as inp:
            g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
    elif args['t2g']:
        with open(args['t2g']) as inp:
            for ll in (l.strip().split('\t') for l in inp):
                for g in ll[1:]:
                    g2t[int(g)] = int(ll[0])

    with utils.openr(args['ctxt']) as inp:
        valin = (l.strip().split('\t') for l in inp)

        g2c = collections.defaultdict(set)

        if args['b6o']:
            inp_mat = ((int(a), int(b))
                       for a, b in (l.rstrip('\n').split("\t")[:2]
                                    for l in utils.openr(args['b6o'])))

            #all_targets = set()
            for fr, to in inp_mat:
                #all_targets.add( to )
                if fr != to:
                    g2c[fr].add(to)
    if par['r'].count(":"):
        rn,par['r'] = par['r'].split(":")
        rn+=":"
    if par['l'].count(":"):
        ln,par['l'] = par['l'].split(":")
        ln+=":"

    ne = str(par['e'])
    c_r = par['r'].lower()
    c_r_rev = Seq(par['r']).reverse_complement().lower()
    c_l = par['l'].lower()
    c_l_rev = Seq(par['l']).reverse_complement().lower()
    f = os.path.basename(par['inp_f']).split(".")[0]

    with utils.openw( par['out_f'] ) as outf:
        for seq in SeqIO.parse( utils.openr(par['inp_f']), "fasta"):
            seql = str(seq.seq.lower())
            r_rev,l_rev,r,l = c_r_rev,c_l_rev,c_r,c_l      
           
            rr = regex.findall( "("+r+"){e<="+ne+"}", seql )
            lr = regex.findall( "("+l+"){e<="+ne+"}", seql )
            r_revr = regex.findall( "("+str(r_rev)+"){e<="+ne+"}", seql )
            l_revr = regex.findall( "("+str(l_rev)+"){e<="+ne+"}", seql )

            if len(rr) > 1:
                outf.write( str(rr) +" unspecific 1\n" )
            if len(lr) > 1:
                outf.write( str(lr) +" unspecific 2\n" )
            if len(r_revr) > 1:
                outf.write( str(r_revr) +" unspecific 3\n" )
            if len(l_revr) > 1:
Example #20
0
 def __init__(self, fn, min_len=None, max_len=None):
     """Remember the length limits and pick the input stream.

     A truthy ``fn`` is opened with ``utils.openr``; otherwise stdin is
     used.  ``min_len`` and ``max_len`` are stored unchanged.
     """
     self.ret = False
     self.min_len = min_len
     self.max_len = max_len
     if fn:
         self.inp = utils.openr(fn)
     else:
         self.inp = sys.stdin
     # NOTE(review): `read` comes from outside this snippet; it looks like
     # a record type created empty here -- confirm.
     self.cr = read()
Example #21
0
def read_params(args):
    """Parse the command line of the gene-listing script.

    ``args`` is accepted but unused; argparse reads ``sys.argv`` directly.

    Returns:
        dict with 'inp_f' (required input path) and 'out_f' (optional
        output path, None when omitted).
    """
    parser = argparse.ArgumentParser(
        description='List the genes in the genome file')

    parser.add_argument('inp_f', metavar='INPUT_FILE', default=None,
                        type=str, help="the input fna file")
    parser.add_argument('out_f', metavar='OUTPUT_FILE', nargs='?',
                        default=None, type=str,
                        help="the output txt file [stdout if not present]")

    options = parser.parse_args()
    return vars(options)


def genome_id(fn):
    """Return the negated integer prefix of a file name, as a string.

    The prefix is the part of the base name before the first "."; int()
    raises ValueError when it is not an integer.
    """
    stem = os.path.basename(fn).split(".")[0]
    negated = -int(stem)
    return str(negated)


if __name__ == '__main__':
    # Write one tab-separated line: the genome id derived from the input
    # file name, followed by the id of every FASTA record in the file.
    par = read_params(sys.argv)

    # Read inside a with-block so the input handle is closed afterwards.
    with utils.openr(par['inp_f']) as inp:
        ids = [r.id for r in SeqIO.parse(inp, "fasta")]

    with utils.openw(par['out_f']) as out:
        out.write("\t".join([genome_id(par['inp_f'])] + ids) + "\n")
    return vars( p.parse_args() )

if __name__ == "__main__":
    args = read_params( sys.argv )
    uc2cl = collections.defaultdict( set )
    
    gint = str if args['sk'] else int

    if not args['g2t'] and not args['t2g']:
        sys.stdout.write("Error one of --t2g and --g2t must be provided\n")
        sys.exit(0)
    g2t = {}


    if args['g2t']:
        with utils.openr( args['g2t'] ) as inp:
            #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp))
            for l in inp:
                f,t = l.strip().split('\t')
                g2t[gint(f)] = gint(t)
    elif args['t2g']:
        with utils.openr( args['t2g'] ) as inp:
            for ll in (l.strip().split('\t') for l in inp):
                for g in ll[1:]:
                    g2t[gint(g)] = gint(ll[0])
    
    with utils.openw(args['txt']) as out:
        with utils.openr( args['ctxt'] ) as inp:
            for l in inp:
                valin = [gint(a) for a in l.strip().split('\t')]
Example #23
0
    arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str,
         help="the input fna file [stdin if not present]")
    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
         help="the output fna file [stdout if not present]")

    parser.add_argument('--extract_targets', action='store_true', help="Select fna entries\n")
    parser.add_argument('-i', action='store_true', help="Add hit stats to fna entries\n")
    
    parser.add_argument('--bo6', metavar='Bo6 file', required=True, type = str )

    return vars(parser.parse_args())

if __name__ == '__main__':
    par = read_params(sys.argv)

    inp_mat = (l.rstrip('\n').split("\t") for l in (utils.openr(par['bo6'])))

    if par['extract_targets']:
        toextr = ((l[1], l[2], l[3], l[11], int(l[8]), int(l[9])) for l in inp_mat)
    else:
        toextr = ((l[0], l[2],  l[3], l[11], int(l[6]), int(l[7])) for l in inp_mat)
   
    inpfasta = SeqIO.to_dict(SeqIO.parse( utils.openr(par['inp_f']), "fasta"))

    out_seqs = []
    for n,pid,l,bit,fr,to in toextr:
        n = inpfasta[n][min(fr,to):max(fr,to)]
        if par['i']:
            p = "_pid"+pid.strip()+"_l"+l.strip()+"_bs"+bit.strip()
        else:
            p = ""
Example #24
0
import os
import textwrap
from collections import namedtuple as nt
import random as rnd
rnd.seed(1982)
import utils
from Bio import SeqIO

def read_params(args):
    """Build the argument parser and return the parsed options as a dict.

    The ``args`` parameter is not used: ``parse_args()`` is called without
    arguments and therefore reads ``sys.argv``.  The result has the keys
    'inp_f' (mandatory) and 'out_f' (None when not given).
    """
    parser = argparse.ArgumentParser(description='List the genes in the genome file')

    specs = [
        ('inp_f', dict(metavar='INPUT_FILE', default=None, type=str,
                       help="the input fna file")),
        ('out_f', dict(metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
                       help="the output txt file [stdout if not present]")),
    ]
    for dest, options in specs:
        parser.add_argument(dest, **options)

    return vars(parser.parse_args())

def genome_id( fn ):
    """Derive the genome id string from a file name.

    Takes the base name up to its first ".", parses it as an integer and
    returns the negated value as text.
    """
    prefix, _, _ = os.path.basename(fn).partition(".")
    return str(-int(prefix))

if __name__ == '__main__':
    # Emit a single tab-separated line: genome id (from the input file
    # name) followed by the ids of all records in the FASTA input.
    par = read_params(sys.argv)

    # The with-block closes the input handle, which was previously leaked.
    with utils.openr(par['inp_f']) as inp:
        ids = [r.id for r in SeqIO.parse( inp, "fasta")]

    with utils.openw( par['out_f']) as out:
        out.write( "\t".join( [genome_id(par['inp_f'])]+ids ) + "\n" )

Example #25
0
        nargs='?',
        default=None,
        type=str,
        help="the output txt file compresse if fiven with bz2 extension\n"
        "[stdout if not present]")
    """
    p.add_argument('--subsample', metavar="Subsampling rate",
            default=1.0, type=float )
    p.add_argument('-n', metavar="Minimum number of matching taxa",
            default=0, type=int )
    p.add_argument('-p', metavar="Prefix for taxon names",
            default="", type=str )
    """
    return vars(p.parse_args())


if __name__ == "__main__":
    # Dump the FASTA records as "<name><padding><sequence>" lines preceded
    # by a "<record count> <first record length>" header line.
    args = read_params(sys.argv)

    with utils.openr(args['fna']) as inp:
        fna = SeqIO.to_dict(SeqIO.parse(inp, "fasta"))

    if not fna:
        sys.stderr.write("Error: no sequences found in the input\n")
        sys.exit(1)

    # fna.values() is a view on Python 3 and cannot be indexed with [0].
    first_record = next(iter(fna.values()))

    with utils.openw(args['rxl']) as out:
        out.write(str(len(fna)) + " " + str(len(first_record)) + "\n")

        for k, v in fna.items():
            # Names longer than 14 characters are truncated.
            if len(k) > 14:
                k = k[:14]
            out.write(
                str(k) + " " * (15 - len(str(k)[1:])) + str(v.seq) + "\n")
Example #26
0
def read_params( args ):
    """Parse the command line of the MLST concatenation script.

    ``args`` is ignored; argparse reads ``sys.argv`` on its own.

    Returns:
        dict with 'fna' and 'txt' (both required paths) and 'nmiss'
        (int, 0 by default).
    """
    parser = ap.ArgumentParser(description='Create a fasta file with the'
            'concatenated mlst sequence from a mlst table and the single sequences')

    required_path = dict(required=True, default=None, type=str)
    parser.add_argument('--fna',
            help="the file with all the MLST profiles [in the format >profilineName_profileID",
            **required_path)
    parser.add_argument('--txt',
            help="the table of the samples to profiles [tab-delimited, columns ID are profileName]",
            **required_path)
    parser.add_argument('--nmiss', default=0, type=int)

    parsed = parser.parse_args()
    return vars(parsed)

if __name__ == "__main__":
    args = read_params( sys.argv )

    fna = SeqIO.to_dict(SeqIO.parse( utils.openr(args['fna']), "fasta"))
    fna_out = [] 

    profiles = {}
    mlst_names = []
    with utils.openr(args['txt']) as inp:
        for i,line in enumerate(inp):
            if i == 0:
                mlst_names = line.strip().split('\t')[1:]
                continue
            l = line.strip().split('\t')
            profiles[l[0]] = dict([(na,l[n+1]) for n,na in enumerate(mlst_names)])
    for s,p in profiles.items():
        seq = ""
        skip = 0 
        for n in mlst_names:
Example #27
0
    sys.exit(-1)

def read_params( args ):
    """Parse the command line of the ".uc" to tab-delimited converter.

    ``args`` is not used; ``parse_args()`` reads ``sys.argv``.

    Returns:
        dict with the optional positionals 'uc' (input) and 'txt'
        (output); each is None when not supplied.
    """
    parser = ap.ArgumentParser(description='Convert Usearch ".uc" files in tab-delimited'
            ' files with the seed as first field followed by the other IDs\n')

    optional_path = dict(nargs='?', default=None, type=str)
    parser.add_argument('uc',
            help="the input uc file [stdin if not present]",
            **optional_path)
    parser.add_argument('txt',
            help="the output txt file compressed if fiven with bz2 extension\n"
                 "[stdout if not present]",
            **optional_path)

    parsed = parser.parse_args()
    return vars(parsed)

if __name__ == "__main__":
    # Group the query ids of a ".uc" table by cluster seed and write one
    # tab-separated line per seed, largest cluster first.
    args = read_params( sys.argv )
    uc2cl = collections.defaultdict( set )

    with utils.openr(args['uc']) as inp:
        for raw in inp:
            # Each line must have exactly ten tab-separated fields; only
            # the record type, query and target are used.
            rec_type, _, _, _, _, _, _, _, query, target = raw.split('\t')
            if rec_type == 'H':
                # Hit: attach the query to its target seed.
                uc2cl[target.strip()].add( query )
            elif rec_type == 'S':
                # Seed: make sure it is listed even without any hit.
                uc2cl.setdefault( query, set() )

    by_size = sorted(uc2cl.items(), key=lambda item: len(item[1]), reverse=True)
    with utils.openw(args['txt']) as out:
        for seed, members in by_size:
            out.write( "\t".join([seed] + list(members)) + "\n" )
Example #28
0
                   metavar="Prefix for taxon names",
                   default="",
                   type=str)
    p.add_argument('--sk', action='store_true')

    return vars(p.parse_args())


if __name__ == "__main__":
    args = read_params(sys.argv)
    uc2cl = collections.defaultdict(set)

    gint = str if args['sk'] else int

    valin = []
    with utils.openr(args['ctxt']) as inp:
        for l in inp:
            tset = set([gint(a) for a in l.strip().split('\t')][1:])
            if len(tset) < args['n']:
                continue
            valin.append(tset)
    all_t = set()
    for v in valin:
        all_t |= v

    res = {}
    for t in all_t:
        #if len(t) < args['n']:
        #    continue
        res[t] = [int(t in v) for v in valin]
                cur,seq,fs,rs,al,rseq,fseq = None, None, None, None, None, None, None
    return seqs


if __name__ == "__main__":
    # For every amplimer parsed from the primersearch report, write the
    # part of the matching FASTA record that lies between the two primers.
    args = read_params( sys.argv )

    extr = parse_primersearch( args['ps']  )

    # Group the amplimers by the name of the sequence they were found on.
    seqs2extr = {}
    for name, hit in extr.items():
        seqs2extr.setdefault( hit['seq'], {} )[name] = hit

    with utils.openw( args['out'] ) as outf:
        for r in SeqIO.parse( utils.openr(args['fna']), "fasta"):
            if r.id in seqs2extr:
                for pn,ext in seqs2extr[r.id].items():
                    sq = SeqRecord( r.id )
                    sq.id = r.id + " " + pn
                    sq.description = r.description + " " + pn
                    # Skip the forward primer at the start and the reverse
                    # primer (offset counted from the end) at the end.
                    start = ext['fs'] + len(ext['fseq'])
                    stop = len(r.seq) - ext['rs'] - len(ext['rseq'])
                    sq.seq = r.seq[start:stop]
                    SeqIO.write(sq, outf, "fasta")