Example #1
0
def main(qbed, sbed, pairs_file, qpad, spad, pair_fmt, blast_path, mask='F', ncpu=8):
    """Find CNSs for every gene pair and write them to stdout as CSV.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  For each
    batch one blast command per pair is built with ``get_cmd``, the
    commands are run through a process pool, and every non-empty output is
    parsed with ``parse_blast``.  One CSV line is printed for each pair
    that yields at least one CNS; progress ("qaccn saccn (count)") is
    written to stderr.

    Arguments:
      qbed, sbed           -- query / subject bed objects (``filename`` is
                              read here; otherwise handed to the helpers).
      pairs_file, pair_fmt -- forwarded to ``get_pair``.
      qpad, spad           -- forwarded to ``get_cmd`` and ``parse_blast``.
      blast_path, mask     -- forwarded to ``determine_blast_type`` to
                              obtain the blast command template.
      ncpu                 -- size of the process pool and of each batch.

    Returns None.
    """
    pool = Pool(ncpu)
    bl2seq = determine_blast_type(blast_path, mask)

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both bed objects point at the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pair_iter = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def next_pair():
        # Yield None once the pair source is exhausted so that a short
        # final batch can still be assembled.
        try:
            return pair_iter.next()
        except StopIteration:
            return None

    pairs = [True]  # sentinel so the loop body runs at least once
    while any(pairs):
        pairs = [next_pair() for _ in range(ncpu)]

        # Call get_cmd for real pairs only.  A multi-sequence map() would,
        # under Python 2, pad the shorter (filtered) pair list with None
        # and hand get_cmd a None pair at the end of the input.
        cmds = [get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
                for pair in pairs if pair]
        cmds = [c for c in cmds if c]

        # Each entry of cmds is (command string, qfeat, sfeat).
        results = pool.map(commands.getoutput, [c[0] for c in cmds])
        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            qname, sname = qfeat['accn'], sfeat['accn']
            # Trailing comma: the "(count)" below finishes this stderr line.
            print >>sys.stderr, "%s %s" % (qname, sname),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            cns_fields = ",".join(",".join(map(str, cns)) for cns in cnss)
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname,
                                                sfeat['seqid'], sname, cns_fields)

    return None
Example #2
0
def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt,blast_path, mask='F', ncpu=8):
    """Find CNSs for every gene pair and write them to stdout as CSV.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  For each
    batch one bl2seq command per pair is built with ``get_cmd``, the
    commands are run through a process pool, and every non-empty output is
    parsed with ``parse_blast``.  One CSV line is printed for each pair
    that yields at least one CNS; progress ("qaccn saccn (count)") is
    written to stderr.

    Arguments:
      qbed, sbed           -- query / subject bed objects (``filename`` is
                              read here; otherwise handed to the helpers).
      pairs_file, pair_fmt -- forwarded to ``get_pair``.
      qpad, spad           -- forwarded to ``get_cmd`` and ``parse_blast``.
      unmasked_fasta       -- forwarded to ``parse_blast``.
      blast_path           -- executable placed at the start of the command.
      mask                 -- value for the blast ``-F`` option.
      ncpu                 -- size of the process pool and of each batch.

    Returns None.
    """
    pool = Pool(ncpu)

    # Command template; the %(name)s fields are filled in per pair later.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
            -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    # The header must be part of the print statement itself; a string on a
    # line of its own is just a discarded expression.
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both bed objects point at the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pair_iter = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def next_pair():
        # Yield None once the pair source is exhausted so that a short
        # final batch can still be assembled.
        try:
            return pair_iter.next()
        except StopIteration:
            return None

    pairs = [True]  # sentinel so the loop body runs at least once
    while any(pairs):
        pairs = [next_pair() for _ in range(ncpu)]

        # Call get_cmd for real pairs only.  A multi-sequence map() would,
        # under Python 2, pad the shorter (filtered) pair list with None
        # and hand get_cmd a None pair at the end of the input.
        cmds = [get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
                for pair in pairs if pair]
        cmds = [c for c in cmds if c]

        # Each entry of cmds is (command string, qfeat, sfeat).
        results = pool.map(commands.getoutput, [c[0] for c in cmds])
        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            qname, sname = qfeat['accn'], sfeat['accn']
            # Trailing comma: the "(count)" below finishes this stderr line.
            print >>sys.stderr, "%s %s" % (qname, sname),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            cns_fields = ",".join(",".join(map(str, cns)) for cns in cnss)
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname,
                                                sfeat['seqid'], sname, cns_fields)

    return None
Example #3
0
def main(cns_bed, ortho_bed, pairs_file, qpad, spad, blast_path, ncpu, mask):
  """Blast each paired feature against its partner and print the intervals.

  Pairs are pulled from ``get_pair`` (format 'pck') in batches of ``ncpu``.
  A bl2seq command is built for every pair, the batch is run through a
  process pool, and each non-empty output is passed to ``parse_blast``.
  A "start,end,interval" line is printed to stdout whenever the parsed
  interval has at least one element.

  Arguments:
    cns_bed, ortho_bed -- bed objects handed to ``get_fastas`` and
                          ``get_pair``.
    pairs_file         -- forwarded to ``get_pair``.
    qpad, spad         -- padding added around the query / subject feature
                          (start is clamped to 1).
    blast_path         -- executable placed at the start of the command.
    ncpu               -- size of the process pool and of each batch.
    mask               -- value for the blast ``-F`` option.
  """
  pool = Pool(ncpu)
  fcns = sys.stdout
  print >> fcns, "#cns_start, cns_stop, interval"
  # Command template; the %(name)s fields are filled in by get_cmd.
  bl2seq = "%s " % blast_path + \
          "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
          " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
           | grep -v 'WARNING' | grep -v 'ERROR' "

  qfastas = get_fastas(cns_bed, True)
  sfastas = get_fastas(ortho_bed, False)
  # NOTE(review): get_pair receives (ortho_bed, cns_bed) while each pair is
  # unpacked as (qfeat, sfeat) below -- confirm the order matches get_pair.
  pair_iter = get_pair(pairs_file ,'pck', ortho_bed, cns_bed)

  def next_pair():
    # Yield None once the pair source is exhausted so that a short final
    # batch can still be assembled.
    try:
      return pair_iter.next()
    except StopIteration:
      return None

  def get_cmd(pair):
    """Return (command, qfeat) for one pair, or None for an empty slot."""
    # Test for None before unpacking; unpacking None raises TypeError.
    if pair is None:
      return None
    qfeat, sfeat = pair

    sfasta = sfastas[sfeat['seqid']]
    qfasta = qfastas[qfeat['seqid']]

    qstart, qstop = max(qfeat['start'] - qpad, 1), qfeat['end'] + qpad
    sstart, sstop = max(sfeat['start'] - spad, 1), sfeat['end'] + spad

    m = qstop - qstart
    n = sstop - sstart

    e_value = m*n*(2**(-28.51974)) # bit score above 15/15 noise
    assert e_value > 0

    cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta, qstart=qstart,
                        sstart=sstart, qstop=qstop, sstop=sstop, e_value=e_value)
    return cmd, qfeat

  pairs = [True]  # sentinel so the loop body runs at least once
  while any(pairs):
    pairs = [next_pair() for _ in range(ncpu)]

    cmds = [c for c in map(get_cmd, [p for p in pairs if p]) if c]
    results = pool.map(commands.getoutput, [c[0] for c in cmds])
    for res, (cmd, qfeat) in zip(results, cmds):
      if not res.strip():
        continue
      interval = parse_blast(res, qfeat)
      if len(interval) >= 1:
        print >> fcns, '{0},{1},{2}'.format(qfeat['start'], qfeat['end'], interval)
Example #4
0
def run():    
    import random
    from processing import Pool

    p=Pool(2)

    print "ahora vemos si escala"
 
    numero = 5000000
    a = [random.randint(0, 100) for a in xrange(0, numero)]
    
    print "Ya tenemos los numeros"
    lista = [a[:numero/2] , a[numero/2:]]
    print "Lista bisectada"
    result=p.mapAsync(my_sort, lista)
    print "threads lanzados"
    lista1, lista2 = result.get()
    print "Uniendo listas"
    b = my_merge(lista1, lista2)
   # b = my_sort(a)
    print "largo", len(b), "llamadas a my_sort"
Example #5
0
def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt, mask='F', ncpu=8):
    """Find CNSs for every gene pair and write them to stdout as CSV.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  A bl2seq
    command is built for each pair, the batch is run through a process
    pool, and every non-empty output is parsed with ``parse_blast``.  One
    CSV line is printed for each pair that yields at least one CNS;
    progress ("qaccn saccn (count)") is written to stderr.

    Arguments:
      qbed, sbed           -- query / subject bed objects (``filename`` is
                              read here; otherwise handed to the helpers).
      pairs_file, pair_fmt -- forwarded to ``get_pair``.
      qpad, spad           -- padding added around the query / subject
                              feature (start is clamped to 1).
      unmasked_fasta       -- forwarded to ``parse_blast``.
      mask                 -- value for the blast ``-F`` option.
      ncpu                 -- size of the process pool and of each batch.

    Returns None.
    """
    # Size the pool from the argument so it matches the batch size below
    # and the function does not depend on a module-level ``options``.
    pool = Pool(ncpu)

    # Command template; the %(name)s fields are filled in by get_cmd.
    bl2seq = "~/src/blast-2.2.25/bin/bl2seq " \
           "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
           " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
              -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send...]"

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both bed objects point at the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pair_iter = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def next_pair():
        # Yield None once the pair source is exhausted so that a short
        # final batch can still be assembled.
        try:
            return pair_iter.next()
        except StopIteration:
            return None

    def get_cmd(pair):
        """Return (command, qfeat, sfeat) for a pair, or None for None."""
        if pair is None:
            return None
        qfeat, sfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        qstart, qstop = max(qfeat['start'] - qpad, 1), qfeat['end'] + qpad
        sstart, sstop = max(sfeat['start'] - spad, 1), sfeat['end'] + spad

        # The padded window must be wider than the padding alone unless the
        # start was clamped to 1.
        assert qstop - qstart > 2 * qpad or qstart == 1, (qstop, qstart)
        assert sstop - sstart > 2 * spad or sstart == 1, (sstop, sstart)

        m = qstop - qstart
        n = sstop - sstart
        e_value = m*n*(2**(-28.51974)) # bit score above 15/15 noise
        assert e_value > 0

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta, qstart=qstart,
                            sstart=sstart, qstop=qstop, sstop=sstop, e_value=e_value)
        return cmd, qfeat, sfeat

    pairs = [True]  # sentinel so the loop body runs at least once
    while any(pairs):
        pairs = [next_pair() for _ in range(ncpu)]

        cmds = [c for c in map(get_cmd, [p for p in pairs if p]) if c]
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            qname, sname = qfeat['accn'], sfeat['accn']
            # Trailing comma: the "(count)" below finishes this stderr line.
            print >>sys.stderr, "%s %s" % (qname, sname),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            cns_fields = ",".join(",".join(map(str, cns)) for cns in cnss)
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname,
                                                sfeat['seqid'], sname, cns_fields)

    return None
Example #6
0
def test():
    """Exercise the Pool API end to end and print a report.

    Covers applyAsync / imap / imapUnordered / map, two simple map
    benchmarks, error propagation, timeouts, callbacks, close(),
    terminate() and garbage collection of a pool.  Relies on helpers
    (mul, plus, calculate, calculatestar, pow3, noop, f, cpuCount) that
    are defined outside this function.  Failures surface as
    AssertionError.
    """
    print 'cpuCount() = %d\n' % cpuCount()

    #
    # Create pool
    #

    PROCESSES = 4
    print 'Creating pool with %d processes\n' % PROCESSES
    pool = Pool(PROCESSES)

    #
    # Tests
    #

    # 20 (function, args) tuples: ten using mul, then ten using plus.
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    # Submit everything up front; the results are consumed below.
    results = [pool.applyAsync(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imapUnordered(calculatestar, TASKS)

    print 'Ordered results using pool.applyAsync():'
    for r in results:
        print '\t', r.get()
    print

    print 'Ordered results using pool.imap():'
    for x in imap_it:
        print '\t', x
    print

    print 'Unordered results using pool.imapUnordered():'
    for x in imap_unordered_it:
        print '\t', x
    print

    print 'Ordered results using pool.map() --- will block till complete:'
    for x in pool.map(calculatestar, TASKS):
        print '\t', x
    print

    #
    # Simple benchmarks
    #

    N = 100000
    print 'def pow3(x): return x**3'

    t = time.time()
    A = map(pow3, xrange(N))
    print '\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    B = pool.map(pow3, xrange(N))
    print '\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N // 8))
    print '\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t)

    # Serial map, pool.map and chunked imap must produce the same list.
    assert A == B == C, (len(A), len(B), len(C))
    print

    L = [None] * 1000000
    print 'def noop(x): pass'
    print 'L = [None] * 1000000'

    t = time.time()
    A = map(noop, L)
    print '\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    B = pool.map(noop, L)
    print '\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L) // 8))
    print '\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t)

    assert A == B == C, (len(A), len(B), len(C))
    print

    # Drop the large lists before the remaining checks.
    del A, B, C, L

    #
    # Test error handling
    #

    # NOTE(review): f is defined elsewhere; these checks presume it raises
    # ZeroDivisionError for the argument 5 -- confirm against its definition.
    print 'Testing error handling:'

    try:
        print pool.apply(f, (5, ))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.apply()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print pool.map(f, range(10))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.map()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print list(pool.imap(f, range(10)))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from list(pool.imap())'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    # Step the iterator by hand: only item 5 is required to raise, and the
    # iterator must keep yielding afterwards.
    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError, 'expected ZeroDivisionError'

    # i == 9 means no premature StopIteration cut the loop short.
    assert i == 9
    print '\tGot ZeroDivisionError as expected from IMapIterator.next()'
    print

    #
    # Testing timeouts
    #

    # Poll with a 0.02 s timeout, writing a dot for every TimeoutError.
    print 'Testing ApplyResult.get() with timeout:',
    res = pool.applyAsync(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    print 'Testing IMapIterator.next() with timeout:',
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    #
    # Testing callback
    #

    print 'Testing callback:'

    A = []
    # Expected callback output: presumably mul(7, 8) == 56 followed by
    # pow3 of 0..9.
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    r = pool.applyAsync(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.mapAsync(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print '\tcallbacks succeeded\n'
    else:
        print '\t*** callbacks failed\n\t\t%s != %s\n' % (A, B)

    #
    # Check there are no outstanding tasks
    #

    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print 'Testing close():'

    for worker in pool._pool:
        assert worker.isAlive()

    # A task submitted before close() must still run to completion.
    result = pool.applyAsync(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tclose() succeeded\n'

    #
    # Check terminate() method
    #

    print 'Testing terminate():'

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    # Queue 10-second sleeps so work is still pending when terminate() runs.
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tterminate() succeeded\n'

    #
    # Check garbage collection
    #

    print 'Testing garbage collection:'

    pool = Pool(2)
    # Keep the worker list so liveness can be checked after the pool object
    # itself has been deleted.
    processes = pool._pool

    ignore = pool.apply(pow3, [2])
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]

    del results, pool

    # Short pause before checking that the workers have stopped.
    time.sleep(0.2)

    for worker in processes:
        assert not worker.isAlive()

    print '\tgarbage collection succeeded\n'
Example #7
0
import os, shutil, subprocess, zipfile, random
from processing import Pool
from zipfile import ZipFile
from config import *


def unzip(params):
    base, ext = params
    zf = zipfile.ZipFile(rawDir + '/' + base + '.' + ext, 'r')
    for name in zf.namelist():
        if name != 'PIC/':
            print name
            zf.extract(name, extractDir)


# Collect (base, ext) for every entry of rawDir whose name ends in 'exe'
# and whose base name is exactly three characters long.
files = []
for f in os.listdir(rawDir):
    if f.endswith('exe'):
        # rpartition splits on the last dot and never raises: a name with
        # no dot yields an empty base (skipped by the length test), and a
        # name with several dots keeps them in the base.
        base, _, ext = f.rpartition('.')
        if len(base) == 3:
            files.append((base, ext))

# Start the worker pool only when run as a script, so that worker
# processes which re-import this module do not start pools of their own.
if __name__ == '__main__':
    p = Pool()
    p.map(unzip, files)
Example #8
0
def main(qbed, sbed, pairs_file, pair_fmt, mask='F', ncpu=8):
    """Find CNSs for every pair and write them to stdout as CSV.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  A bl2seq
    command is built for each pair (subject region unpadded, query region
    from ``grab_flanking_region``), the batch is run through a process
    pool, and every non-empty output is parsed with ``parse_blast``.  One
    CSV line is printed for each pair that yields at least one CNS;
    progress ("qaccn saccn (count)") is written to stderr.

    Arguments:
      qbed, sbed           -- query / subject bed objects (``filename`` is
                              read here; otherwise handed to the helpers).
      pairs_file, pair_fmt -- forwarded to ``get_pair``.
      mask                 -- value for the blast ``-F`` option.
      ncpu                 -- size of the process pool and of each batch.

    Returns None.
    """
    # Size the pool from the argument so it matches the batch size below
    # and the function does not depend on a module-level ``options``.
    pool = Pool(ncpu)

    # Command template; the %(name)s fields are filled in by get_cmd.
    bl2seq = "~/src/blast-2.2.25/bin/bl2seq " \
           "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
           " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qaccn,qseqid,saccn,[sleft_gene,sright_gene],sseqid,res"

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both bed objects point at the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    # Note the argument order: sbed first, and pairs unpack as (sfeat, qfeat).
    pair_iter = get_pair(pairs_file , pair_fmt, sbed, qbed)

    def next_pair():
        # Yield None once the pair source is exhausted so that a short
        # final batch can still be assembled.
        try:
            return pair_iter.next()
        except StopIteration:
            return None

    def get_cmd(pair):
        """Return (command, qfeat, sfeat) for a pair, or None for None."""
        if pair is None:
            return None
        sfeat, qfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        sstart, sstop = sfeat['start'], sfeat['end'] #region gets no padding
        qstart, qstop = grab_flanking_region(qfeat, sfeat)

        m = sstop - sstart
        n = qstop - qstart
        e_value = m*n*(2**(-28.51974)) # bit score above 15/15 noise
        assert e_value > 0

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta, qstart=qstart,
                            sstart=sstart, qstop=qstop, sstop=sstop, e_value=e_value)
        return cmd, qfeat, sfeat

    pairs = [True]  # sentinel so the loop body runs at least once
    while any(pairs):
        pairs = [next_pair() for _ in range(ncpu)]

        cmds = [c for c in map(get_cmd, [p for p in pairs if p]) if c]
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            qname, sname = qfeat['accn'], sfeat['accn']
            # Trailing comma: the "(count)" below finishes this stderr line.
            print >>sys.stderr, "%s %s" % (qname, sname),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            cns_fields = ",".join(",".join(map(str, cns)) for cns in cnss)
            print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (
                qname, qfeat['seqid'], sname,
                sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'],
                cns_fields)

    return None
Example #9
0
def main(qbed, sbed,cns_bed, pairs_file, qpad, spad, pair_fmt, blast_path, mask='F', ncpu=8):
    """Blast every (cns, subject) pair and write the CNS hits to stdout.

    Pairs come from ``get_pair(pairs_file, pair_fmt, cns_bed, sbed)`` and
    are handled ``ncpu`` at a time: a bl2seq command with a fixed e-value
    of 30 is built per pair, the batch is executed in a process pool, and
    each non-empty output is parsed by ``parse_blast``.  A CSV row is
    printed for every pair with at least one CNS, and a progress line
    ("qaccn saccn (count)") goes to stderr.  Returns None.
    """
    pool = Pool(ncpu)

    # Command template; the %(name)s fields are filled in per pair.
    bl2seq = "%s " % blast_path + \
           "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
           " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
              -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,evalue...]"

    qfastas = get_masked_fastas(qbed)
    if qbed.filename != sbed.filename:
        sfastas = get_masked_fastas(sbed)
    else:
        # Same underlying file: share the query fastas.
        sfastas = qfastas

    pair_source = get_pair(pairs_file, pair_fmt, cns_bed, sbed)

    def fetch_pair():
        # None marks an exhausted source so a partial batch can be built.
        try:
            return pair_source.next()
        except StopIteration:
            return None

    def build_cmd(pair):
        """Return (command, qfeat, sfeat) for a pair, or None for None."""
        if pair is None:
            return None
        qfeat, sfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        qstart = max(qfeat['start'] - qpad, 1)
        qstop = qfeat['end'] + qpad
        sstart = max(sfeat['start'] - spad, 1)
        sstop = sfeat['end'] + spad

        # The padded window must be wider than the padding alone unless the
        # start was clamped to 1.
        assert qstop - qstart > 2 * qpad or qstart == 1, (qstop, qstart)
        assert sstop - sstart > 2 * spad or sstart == 1, (sstop, sstart)

        fields = dict(qfasta=qfasta, sfasta=sfasta,
                      qstart=qstart, qstop=qstop,
                      sstart=sstart, sstop=sstop,
                      e_value=30)
        return bl2seq % fields, qfeat, sfeat

    pairs = [True]  # sentinel so the loop body runs at least once
    while any(pairs):
        pairs = [fetch_pair() for i in range(ncpu)]

        cmds = []
        for pair in pairs:
            if not pair:
                continue
            built = build_cmd(pair)
            if built:
                cmds.append(built)

        outputs = pool.map(commands.getoutput, [built[0] for built in cmds])

        for res, (cmd, qfeat, sfeat) in zip(outputs, cmds):
            if not res.strip():
                continue
            qname = qfeat["accn"]
            sname = sfeat['accn']
            # Trailing comma: the "(count)" below finishes this stderr line.
            print >>sys.stderr,  "%s %s" % (qname, sname),
            if qfeat['strand'] == sfeat['strand']:
                orient = 1
            else:
                orient = -1

            cnss = parse_blast(res, orient, qfeat, sfeat, cns_bed, sbed, qpad, spad)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            rows = [",".join(map(str, cns)) for cns in cnss]
            print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname,
                                                sfeat['seqid'], sname,
                                                ",".join(rows))

    return None
Example #10
0
def main(cns_file,qdups_path,sdups_path,pair_file,fmt,qbed,sbed,qpad,spad,blast_path,mask='F',ncpu=8):
    """Choose, per pair of duplicate parents, the duplicate combination with the most CNSs.

    For every (qparent, sparent) returned by ``get_pairs`` that is not
    skipped, all query/subject duplicate combinations are blasted in
    batches of ``ncpu`` and the number of CNSs found by ``parse_blast`` is
    recorded.  Parents that appear in ``rdups`` are collected and resolved
    afterwards through ``best_repeats``; all others are resolved
    immediately.  Results are handed to ``write_new_dups``,
    ``write_nolocaldups`` and ``pairs_to_qa``.

    NOTE(review): the last statement reads the module-level
    ``options.pairs`` rather than an argument -- confirm this is intended.
    """
    pool = Pool(ncpu)
    # Command template; the %(name)s fields are presumably filled by get_cmd.
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
             | grep -v 'WARNING' | grep -v 'ERROR' "

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both bed objects point at the same file.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas
    
    ################# file paths #####################
    # NOTE(review): qnolocaldups_path / snolocaldups_path are not used again
    # inside this function.
    qnolocaldups_path =  qbed.path.split(".")[0] + ".all.nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".all.nolocaldups.bed"
    qlocaldups_path = qbed.path.split(".")[0] + ".all.localdups"
    slocaldups_path = sbed.path.split(".")[0] + ".all.localdups"
    # The n* names are whatever make_copy_of_file returns for each input
    # (presumably paths of working copies -- verify against the helper).
    npair_file,nqlocaldups,nslocaldups, ncns_file = map(make_copy_of_file,[pair_file,qlocaldups_path,slocaldups_path,cns_file])
    ##########################################
    
    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups,rdups = get_pairs(pair_file,fmt,qdups,sdups)
    print len(dups), len(rdups)
    ldups = get_large_dups(dups,qdups,sdups)

    # rdups_dic[parent][(qparent, sparent)] -> list of per-combination tuples.
    rdups_dic = defaultdict(dict)
    rdups_both = [(qparent,sparent) for qparent,sparent in dups if qparent in rdups and sparent in rdups]
    for (qparent,sparent) in dups:
        if skip_pair(qparent,sparent,rdups,rdups_both,ldups):continue
        # One tuple per blasted combination:
        # (-cns_count, qstart, sstart, qaccn, saccn, formatted cnss).
        cnss_size = []
        qfeat_dups = get_all_dups(qdups,qparent)
        sfeat_dups = get_all_dups(sdups,sparent)
        pairs = [True]
        _get_dups_gen = get_dups(qfeat_dups,sfeat_dups,qbed,sbed)

        def get_dups_gen():
            # None once the generator is exhausted, so a short batch can be built.
            try: return _get_dups_gen.next()
            except StopIteration: return None
        while any(pairs):
            # NOTE(review): cnss_dups is assigned but never read in this function.
            cnss_dups = []
            pairs = [get_dups_gen() for i in range(ncpu)]
            ### this is for parallelization #########
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map =  [bl2seq] * len(pairs)
            ###################################
            # NOTE(review): under Python 2, map() pads the shorter filtered
            # pair list with None, so get_cmd may be called with a None pair.
            cmds = [c for c in map(get_cmd, [l for l in pairs if l],
                bl2seq_map,qfastas_map,sfastas_map,qpad_map,spad_map) if c]
            results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                orient = qfeat['strand'] == sfeat['strand'] and 1 or -1 
                # Empty blast output counts as zero CNSs but is still recorded.
                if not res.strip(): cnss = []
                else: cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad,spad)
                print >>sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str,l)),cnss))
                # The count is negated so an ascending sort puts the largest first.
                cnss_size.append((len(cnss)*-1,qfeat["start"],sfeat["start"],qfeat["accn"],sfeat["accn"],cnss_fmt))
            # Keep only the last slot: the loop continues only if the batch
            # was completely full (last slot not None).
            pairs = [pairs[-1]]
        ######################################################################
        if qparent in rdups:
            if (qparent,sparent) in rdups_dic[qparent].keys(): logging.info((qparent,sparent))
            rdups_dic[qparent].update({(qparent,sparent):cnss_size})
        elif sparent in rdups:
            if (qparent,sparent) in rdups_dic[sparent].keys(): logging.info((qparent,sparent))
            rdups_dic[sparent].update({(qparent,sparent):cnss_size})
        else:
            cnss_size.sort()
            # NOTE(review): raises IndexError if no combination was blasted
            # (cnss_size empty) -- confirm that cannot happen.
            cns_number,qfeat_start,sfeat_start,qaccn,saccn,largest_cnss = cnss_size[0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >>sys.stderr, "FINAL: {0},{1},{2}".format(qaccn,saccn,cns_number)
            write_new_dups(npair_file,ncns_file,nqlocaldups,nslocaldups,cnss_size,qparent,sparent,qfeat,sfeat,qdups,sdups)
    
    # Resolve the parents collected in rdups_dic.
    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        #print dparents
        qparent,sparent = dparents
        #print parents,best_reps[parents]
        ### one or list? cnss[0]?
        cns_number,qfeat_start, sfeat_start,qaccn,saccn,largest_cnss = best_reps[dparents]
        qfeat= qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file,ncns_file,nqlocaldups,nslocaldups,[best_reps[dparents]],qparent,sparent,qfeat,sfeat,qdups,sdups)

    write_nolocaldups(qbed.path,nqlocaldups,"{0}.all.nolocaldups.bed.local".format(qbed.path.split(".")[0]))
    write_nolocaldups(sbed.path,nslocaldups,"{0}.all.nolocaldups.bed.local".format(sbed.path.split(".")[0]))
    # NOTE(review): depends on the global ``options`` (see docstring).
    pairs_to_qa(npair_file,'pair',"{0}.all.nolocaldups.bed.local".format(qbed.path.split(".")[0]),"{0}.all.nolocaldups.bed.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0]))
Example #11
0
import os, shutil, subprocess, zipfile, random
from processing import Pool
from zipfile import ZipFile
from config import *


def unzip(params):
    base, ext = params
    zf = zipfile.ZipFile(rawDir+'/'+base+'.'+ext, 'r')
    for name in zf.namelist():
        if name != 'PIC/':
            print name
            zf.extract(name, extractDir)

# Collect (base, ext) for every entry of rawDir whose name ends in 'exe'
# and whose base name is exactly three characters long.
files = []
for f in os.listdir(rawDir):
    if f.endswith('exe'):
        # rpartition splits on the last dot and never raises: a name with
        # no dot yields an empty base (skipped by the length test), and a
        # name with several dots keeps them in the base.
        base, _, ext = f.rpartition('.')
        if len(base) == 3:
            files.append((base, ext))

# Start the worker pool only when run as a script, so that worker
# processes which re-import this module do not start pools of their own.
if __name__ == '__main__':
    p = Pool()
    p.map(unzip, files)
        
Example #12
0
def test():
    """Exercise the Pool API end to end and print a report.

    Covers applyAsync / imap / imapUnordered / map, two simple map
    benchmarks, error propagation, timeouts, callbacks, close(),
    terminate() and garbage collection of a pool.  Relies on helpers
    (mul, plus, calculate, calculatestar, pow3, noop, f, cpuCount) that
    are defined outside this function.  Failures surface as
    AssertionError.
    """
    print 'cpuCount() = %d\n' % cpuCount()
    
    #
    # Create pool
    #
    
    PROCESSES = 4
    print 'Creating pool with %d processes\n' % PROCESSES
    pool = Pool(PROCESSES)    

    #
    # Tests
    #

    # 20 (function, args) tuples: ten using mul, then ten using plus.
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    # Submit everything up front; the results are consumed below.
    results = [pool.applyAsync(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imapUnordered(calculatestar, TASKS)

    print 'Ordered results using pool.applyAsync():'
    for r in results:
        print '\t', r.get()
    print

    print 'Ordered results using pool.imap():'        
    for x in imap_it:
        print '\t', x
    print

    print 'Unordered results using pool.imapUnordered():'
    for x in imap_unordered_it:
        print '\t', x
    print

    print 'Ordered results using pool.map() --- will block till complete:'
    for x in pool.map(calculatestar, TASKS):
        print '\t', x
    print    

    #
    # Simple benchmarks
    #

    N = 100000
    print 'def pow3(x): return x**3'
    
    t = time.time()
    A = map(pow3, xrange(N))
    print '\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)
    
    t = time.time()
    B = pool.map(pow3, xrange(N))
    print '\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N//8))
    print '\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t)
    
    # Serial map, pool.map and chunked imap must produce the same list.
    assert A == B == C, (len(A), len(B), len(C))
    print
    
    L = [None] * 1000000
    print 'def noop(x): pass'
    print 'L = [None] * 1000000'
    
    t = time.time()
    A = map(noop, L)
    print '\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)
    
    t = time.time()
    B = pool.map(noop, L)
    print '\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L)//8))
    print '\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t)

    assert A == B == C, (len(A), len(B), len(C))
    print    

    # Drop the large lists before the remaining checks.
    del A, B, C, L

    #
    # Test error handling
    #

    # NOTE(review): f is defined elsewhere; these checks presume it raises
    # ZeroDivisionError for the argument 5 -- confirm against its definition.
    print 'Testing error handling:'

    try:
        print pool.apply(f, (5,))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.apply()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print pool.map(f, range(10))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.map()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'
            
    try:
        print list(pool.imap(f, range(10)))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from list(pool.imap())'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    # Step the iterator by hand: only item 5 is required to raise, and the
    # iterator must keep yielding afterwards.
    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            if i == 5:
                raise AssertionError, 'expected ZeroDivisionError'
            
    # i == 9 means no premature StopIteration cut the loop short.
    assert i == 9
    print '\tGot ZeroDivisionError as expected from IMapIterator.next()'
    print
    
    #
    # Testing timeouts
    #
    
    # Poll with a 0.02 s timeout, writing a dot for every TimeoutError.
    print 'Testing ApplyResult.get() with timeout:',
    res = pool.applyAsync(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    print 'Testing IMapIterator.next() with timeout:',
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print
            
    #
    # Testing callback
    #

    print 'Testing callback:'
    
    A = []
    # Expected callback output: presumably mul(7, 8) == 56 followed by
    # pow3 of 0..9.
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]
        
    r = pool.applyAsync(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.mapAsync(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print '\tcallbacks succeeded\n'
    else:
        print '\t*** callbacks failed\n\t\t%s != %s\n' % (A, B)
    
    #
    # Check there are no outstanding tasks
    #
    
    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print 'Testing close():'

    for worker in pool._pool:
        assert worker.isAlive()

    # A task submitted before close() must still run to completion.
    result = pool.applyAsync(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tclose() succeeded\n'

    #
    # Check terminate() method
    #

    print 'Testing terminate():'

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    # Queue 10-second sleeps so work is still pending when terminate() runs.
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tterminate() succeeded\n'

    #
    # Check garbage collection
    #

    print 'Testing garbage collection:'

    pool = Pool(2)
    # Keep the worker list so liveness can be checked after the pool object
    # itself has been deleted.
    processes = pool._pool
    
    ignore = pool.apply(pow3, [2])
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]

    del results, pool

    # Short pause before checking that the workers have stopped.
    time.sleep(0.2)
    
    for worker in processes:
        assert not worker.isAlive()

    print '\tgarbage collection succeeded\n'
Example #13
0
def main(qbed,sbed,missed_pairs, ncpu):
    """Run bl2seq blastn on missed (hit, gene) pairs and print one CSV row per pair.

    For each pair yielded by get_pair(), the hit interval widened by 3000
    positions on both sides (query, -I) is blasted against the gene interval
    (subject, -J).  The blast output is grouped with group_cds(), crossing
    hits are dropped, and the surviving hit locations are stored in
    hit['locs'] before the ORF / protein summaries are computed.

    qbed, sbed   -- bed objects passed through to the fasta and pair helpers
    missed_pairs -- handed to get_pairs_file() to obtain the pairs file
    ncpu         -- number of pool workers (coerced with int()); also the
                    number of pairs pulled per batch

    A header line and the result rows go to stdout; a progress line per
    pair goes to stderr.  Returns None.
    """
    # Number of positions added on each side of the hit before blasting.
    flank = 3000

    #print >>sys.stderr,ncpu
    ncpu = int(ncpu)
    pool = Pool(ncpu)
    pairs_file = get_pairs_file(missed_pairs)
    print >>sys.stdout, "#hit,ref_gene,blastn_introns,blastx_hits, blastx_gene_hits, blastx_frame, blastn_gaps, blastx_gaps,orf_perdiction,orf_blastx,frame_shift"
    blastn = "/Users/gturco/blast-2.2.25/bin/bl2seq -p blastn -G 5 -E 2 -W 7 -q -2 -e 0.001 -D 1 -i {0} -j {1} -I {2},{3} -J {4},{5} | grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR' "
    qfastas = split_fastas(qbed)#MASK CODING
    sfastas = get_mask_non_cds(sbed) #mask noncoding

    _get_pair_gen = get_pair(pairs_file,"pair", qbed,sbed)

    def get_pair_gen():
        """Return the next pair, or None once the generator is exhausted."""
        try: return _get_pair_gen.next()
        except StopIteration: return None

    def get_blastn_cmd(pair):
        """Return (blastn command line, hit, gene) for one pair, or None."""
        if pair is None: return None
        hit, gene = pair
        # Clamp the left edge at 1: the previous abs(3000 - start) produced a
        # window that began *after* the hit whenever start < 3000.
        hstart, hstop = max(hit['start'] - flank, 1), (flank + hit['end'])
        # double check fasta to make sure i dont need to add or remove one
        gstart,gstop = gene['start'],gene['end']
        # checks the entire gene...
        query_file = qfastas[hit['seqid']]
        subject_file = sfastas[gene['seqid']]

        blastn_cmd = blastn.format(query_file, subject_file, hstart, hstop, gstart, gstop)
        #print >> sys.stderr,'{0},{1},{2}'.format(hit['accn'],gene['accn'],cmd)

        return blastn_cmd,hit, gene

    # Seed with a truthy value so the loop body runs at least once.
    pairs = [True]
    try:
        while any(pairs):
            # Pull one batch; exhausted slots come back as None and are filtered.
            pairs = [get_pair_gen() for i in range(ncpu)]

            cmds = [c for c in map(get_blastn_cmd, [l for l in pairs if l]) if c]
            #print >>sys.stderr, "results: {0}".format(cmds[0][0])
            results = pool.map(commands.getoutput,[c[0] for c in cmds])
            for res, (cmd, hit, gene) in zip(results,cmds):
                print >>sys.stderr, "CMD: {0},{1}".format(gene['accn'],hit['accn'])
                d,no_res = group_cds(res, gene)
                gap_list =[]
                intron_list = []
                # hit['locs'] is reset even when there is no blast result.
                hit['locs'] = []
                if no_res == True: continue
                for group_key in d.keys():
                    exon_hits = d[group_key]
                    non_crossing = remove_crossing_hits(exon_hits,hit,gene)
                    if len(non_crossing) > 1:
                        gaps,hstart,hend =bites(non_crossing)
                        gap_list.append(sum(gaps))
                    elif len(non_crossing) == 1:
                       # print >>sys.stderr, non_crossing
                        [(hstart,hend,sstart,send,evalue)] = non_crossing
                    if len(non_crossing) >= 1:
                        intron_list.append(group_key[0])
                        hit['locs'].append((hstart,hend))
                hit['locs'].sort()
                #print >>sys.stderr, "hit_loc : {0}".format(hit['locs'])
                if len(hit['locs']) < 1: continue
                orf_prediction = find_orf(qbed,hit)
                introns = "{0}/{1}".format(len(intron_list),len(gene['locs']))
                gap_totaln = sum(gap_list)
                # new hit locs made from blastn res
                hit_percent, gene_percent, frame_percent,frame_shift, best_frame, gap_total,orf_start= protein_parse(hit,gene,sbed,qbed)
                orf_start = abs(min(hit['locs'][0]) + int(orf_start))
                w ="{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}".format(hit['accn'],gene['accn'],introns,hit_percent,gene_percent, frame_percent,gap_totaln,gap_total,orf_prediction,orf_start,frame_shift)
                print >>sys.stdout, w
    finally:
        # Always release the worker processes, even if a helper raised.
        pool.close()
        pool.join()
import numpy as np
import math
def f(x):
    print x
    y = [1]*10000000
    [math.exp(i) for i in y]
def g(x):
    print x
    y = np.ones(10000000)
    np.exp(y)

from handythread import foreach
from processing import Pool
from timings import f,g
def fornorm(f, l):
    """Call f on each element of l in order, serially; return None.

    Serves as the plain-loop baseline for the timing comparisons.
    """
    for element in l:
        f(element)
# IPython session (the %timeit lines are IPython magics, not plain Python).
# Serial baseline: call f on each of ten values in a plain loop.
%timeit fornorm(f,range(10))
# handythread.foreach with two threads, on g and then on f.
%timeit foreach(g,range(100),threads=2)
%timeit foreach(f,range(10),threads=2)
# Two-worker process pool, on g and then on f.
p = Pool(2)
%timeit p.map(g,range(100))
%timeit p.map(f,range(100))
Example #15
0
def main(qbed, sbed, pairs_file, pad, pair_fmt, mask='F', ncpu=8):
    """Main runner for finding cnss.

    Pulls (qfeat, sfeat) pairs from get_pair() in batches of ``ncpu``, runs
    bl2seq blastn on each pair's region widened by ``pad`` on both sides,
    parses the output with parse_blast(), and prints one comma-separated line
    per pair that produced at least one cns.

    qbed, sbed -- bed objects; their .filename is compared to decide whether
                  the masked fastas can be shared
    pairs_file -- pairs input handed to get_pair()
    pad        -- number of positions added on each side of both features
    pair_fmt   -- format argument handed to get_pair()
    mask       -- value substituted for bl2seq's -F option
    ncpu       -- number of pool workers and number of pairs per batch

    Results go to stdout, progress to stderr.  Returns None.
    """
    # Size the pool from the ncpu argument.  The previous code read the
    # global ``options.ncpu``, which fails when main() is imported and can
    # disagree with the batch size used below.
    pool = Pool(ncpu)


    bl2seq = "/usr/bin/bl2seq " \
           "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
           " -Y 812045000 -d 26195 -e 2.11 -i %(qfasta)s -j %(sfasta)s \
              -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
            | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send...]"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    _get_pair_gen = get_pair(pairs_file, pair_fmt, qbed, sbed)
    # need this for parallization stuff.
    def get_pair_gen():
        """Return the next pair, or None once the generator is exhausted."""
        try: return _get_pair_gen.next()
        except StopIteration: return None

    # this helps in parallelizing.
    def get_cmd(pair):
        """Return (bl2seq command line, qfeat, sfeat) for one pair, or None."""
        if pair is None: return None
        qfeat, sfeat = pair
        #if qfeat['accn'] != "Bradi4g01820": return None
        #print >>sys.stderr, qfeat, sfeat

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        # Widen each feature by pad, clamping the left edge at 1.
        qstart, qstop = max(qfeat['start'] - pad, 1), qfeat['end'] + pad
        sstart, sstop = max(sfeat['start'] - pad, 1), sfeat['end'] + pad

        assert qstop - qstart > 2 * pad or qstart == 1, (qstop, qstart)
        assert sstop - sstart > 2 * pad or sstart == 1, (sstop, sstart)

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta, qstart=qstart,
                            sstart=sstart, qstop=qstop, sstop=sstop)
        return cmd, qfeat, sfeat

    # Seed with a truthy value so the loop body runs at least once.
    pairs = [True]
    try:
        while any(pairs):
            # Exhausted slots come back as None and are filtered out below.
            pairs = [get_pair_gen() for i in range(ncpu)]

            cmds = [c for c in map(get_cmd, [l for l in pairs if l]) if c]
            results = pool.map(commands.getoutput, [c[0] for c in cmds])
            #results = (r for r in map(commands.getoutput, [c[0] for c in cmds]))

            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                if not res.strip(): continue
                print >>sys.stderr,  "%s %s" % (qfeat["accn"], sfeat['accn']),
                # +1 when both features are on the same strand, otherwise -1.
                orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

                cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, pad)
                print >>sys.stderr, "(%i)" % len(cnss)
                if len(cnss) == 0: continue

                qname, sname = qfeat['accn'], sfeat['accn']
                print >> fcnss, "%s,%s,%s,%s,%s" % (qfeat['seqid'], qname, sfeat['seqid'], sname,
                                 ",".join(map(lambda l: ",".join(map(str,l)),cnss)))
    finally:
        # Always release the worker processes, even if a helper raised.
        pool.close()
        pool.join()

    return None
Example #16
0
def main(cns_file,
         qdups_path,
         sdups_path,
         pair_file,
         fmt,
         qbed,
         sbed,
         qpad,
         spad,
         blast_path,
         unmasked_fasta,
         mask='F',
         ncpu=8):
    pool = Pool(ncpu)
    bl2seq = "%s " % blast_path + \
            "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
            " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
             -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
             | grep -v 'WARNING' | grep -v 'ERROR' "

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(
        sbed) if qbed.filename != sbed.filename else qfastas

    ################# file paths #####################
    qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed"
    qlocaldups_path = qbed.path.split(".")[0] + ".localdups"
    slocaldups_path = sbed.path.split(".")[0] + ".localdups"
    npair_file, nqlocaldups, nslocaldups, ncns_file = map(
        make_copy_of_file,
        [pair_file, qlocaldups_path, slocaldups_path, cns_file])
    ##########################################

    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
    print len(dups), len(rdups)
    ldups = get_large_dups(dups, qdups, sdups)

    rdups_dic = defaultdict(dict)
    rdups_both = [(qparent, sparent) for qparent, sparent in dups
                  if qparent in rdups and sparent in rdups]
    for (qparent, sparent) in dups:
        if skip_pair(qparent, sparent, rdups, rdups_both, ldups): continue
        cnss_size = []
        qfeat_dups = get_all_dups(qdups, qparent)
        sfeat_dups = get_all_dups(sdups, sparent)
        pairs = [True]
        _get_dups_gen = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

        def get_dups_gen():
            try:
                return _get_dups_gen.next()
            except StopIteration:
                return None

        while any(pairs):
            cnss_dups = []
            pairs = [get_dups_gen() for i in range(ncpu)]
            ###this is for parellization#########
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map = [bl2seq] * len(pairs)
            ###################################
            cmds = [
                c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,
                               qfastas_map, sfastas_map, qpad_map, spad_map)
                if c
            ]
            results = (
                r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
                if not res.strip(): cnss = []
                else:
                    cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                       qpad, spad, unmasked_fasta)
                print >> sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str, l)), cnss))
                cnss_size.append(
                    (len(cnss) * -1, qfeat["start"], sfeat["start"],
                     qfeat["accn"], sfeat["accn"], cnss_fmt))
            pairs = [pairs[-1]]
        ######################################################################
        if qparent in rdups:
            if (qparent, sparent) in rdups_dic[qparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[qparent].update({(qparent, sparent): cnss_size})
        elif sparent in rdups:
            if (qparent, sparent) in rdups_dic[sparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[sparent].update({(qparent, sparent): cnss_size})
        else:
            cnss_size.sort()
            cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = cnss_size[
                0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >> sys.stderr, "FINAL: {0},{1},{2}".format(
                qaccn, saccn, cns_number)
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           cnss_size, qparent, sparent, qfeat, sfeat, qdups,
                           sdups)

    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        qparent, sparent = dparents
        ### one or list? cnss[0]?
        cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = best_reps[
            dparents]
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best_reps[dparents]], qparent, sparent, qfeat, sfeat,
                       qdups, sdups)

    write_nolocaldups(
        qbed.path, nqlocaldups,
        "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]))
    write_nolocaldups(
        sbed.path, nslocaldups,
        "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]))
    pairs_to_qa(npair_file, 'pair',
                "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]),
                "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]),
                "{0}.raw.filtered.local".format(options.pairs.split(".")[0]))
Example #17
0
            print "Error: cant write the file %s.%s" % (base,ext)
        else:
            for line in open(extractedDir+"/"+base+"."+ext, "r"):

                if len(line) > 5:
                    begin = 0
                    end = 0
                    length = 0
                    output = ""
                    for i in configList:
                        length=int(i)
                        end = begin + length
                        output += line[begin:end]+";"
                        begin = end
                    dest.write(output+"\n")
            dest.close()
            print "Done parsing %s.%s" % (base, ext)
    else:
        print "File does not exist %s.%s" % (base, ext)

# Collect a (base, ext) tuple for every file in extractedDir whose extension
# is "dat" or purely numeric, then parse them all in a process pool.
files = []
for f in os.listdir(extractedDir):
    if f == '.DS_Store':
        continue
    # Split on the LAST dot only, so names with several dots no longer break
    # the two-way unpacking; names without any dot are skipped.
    base, dot, ext = f.rpartition('.')
    if not dot:
        continue
    # isdigit() replaces int(ext): int() raised ValueError on non-numeric
    # extensions and evaluated falsy for all-zero ones such as "000".
    if ext == 'dat' or ext.isdigit():
        files.append((base,ext))

# Randomise the processing order -- presumably to spread uneven file sizes
# across the workers; TODO confirm.
random.shuffle(files)
p = Pool()
try:
    p.map(parse, files)
finally:
    # Release the worker processes once every file has been handled.
    p.close()
    p.join()