Example #1
File: phylo.py Project: mb1511/PLSA
def make_single_genes(seq_lists, wd='src'):
    # read the first list once just to count how many genes each holds
    num_genes = len(fasta.fasta_read(seq_lists[0], generator=False))
    for i in xrange(num_genes):
        start = True
        gene_list = ''
        for faa in seq_lists:
            title = os.path.basename(faa)
            # re-read each list for this index (keeps memory low at the
            # cost of re-parsing every file per gene)
            seqs = fasta.fasta_read(faa, generator=False)
            if start:
                start = False
                # record the accession of gene i against its source file
                with open(join(wd, 'sampling/accs%d.txt' % i), 'a') as s:
                    s.write(title + '\t' + seqs[i].name[1:seqs[i].name.find(' ')] + '\n')
            # append gene i from this genome, headed by the file name
            gene_list += '>' + title + '\n'
            gene_list += seqs[i].seq
            gene_list += '\n'
        yield (i, gene_list)
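A minimal usage sketch (the .faa paths are hypothetical; assumes the same fasta/os imports as above). Each yielded string is a complete FASTA block for gene i across all genomes, ready to write out:

lists = ['genome1.faa', 'genome2.faa', 'genome3.faa']  # hypothetical paths
for i, genes in make_single_genes(lists, wd='src'):
    with open('src/single_gene_%d.fasta' % i, 'w') as out:
        out.write(genes)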
Example #2
def run(n1, n2, n1_nam='genome1', n2_nam='genome2', k=19, mp=''):
    '''Return MUMi value for 2 genomes'''
    if not mp:
        mp = 'mummer'
    # sequence generators
    n1_s = fasta.fasta_read(n1)
    n2_s = fasta.fasta_read(n2)

    n1_l = 0
    n2_l = 0

    # concatenate contigs into single-record files, because MUMi
    # cannot process multiple contigs
    with tempfile.NamedTemporaryFile(delete=True) as f1:
        with tempfile.NamedTemporaryFile(delete=True) as f2:
            f1.write('>temp_seq_1\n')
            for seq in n1_s:
                f1.write(seq.seq)

            f2.write('>temp_seq_2\n')
            for seq in n2_s:
                f2.write(seq.seq)

            # flush buffered writes so mummer sees the full sequences
            f1.flush()
            f2.flush()

            # run MUMmer
            cmd = [mp, '-mum', '-b', '-c', '-l', '%d' % k, f1.name, f2.name]
            proc = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE)
            o, e = proc.communicate()

            # parse the genome lengths from MUMmer's stderr log
            for line in e.splitlines():
                if not n1_l and f1.name in line and 'length' in line:
                    n1_l = int(line[line.rfind(' ') + 1:])
                if not n2_l and f2.name in line and 'length' in line:
                    n2_l = int(line[line.rfind(' ') + 1:])

            # compute MUMi from the MUMmer output
            m = '%.5f' % give_mumi.get(from_text=o, l1=n1_l, l2=n2_l)

    return '\t'.join((n1_nam, n2_nam, m))
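A usage sketch (hypothetical genome files; assumes the mummer binary is available):

# hypothetical inputs; 'mummer' must be on PATH (or pass mp='/path/to/mummer')
line = run('g1.fna', 'g2.fna', n1_nam='strain_A', n2_nam='strain_B', k=19)
print(line)  # tab-separated: name1, name2, MUMi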
Example #3
File: f_con.py Project: mb1511/PLSA
def run_new(start='', d='blast_data', cov=65, wd='', **kw):

    num_threads = kw.pop('num_threads', 7)
    # one BLAST database per .phr index file in the data directory
    dbs = [os.path.join(d, f[:-4]) for f in os.listdir(d) if f.endswith('.phr')]
    ref_genes = fasta.fasta_read(start, generator=False)
    matches = {}

    if not os.path.isdir(os.path.join(wd, 'ref_dbs')):
        os.mkdir(os.path.join(wd, 'ref_dbs'))

    # segment dbs into blocks (size = num_threads) - blocks don't have
    # to be full; process each block, refine the reference set, then
    # repeat until all blocks are completed
    blocks = []
    for i in xrange(0, len(dbs), num_threads):
        blocks.append(dbs[i:i + num_threads])

    for block in blocks:
        args = zip(range(len(block)), block)
        to_remove = []
        print len(ref_genes)
        for i, res in enumerate(
            comps(
                arg_list=args,
                cov=cov,
                ref=ref_genes,
                cores=num_threads)):
            for r in res:
                if r[1] is None:
                    # no hit above the coverage cut-off in this db
                    to_remove.append(r[0])
                else:
                    try:
                        matches[r[0]][block[i]] = r[1]
                    except KeyError:
                        matches[r[0]] = {block[i]: r[1]}
        # remove 0 score proteins from master list
        for r in set(to_remove):
            print matches.pop(r)[dbs[5]].index, 'Removed'
        # refine: carry forward the hits against dbs[5]
        # (assumes at least 6 databases, i.e. num_threads >= 6)
        ref_genes = [matches[g][dbs[5]] for g in matches]

    # write the conserved gene set found for each database
    for db in dbs:
        with open(os.path.join(wd, 'ref_dbs', os.path.basename(db)), 'w') as s:
            for g in matches:
                s.write(matches[g][db].fasta)
                s.write('\n')
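A usage sketch with a hypothetical layout (blast_data/ holding pre-built protein databases plus a starting gene set); note the dbs[5] refinement step assumes at least six databases:

# hypothetical paths; blast_data/ must contain at least six .phr databases
run_new(start='ref_genome.faa', d='blast_data', cov=65, wd='run1',
        num_threads=7)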
Example #4
File: f_con.py Project: mb1511/PLSA
def compare(database, file_path=None, from_text=None, from_list=None, cov=65, remove=None, **kw):
    # avoid a shared mutable default argument
    if remove is None:
        remove = []
    removed = 0

    # implemented for multi blast
    prot = _faa_dict.faa(database)
    if not from_list:
        g = fasta.fasta_read(file_path=file_path, from_text=from_text, generator=False)
    else:
        g = from_list
    print len(g)
    ref = []
    # create blast query
    with open('src/tools/blast/query.fasta', 'w') as q:
        q.write('\n'.join(gene.fasta for gene in g))
    # run (multi) blast search
    aln = local_blast.run(db_path=database, mr=1, make_db=False,
                          r_a=True, outfmt='details', join_hsps=True, **kw)

    for index, gene in enumerate(g):
        try:
            # grab top alignment (position 0) % similarity
            a = aln[index][0]
            sim = a.psim
        except IndexError:
            # no record - drop this gene from every earlier result list
            for j in xrange(len(remove)):
                del remove[j][index - removed]
            removed += 1
            continue

        if sim < cov:
            # psim below the cut-off - drop this gene as well
            for j in xrange(len(remove)):
                del remove[j][index - removed]
            removed += 1
            continue
        else:
            print index, gene.name[1:gene.name.find(' ')], a.h_acc, sim
            # matched lists
            ref.append(fasta.sequence(a.h_def, prot[a.h_acc]))
            # keep track of genes to keep from previous queries
            #indices.append(index)

    remove.append(ref)
    return remove
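A usage sketch (hypothetical paths): threading the returned list back in as remove lets each successive database prune the shared gene set.

# hypothetical paths; each call appends its matched list to 'kept'
kept = compare('blast_data/genome1', file_path='ref_genes.faa', cov=65)
kept = compare('blast_data/genome2', file_path='ref_genes.faa', cov=65,
               remove=kept)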
Example #5
def fasta_chunks(fname, chunk_size=5000):
    """ Split fasta into chunks and save to disk (frees up RAM during
    the run). For continuity, a list of contig_ids and chunk numbers
    is pickled for back-checking at the end of the process with
    checkout.py. """
    fas = fasta_read(fname)
    chunk_num = math.ceil(len(fas) / chunk_size)

    # iterate over 1-based chunk numbers
    for i in range(1, chunk_num + 1):
        if len(fas) > chunk_size:
            # take the first chunk_size records and drop them from fas
            chunk = dict(list(fas.items())[:chunk_size])
            fas = dict(list(fas.items())[chunk_size:])
        else:
            # else take all remaining records
            chunk = fas
        fasta_write(chunk, "temp/chunk_%s.fas" % i)
        log("%s written to /temp" % i)

    return chunk_num
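A usage sketch (hypothetical input; a temp/ directory must already exist for the chunk files):

n_chunks = fasta_chunks('assembly.fas', chunk_size=5000)  # hypothetical file
print('%d chunks written to temp/' % n_chunks)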
Example #6
File: phylo.py Project: mb1511/PLSA
def make_gene_list(seq_lists, size=10, repeats=1000, wd='src'):
    '''
    Generator Function to create samples
    
    seq_lists    - sequence of file paths to conserved gene lists
    size         - size of sample
    repeats      - number of samples to take
    '''
    # TODO: add check to see if samples dir is empty
    for rep in xrange(repeats):
        gene_list = ''
        start = True
        max_l = 0
        for faa in seq_lists:
            title = os.path.basename(faa)
            seqs = fasta.fasta_read(faa, generator=False)
            if start:
                start = False
                max_l = len(seqs)
                # get some random indices for this sample
                indices = [i for i in np.random.choice(range(len(seqs)),
                                                       size,
                                                       replace=False)]
                for i in indices:
                    # record the accession of each sampled gene
                    seq = seqs[i]
                    with open(join(wd, 'sampling/accs%d.txt' % rep), 'a') as s:
                        s.write(title + '\t' + seq.name[1:seq.name.find(' ')] + '\n')
            else:
                # every gene list should be the same length - if not,
                # something has gone wrong somewhere
                assert len(seqs) == max_l

            # concatenate the sampled genes, headed by the file name
            gene_list += '>' + title + '\n'
            gene_list += ''.join(seqs[i].seq for i in indices)
            gene_list += '\n'
        yield (rep, gene_list)
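A usage sketch (hypothetical conserved-gene lists, one per genome; assumes a sampling/ directory exists under wd):

lists = ['genome1_conserved.faa', 'genome2_conserved.faa']  # hypothetical
for rep, genes in make_gene_list(lists, size=10, repeats=100, wd='src'):
    with open(join('src', 'sampling/sample_%d.fasta' % rep), 'w') as out:
        out.write(genes)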