Beispiel #1
0
def rename(direc, db, files):
    '''
    Pipeline helper (not bioinformatics proper): interactively replace the
    names of the files generated by cluster.py with names typed by the user.

    For every entry of `files`, the first record of the matching file under
    `direc`/nt is read and its defline is split on ', ' into sequence ids.
    Each id is looked up in `db` and shown to the user; if any id is missing
    from `db`, "* (none)" is printed and the file is skipped. The user is then
    asked for a new name (an empty answer skips the file). If
    <name>.fasta already exists in the nt directory, " (1)", " (2)", ... is
    appended until an unused name is found.

    Returns the list of renamed .clustalw paths (nt first, then aa, for each
    file that was renamed successfully).
    '''

    renamed = []
    lookup = dict((record.name, record) for record in io.open(db, 'r'))
    nt_dir = direc + 'nt' + sep
    aa_dir = direc + 'aa' + sep

    for f in files:
        first = io.open(nt_dir + f, 'r').next()
        members = first.defline.split(', ')
        print("File\033[33;1m", f,
              "\033[0mis described by the following sequences:")
        try:
            for member in members:
                # A missing id raises KeyError here, before anything is
                # printed for that id.
                hit = lookup[member]
                print("* " + hit.name + ': ' + hit.defline.split('[')[0])
        except KeyError:
            print("* (none)")
            continue

        answer = get_input("\033[33;1mWhat should we call this file " +
                           "(or hit enter to skip)? \033[0m")
        old_stem = f[:f.find('.')]
        if answer == "":
            continue

        attempt = 0
        while True:
            if attempt > 0:
                new_stem = answer + (" (%d)" % attempt)
            else:
                new_stem = answer
            try:
                # Being able to open the target means the name is taken.
                open(nt_dir + new_stem + ".fasta", 'r').close()
            except IOError:
                print("Renaming " + old_stem + ".* to " + new_stem + ".*")
                try:
                    for ext in (".fasta", ".clustalw"):
                        for folder in (nt_dir, aa_dir):
                            mv(folder + old_stem + ext,
                               folder + new_stem + ext)
                    renamed.append(nt_dir + new_stem + '.clustalw')
                    renamed.append(aa_dir + new_stem + '.clustalw')
                except OSError:
                    # Best effort: a failed move leaves this file unreported.
                    pass
                break
            attempt += 1
    return renamed
Beispiel #2
0
    def run_clustal():
        '''
        Worker: drain the queue `q` of id sets. For each id set, write the
        matching cluster sequences to <md5>.fasta under the nt and aa
        directories of `direc`, align both files with clustal, and record the
        fasta name in `filenames`.

        `q.task_done()` is called for every item taken, even when processing
        fails, so a `q.join()` in the caller cannot hang on a failed item.
        '''
        # NOTE(review): empty() followed by a blocking get() is racy when
        # several workers share `q`; a non-blocking get that handles the
        # queue's Empty exception would be safer. Left as is because that
        # exception class is not visibly in scope in this snippet.
        while not q.empty():
            cid = q.get()
            try:
                # The file stem is the md5 of the space-joined ids.
                dig = hashlib.md5()
                dig.update(' '.join(cid))
                dig = dig.hexdigest()

                fpre = direc + 'nt' + sep + dig
                apre = direc + 'aa' + sep + dig
                fname = fpre + ".fasta"
                aname = apre + ".fasta"

                fh = io.open(fname, 'w')
                try:
                    ah = io.open(aname, 'w')
                    try:
                        for ipt in clusters:
                            counter = 0
                            # Input file name: no directory, no extension,
                            # whitespace replaced by underscores.
                            name = '_'.join(
                                ipt.split(sep)[-1].split('.')[0].split())
                            for cluster in clusters[ipt]:
                                # Keep clusters sharing at least one id.
                                if cid & cluster[0]:
                                    nm = name + '_' + str(counter)
                                    seq = cluster[1]
                                    curr = sequ.Sequence(
                                        nm, seq, defline=', '.join(cid))
                                    tr = tran.translate(curr)
                                    tr.name = curr.name
                                    fh.write(curr)
                                    ah.write(tr)
                                    counter += 1
                    finally:
                        ah.close()
                finally:
                    fh.close()

                try:
                    clustal.run(fname, fpre + '.clustalw')
                    clustal.run(aname, apre + '.clustalw')
                    filenames.append(dig + '.fasta')
                except ValueError:
                    # Alignment failed for this cluster: do not report it.
                    pass
            finally:
                q.task_done()
Beispiel #3
0
def run(infile, outfile, **kwargs):
    '''
    Align the sequences in `infile` with clustalw, writing the alignment to
    `outfile`.

    Nothing is run (and None is returned) unless `infile` holds at least two
    sequences. The alignment type is taken from the `type` attribute of the
    second sequence read: 'nucl' selects -TYPE=DNA, anything else
    -TYPE=PROTEIN.

    Keyword arguments are matched case-insensitively against the options
    recognised for the alignment type (OUTORDER, GAPOPEN, GAPEXT, plus
    DNAMATRIX for DNA or MATRIX for protein). Recognised ones are forwarded
    to clustalw as -NAME=value; all others are ignored. Options that are not
    supplied are left to clustalw's own defaults.

    After clustalw returns, the file <infile minus extension>.dnd is removed
    (presumably the guide tree clustalw leaves behind); `remove` raises if it
    does not exist.
    '''
    import os  # local import: only os.devnull is needed here

    # Read no further than the second record: that is enough to know there
    # is something to align and what kind of sequence it is.
    count = 0
    seqtype = None
    for seq in io.open(infile, 'r'):
        count += 1
        if count > 1:
            seqtype = seq.type
            break
    if count < 2:
        return

    if seqtype == 'nucl':
        moltype = 'DNA'
        recognised = ('OUTORDER', 'GAPOPEN', 'GAPEXT', 'DNAMATRIX')
    else:
        moltype = 'PROTEIN'
        recognised = ('OUTORDER', 'GAPOPEN', 'GAPEXT', 'MATRIX')

    # Normalise the caller's keyword names once so that e.g. gapopen=5 and
    # GAPOPEN=5 both forward the value the caller actually supplied.
    supplied = dict((name.upper(), value) for name, value in kwargs.items())
    others = ["-%s=%s" % (name, supplied[name])
              for name in recognised if name in supplied]

    ignore = open(os.devnull, 'w')
    try:
        subprocess.call(["clustalw", "-INFILE=" + infile, "-ALIGN",
                         "-TYPE=" + moltype, "-OUTFILE=" + outfile] + others,
                        stdout=ignore)
    finally:
        ignore.close()

    pos = infile.rfind('.')
    if pos > -1:
        prefix = infile[:pos]
    else:
        prefix = infile
    remove(prefix + '.dnd')
Beispiel #4
0
def var(files):
    '''
    Returns plot data and metadata for plotting later on in the pipeline.

    `files` is an iterable of alignment file paths. Files are grouped by the
    number of sequences they hold together with the set of their ungapped
    sequences (nucleotide sequences are translated first), so that a
    nucleotide alignment ends up with the amino acid alignment of the same
    genes. A file containing sequences of more than one type is treated as
    'prot'.

    This is a generator. For each group it yields a dict with
      * 'plotdata': {'nt': SaySNPs(nucl_file), 'aa': SaySNPs(prot_file)}
      * 'metadata': {'strain': names, 'filename': names + '.pdf'}
    where `names` is the ', '-joined set of the group's file names with the
    directory and a trailing .clustalw/.clustal/.aln extension removed.

    Raises TypeError (on first iteration) if a group contains alignments of
    only one sequence type.
    '''
    groups = {}
    for f in files:
        records = [s for s in io.open(f)]
        kinds = set(s.type for s in records)
        if len(kinds) > 1:
            kinds = set(['prot'])
        kind = kinds.pop()
        residues = [''.join(s.seq.split('-')).strip() for s in records]
        if kind == 'nucl':
            residues = [translate(r) for r in residues]
        key = (len(residues), frozenset(residues))
        groups.setdefault(key, set()).add((kind, f))

    def trim(name):
        # Test the name being trimmed, not a leftover variable from an
        # enclosing loop.
        if name.endswith(('.clustalw', '.clustal', '.aln')):
            return '.'.join(name.split('.')[:-1])
        return name

    # All groups are paired up before anything is yielded, so an unmatched
    # alignment is reported before any SaySNPs work is done.
    couples = []
    for partners in groups.values():
        names = ', '.join(set(trim(path.split(sep)[-1])
                              for _, path in partners))
        pair = {}
        for kind, path in partners:
            if len(pair) == 2:
                break
            if kind in pair:
                continue
            pair[kind] = path
        if 0 < len(pair) < 2:
            raise TypeError("Unmatched clustal alignment(s): " +
                            ", ".join(path for _, path in partners))
        if len(pair) == 0:
            continue
        couples.append((pair['nucl'], pair['prot'], names))

    for nt, aa, strain in couples:
        plotdata = {
            'nt': SaySNPs(nt),
            'aa': SaySNPs(aa)
        }
        metadata = {'strain': strain, 'filename': strain + '.pdf'}

        yield {'plotdata': plotdata, 'metadata': metadata}
    # Falling off the end finishes the generator; an explicit
    # `raise StopIteration` here becomes RuntimeError under PEP 479.
Beispiel #5
0
def SaySNPs(input):
    '''
    Read the alignment file `input` and return a dictionary of the data
    needed to plot sequence variation:

    * `var`: one value per alignment column, computed as 1 minus the sum of
      the squared frequencies of the characters seen in that column (spaces
      are not counted),
    * `starts`: the start of each distinct (start, end) gene model,
    * `ends`: the end of each distinct (start, end) gene model, and
    * `count`: the number of sequences sharing that gene model.

    `starts`, `ends` and `count` are parallel lists ordered by (start, end),
    so the nth entry of each describes the same gene model.
    '''

    columns = []
    models = {}
    for record in io.open(input, 'r'):
        span = (record.start, record.end)
        if span in models:
            models[span] += 1
        else:
            models[span] = 1

        # Only the first (end - start + 1) characters of the record count.
        width = span[1] - span[0] + 1
        for (pos, residue) in zip(xrange(width), record):
            if pos >= len(columns):
                columns.append({})
            if residue == " ":
                continue
            tally = columns[pos]
            tally[residue] = tally.get(residue, 0) + 1

    variation = []
    for tally in columns:
        total = float(sum(tally.values()))
        squares = sum((seen / total) ** 2 for seen in tally.values())
        variation.append(1.0 - squares)

    ordered = sorted(models)
    starts = [span[0] for span in ordered]
    ends = [span[1] for span in ordered]
    counts = [models[span] for span in ordered]

    return {
        'var': variation,
        'starts': starts,
        'ends': ends,
        'count': counts
    }
Beispiel #6
0
def run(db, sfile, mega_blast=False, **kwargs):
    '''
    Takes a database and a query and runs the appropriate type of BLAST on
    them. The database can be an existing BLAST database or a fasta/fastq
    file. If it is a sequence file, this function will look in the places
    where BLAST would look for an existing database created from that file and
    use that instead. If there is no such database, this function will make
    one for you and then use the newly created database with BLAST.

    Optional named arguments can currently only be `evalue`, `num_threads`,
    `gapopen`, or `gapextend`. They correspond to the BLAST options of the
    same name. When `mega_blast` is true the `megablast` program is run
    instead and only the named arguments `e` and `a` are forwarded, if given.

    Returns a `Result` wrapping an iterator over the lines of the BLAST
    program's standard output.

    Raises IOError if `db` is neither an existing BLAST database nor a
    sequence file from which the sequence type can be read.
    '''
    import os  # local import: os.path.expanduser and os.devnull

    cmds = {
        'prot': {
            'prot': 'blastp',
            'nucl': 'tblastn'
        },
        'nucl': {
            'nucl': 'blastn',
            'prot': 'blastx'
        }
    }

    # The query type is taken from the first sequence of the query file.
    seq = io.open(sfile, 'r').next()
    qtype = seq.type

    # Look for a BLASTDB setting in a .ncbirc file in the current directory,
    # the home directory and $NCBI; a later location overrides an earlier one.
    rcloc = ''
    for loc in (".:~:" + (getenv("NCBI") or "")).split(':'):
        loc = os.path.expanduser(loc)
        if loc and loc[-1] != sep:
            loc += sep
        try:
            rcfile = open(loc + '.ncbirc', 'r')
            try:
                for line in (l.strip() for l in rcfile):
                    pos = line.find('=')
                    if pos >= 0 and line[:pos].strip() == "BLASTDB":
                        rcloc = line[pos + 1:].strip()
            finally:
                rcfile.close()
        except IOError:
            pass

    # Is `db` already a BLAST database? Check for its protein (.pin) or
    # nucleotide (.nin) index as given, in the current directory, in
    # $BLASTDB and in the .ncbirc location.
    dbtype = None
    bdbenv = getenv("BLASTDB")
    dblocations = (":." + ((':' + bdbenv) if bdbenv else '') +
                   ((':' + rcloc) if rcloc else '')).split(':')
    for loc in dblocations:
        if loc and loc[-1] != sep:
            loc += sep
        try:
            open(loc + db + '.pin', 'r').close()
            dbtype = 'prot'
            break
        except IOError:
            try:
                open(loc + db + '.nin', 'r').close()
                dbtype = 'nucl'
                break
            except IOError:
                pass

    # An existing database is used under the name given. Otherwise `db` must
    # be a sequence file from which a database is located or built.
    if not dbtype:
        odb = db
        pos = db.rfind(".")
        for seq in io.open(db, 'r'):
            dbtype = seq.type
            break
        if not dbtype:
            raise IOError("Database not found: " + odb)

        ndb = None
        sp = db.rfind(sep)
        if pos <= sp:
            # No extension in the file name itself: keep the whole name.
            pos = len(db)
        if sp > -1:
            dbdir, db = db[:sp], db[sp + 1:pos]
        else:
            dbdir, db = '.', db[:pos]

        # Scan the sequence file's directory for an index file whose header
        # names this sequence file as its source.
        for file in listdir(dbdir):
            dpos = file.rfind('.')
            if dpos >= 0 and file[dpos + 1:] == dbtype[0] + 'in':
                fh = open(dbdir + sep + file, 'r')
                try:
                    # The 12th byte is used as the length of the name that
                    # follows it.
                    c = ord(fh.read(12)[-1])
                    fname = fh.read(c)
                finally:
                    fh.close()
                if fname[:1] in ("'", '"'):
                    fname = fname[1:-1]
                if fname.endswith(odb):
                    ndb = dbdir + sep + file[:dpos]
                    break
        if not ndb:
            ndb = '_'.join(db.split())

            try:  # possible race condition
                open(ndb, 'r').close()
            except IOError:
                ignore = open(os.devnull, 'w')
                try:
                    # The embedded quotes are passed on to makeblastdb
                    # unchanged, as in the original call.
                    subprocess.call(["makeblastdb", "-in", '"%s"' % odb,
                                     "-out", ndb, "-dbtype", dbtype],
                                    stdout=ignore)
                finally:
                    ignore.close()
                try:
                    for suff in ['in', 'hr', 'sq']:
                        name = ndb + '.' + dbtype[0] + suff
                        shutil.move(name, dbdir + sep + name)
                except shutil.Error:
                    pass
            db = dbdir + sep + ndb
        else:
            db = ndb

    allowed = set(["evalue", "gapopen", "gapextend", "num_threads"]) & \
        set(kwargs.keys())
    cmd = cmds[qtype][dbtype]
    pn = ["-db", "-query"]
    if mega_blast:
        cmd = "megablast"
        pn = ["-d", "-i"]
        # Forward only the megablast options the caller actually supplied.
        allowed = [k for k in ("e", "a") if k in kwargs]

    proc = subprocess.Popen([cmd, pn[0], db, pn[1], sfile] +
                            [arg for pair in
                             [["-" + k, str(kwargs[k])] for k in allowed]
                             for arg in pair],
                            bufsize=1, stdout=subprocess.PIPE)
    return Result(iter(proc.stdout.readline, ''))
Beispiel #7
0
def GeneFromBLAST(db, sequences, pref, names):
    '''
    BLASTs database against sequences, and for those results that pass the
    length and percent identity requirements, attempt to locate the full gene
    that corresponds to that BLAST hit. Genes that are found are saved in the
    subdirectory sequences under the given directory, divided depending on
    whether the sequence is amino acid or nucleotide.

    Arguments:
    * `db`: path of the sequence file holding the reference sequences.
    * `sequences`: path of the sequence file that is searched for ORFs. If it
      cannot be opened (IOError), the function logs that and returns early.
    * `pref`: file name prefix; output goes to <pref>.fasta and <pref>.gff3
      inside options.DIRECTORY + 'sequences'.
    * `names`: list to which the path of the written fasta file is appended.

    Returns None.
    '''
    # While True, target() keeps waiting for more BLAST results; it is set to
    # False once every result has been queued so the workers can finish.
    PIPING = True
    wd = options.DIRECTORY + 'sequences' + sep

    # Create the output directories; an OSError (e.g. directory already
    # there) is ignored.
    for d in [options.DIRECTORY, wd]:
        try:
            mkdir(d)
        except OSError:
            pass

    # Reference sequences, keyed by name.
    subj = dict((s.name, s) for s in io.open(db, 'r'))
    options.debug("Database sequences loaded from file %s." % db)

    try:
        # Every ORF of every query sequence, keyed by the sequence name.
        orfs = dict((s.name, [orf for orf in ORFGenerator(s)])
                    for s in io.open(sequences, 'r'))
        options.debug("ORFs loaded from file %s." % sequences)
    except IOError:
        options.debug("No file \"" + sequences + ",\" skipping.")
        return

    def target():
        # Consumer: takes BLAST results from `qin`, aligns the overlapping
        # ORFs against the hit's subject and puts the best match on `qout`.
        while 1:
            try:
                # NOTE(review): assumes ThreadQueue.get(block, timeout) has
                # queue.Queue semantics - confirm against ThreadQueue.
                res = qin.get(PIPING, 1)
            except queue.Empty:
                # Nothing available: stop once the producer has finished,
                # otherwise keep waiting.
                if not PIPING:
                    break
                else:
                    continue

            qname, sname = res['query']['name'], res['subject']['name']
            start, end = res['query']['start'], res['query']['end']
            alignments = []
            # (best identity so far, best candidate); a candidate has to
            # reach options.MIN_IDENTITY to be accepted.
            max_match = (options.MIN_IDENTITY, None)

            # Alignments are made against the subject as protein.
            if subj[sname].type == 'nucl':
                subject = translate(subj[sname])
            else:
                subject = subj[sname]

            # The query name in the result may carry extra trailing
            # characters: trim one character at a time until it matches a
            # key of `orfs`.
            while qname:
                try:
                    o = orfs[qname]
                    break
                except KeyError:
                    qname = qname[:-1]
            if not qname:
                # No ORFs known for this query: drop the result.
                qin.task_done()
                continue

            for orf in o:
                if in_range(orf, start, end, 0):
                    # Drop the last three bases before translating
                    # (presumably the stop codon - confirm with ORFGenerator).
                    orf = orf[:-3]
                    query = translate(orf)
                    options.debug("Aligning %33s v. %33s." % (qname, sname))
                    alignment = align(subject.seq, query.seq)
                    alignments.append((orf, sname, alignment))

            for orf, refname, aln in alignments:
                hitlen = aln['sublength']
                # Keep the last 3 * hitlen bases of the ORF.
                region = orf[-3 * hitlen:]
                identity = float(aln['identities']) / aln['length']
                # >= means that of equally good candidates the last one wins.
                if identity >= max_match[0]:
                    max_match = (identity, (region, sname, aln))

            if max_match[1]:
                seq, name, _ = max_match[1]
                # Subject defline text before its first '[' is reused as the
                # description of the new sequence.
                odl = subject.defline.split('[')[0].strip()
                src = seq.original.name
                start, end, strand = seq.start, seq.end, seq.step
                defline = '%s[source=%s] [start=%d] [end=%d] [strand=%d]' % \
                    (odl + (' ' if odl else ''), src, start, end, strand)

                new = Sequence(name.strip(), seq.seq, defline=defline,
                               original=seq.original, type=seq.type,
                               start=seq.start, end=seq.end, step=seq.step)
                qout.put(new)
            qin.task_done()

    def in_range(seq, start, end, frame):
        # True when the ORF overlaps the span start..end and one of the ORF's
        # end points is congruent modulo 3 with the larger end point of the
        # span. `frame` is converted to int but otherwise unused.
        ss, se = sorted((seq.start, seq.end))
        os, oe = sorted((start, end))
        frame = int(frame)

        return (ss < oe and se > os and (se % 3 == oe % 3 or ss % 3 == oe % 3) ) 
        

    qout = queue.Queue()
    # NOTE(review): presumably ThreadQueue starts worker threads that run
    # target() - confirm against its definition.
    qin = ThreadQueue(target)

    blastopts = {
        'evalue': options.MAX_EVALUE,
        'num_threads': options.NUM_THREADS
    }

    # Producer: queue every BLAST result that passes the e-value, identity
    # and length filters.
    for res in BLAST.run(db, sequences, **blastopts):
        if float(res['expect']) > options.MAX_EVALUE:
            continue

        sbjl = len(subj[res['subject']['name']])
        # The percentage is read from between '(' and the last two
        # characters of the identities field (presumably "n/m (p%)").
        ident = float(res['identities'].split('(')[1][:-2]) / 100
        # Fraction of the subject covered by the hit.
        lerr = float(res['subject']['length']) / sbjl

        if ident >= options.MIN_IDENTITY:
            if lerr >= (1.0 - options.LENGTH_ERR):
                qin.put(res)
    PIPING = False
    options.debug("BLAST done.")

    # Help drain the queue from the calling thread, then wait until every
    # queued result has been marked done.
    target()
    qin.join()
    options.debug("Done Aligning sequences.")

    options.debug("Now writing sequences (%d)." % qout.qsize())
    # Written sequences grouped by their sequence string.
    seqs = {}
    nuc_file = io.open(wd + pref + '.fasta', 'w')
    count = 0
    while 1:
        try:
            seq = qout.get(False)
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq)
            nuc_file.write(seq)
            count += 1
            options.debug("Wrote %s (%d)." % (seq.name, count));
        except queue.Empty:
            break
    nuc_file.close()
    # NOTE(review): this message repeats the one logged after alignment
    # although it follows the writing step - confirm the intended wording.
    options.debug("Done Aligning sequences.")

    gh = io.open(wd + pref + '.gff3', 'w')
    names.append(wd + pref + '.fasta')

    # One annotation per distinct sequence, built from an arbitrary member of
    # the group; the names of all members are listed as homologs.
    for id in seqs:
        gh.write(ann(seqs[id].copy().pop(), pref, 'gene',
                     homologs=','.join(s.name for s in seqs[id])))
    gh.close()
Beispiel #8
0
            seqs[seq.seq].add(seq)
            nuc_file.write(seq)
            count += 1
            options.debug("Wrote %s (%d)." % (seq.name, count));
        except queue.Empty:
            break
    nuc_file.close()
    options.debug("Done Aligning sequences.")

    gh = io.open(wd + pref + '.gff3', 'w')
    names.append(wd + pref + '.fasta')

    for id in seqs:
        gh.write(ann(seqs[id].copy().pop(), pref, 'gene',
                     homologs=','.join(s.name for s in seqs[id])))
    gh.close()


def run(subject, query, prefix, names):
    '''
    Entry point: forwards its arguments unchanged to GeneFromBLAST, with
    `subject` as the database, `query` as the sequences to search, `prefix`
    as the output prefix and `names` as the list that collects output paths.
    '''
    GeneFromBLAST(subject, query, prefix, names)


if __name__ == '__main__':
    # Ad-hoc check when run as a script: list the bounds of every ORF found
    # in the sequence file named by the first command line argument, with
    # TTG as the only start codon.
    import sys
    options.START_CODONS = ['TTG']
    for record in io.open(sys.argv[1], 'r'):
        print(record.name + ' ' + record.defline)
        for found in ORFGenerator(record):
            print('%d ... %d' % (found.start, found.end))
Beispiel #9
0
def run(direc, inputs):
    '''
    Takes a collection of files generated by gene prediction, creates clusters
    based off of the genes that have homology to those predicted genes, and
    creates new fasta files in the clusters sub directory under the given
    directory and separated according to whether they are nucleotide or amino
    acid sequences. These new fasta files are then used to create clustalw
    alignments of the genes if more than 1 sequence exists in the fasta file.

    Arguments:
    * `direc`: output directory prefix; files are written to `direc`nt and
      `direc`aa. The directories are created when `direc` is not empty.
    * `inputs`: paths of the sequence files to cluster.

    Returns the list of fasta file names (<md5>.fasta) for which both
    alignments were run without a ValueError.
    '''

    clusters = {}
    all_ids = set()
    ids = {}
    q = queue.Queue()
    filenames = []

    def run_clustal():
        '''Worker: write and align one cluster per id set until `q` is empty.'''
        while True:
            # A non-blocking get avoids the empty()/get() race in which
            # another worker takes the last item between the two calls and
            # this one blocks forever.
            try:
                cid = q.get_nowait()
            except queue.Empty:
                break
            try:
                # The file stem is the md5 of the space-joined ids.
                dig = hashlib.md5()
                dig.update(' '.join(cid))
                dig = dig.hexdigest()

                fpre = direc + 'nt' + sep + dig
                apre = direc + 'aa' + sep + dig
                fname = fpre + ".fasta"
                aname = apre + ".fasta"

                fh = io.open(fname, 'w')
                try:
                    ah = io.open(aname, 'w')
                    try:
                        for ipt in clusters:
                            counter = 0
                            # Input file name: no directory, no extension,
                            # whitespace replaced by underscores.
                            name = '_'.join(
                                ipt.split(sep)[-1].split('.')[0].split())
                            for cluster in clusters[ipt]:
                                # Keep clusters sharing at least one id.
                                if cid & cluster[0]:
                                    nm = name + '_' + str(counter)
                                    seq = cluster[1]
                                    curr = sequ.Sequence(
                                        nm, seq, defline=', '.join(cid))
                                    tr = tran.translate(curr)
                                    tr.name = curr.name
                                    fh.write(curr)
                                    ah.write(tr)
                                    counter += 1
                    finally:
                        ah.close()
                finally:
                    fh.close()

                try:
                    clustal.run(fname, fpre + '.clustalw')
                    clustal.run(aname, apre + '.clustalw')
                    filenames.append(dig + '.fasta')
                except ValueError:
                    # Alignment failed for this cluster: do not report it.
                    pass
            finally:
                # Always acknowledge the item so q.join() cannot hang on a
                # cluster whose processing raised.
                q.task_done()

    if direc:
        for d in [direc, direc + 'nt' + sep, direc + 'aa' + sep]:
            try:
                mkdir(d)
            except OSError:
                pass

    # Per input file: group the sequence names by identical sequence.
    for ipt in inputs:
        seqs = {}
        ids[ipt] = set()
        for seq in io.open(ipt, 'r'):
            ids[ipt].add(seq.name)
            all_ids.add(seq.name)
            if seq.seq not in seqs:
                seqs[seq.seq] = set()
            seqs[seq.seq].add(seq.name)
        clusters[ipt] = [(seqs[k], k) for k in seqs]
        del seqs

    # Partition the ids: start from everything clustered with `cid` in any
    # input, then for each input that contains `cid` keep only the ids in
    # the same cluster as `cid`, plus ids that input does not know about.
    sub_ids = []
    while all_ids:
        cid = all_ids.pop()
        subcluster = (all_ids | set([cid])) & \
            set(i for ipt in clusters for cluster in clusters[ipt]
                for i in cluster[0] if cid in cluster[0])

        for ipt in clusters:
            for cluster in clusters[ipt]:
                if cid in cluster[0]:
                    subcluster = (subcluster & cluster[0]) | \
                        (subcluster - ids[ipt])
        sub_ids.append(subcluster)
        all_ids -= subcluster

    for cid in sub_ids:
        q.put(cid)

    # options.NUM_PROCESSES - 1 helper threads; the calling thread works too.
    threads = []
    for i in range(options.NUM_PROCESSES - 1):
        curr = threading.Thread(target=run_clustal)
        threads.append(curr)
        curr.start()
    run_clustal()
    q.join()
    for curr in threads:
        curr.join()
    return filenames