コード例 #1
0
ファイル: backalign.py プロジェクト: bsmith89/deep-pipeline
def main():
    """Write one FASTA record to stdout for each pair of input records.

    Records are read in lockstep from the FASTA files named by
    ``sys.argv[1]`` and ``sys.argv[2]``.  For every pair, ``backalign`` is
    called with the first record's sequence and the second record's
    sequence with ``'-'`` characters removed; the result is written under
    the first record's id with an empty description.
    """
    aligned_recs = parse(sys.argv[1], 'fasta')
    source_recs = parse(sys.argv[2], 'fasta')
    for afa_rec, fn_rec in zip(aligned_recs, source_recs):
        # Both files must list the same records in the same order.
        assert afa_rec.id == fn_rec.id
        ungapped = fn_rec.seq.ungap('-')
        afn_str = backalign(afa_rec.seq, ungapped)
        out_rec = SeqRecord(Seq(afn_str), id=afa_rec.id, description="")
        write(out_rec, sys.stdout, 'fasta')
コード例 #2
0
ファイル: seqmaptosb.py プロジェクト: wangdi2014/snp2str
def retriving(a,b,c):
    """Collect PDB-chain and UniProt sequences into ``test.fasta``.

    Parameters:
        a: PDB identifier, passed as the argument to ``pdb_seq.py``.
        b: chain identifier; a PDB record is kept when the last character
           of the first ``'_'``-separated field of its id equals this.
        c: UniProt accession used to build the download URL.

    Side effects:
        Overwrites ``pdb.fasta`` with the standard output of
        ``python pdb_seq.py <a>`` and writes every collected record to
        ``test.fasta``.
    """
    pdbid = a
    chainid = b
    uniid = c
    my_record = []

    # ``with`` guarantees the handle is closed even if the subprocess or
    # the write fails.  communicate() already waits for the child to exit,
    # so no separate wait() call is needed.
    # NOTE(review): on Python 3 the captured output is bytes while the
    # file is opened in text mode -- confirm the intended interpreter.
    with open('pdb.fasta', 'w') as log:
        seqpy = Popen(["python", "pdb_seq.py", pdbid],
                      stdout=PIPE, stderr=PIPE)
        stdout = seqpy.communicate()[0]
        log.write(stdout)

    with open("pdb.fasta") as seqfile:
        for seq_record in parse(seqfile, "fasta"):
            r = seq_record.id.split('_')
            if r[0][-1] == chainid:
                my_record.append(seq_record)

    url = 'http://www.uniprot.org/uniprot/'+uniid+'.fasta'
    seqfile2 = urlopen(url)
    try:
        # Every record returned for the accession is kept.
        for seq_record in parse(seqfile2, "fasta"):
            my_record.append(seq_record)
    finally:
        # Release the network handle even if parsing raises.
        seqfile2.close()

    write(my_record, "test.fasta", "fasta")
コード例 #3
0
def main():
    """Parse arguments, configure logging, and write processed records.

    Every record parsed from ``args.in_handle`` is passed through
    ``revcompl_recs`` and each result is written to ``args.out_handle`` in
    the ``args.fmt_outfile`` format.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    records_in = parse(opts.in_handle, opts.fmt_infile)
    for flipped in revcompl_recs(records_in):
        write(flipped, opts.out_handle, opts.fmt_outfile)
コード例 #4
0
ファイル: tree_order.py プロジェクト: bsmith89/seq-div
def main():
    """Write sequences to stdout in the leaf order of a tree.

    The Newick tree named by ``sys.argv[1]`` is rooted at its midpoint
    when it is not already rooted, ladderized with ``reverse=True``, and
    then each terminal's name is looked up in the FASTA index built from
    ``sys.argv[2]`` and the matching record is written as FASTA.
    """
    phylo = read(sys.argv[1], 'newick')
    seq_index = index(sys.argv[2], 'fasta')

    if not phylo.rooted:
        phylo.root_at_midpoint()
    phylo.ladderize(reverse=True)

    for terminal in phylo.get_terminals():
        record = seq_index[terminal.name]
        write(record, sys.stdout, 'fasta')
コード例 #5
0
ファイル: translate.py プロジェクト: bsmith89/compbio-scripts
def main():
    """Parse arguments, configure logging, and write processed records.

    SIGPIPE is reset to its default disposition first.  Records parsed
    from ``args.in_handle`` are passed to ``translate_recs`` together with
    ``code=args.code`` and each result is written to ``args.out_handle``.
    """
    signal(SIGPIPE, SIG_DFL)
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    records_in = parse(opts.in_handle, opts.fmt_infile)
    for protein_rec in translate_recs(records_in, code=opts.code):
        write(protein_rec, opts.out_handle, opts.fmt_outfile)
コード例 #6
0
def main():
    """Parse arguments, configure logging, and write processed records.

    Records parsed from ``args.in_handle`` and the mapping obtained from
    ``get_map(args.map_handle)`` are handed to ``rename_recs``; each
    resulting record is logged at debug level and written to
    ``args.out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    records_in = parse(opts.in_handle, opts.fmt_infile)
    name_map = get_map(opts.map_handle)
    for renamed in rename_recs(records_in, name_map):
        logger.debug("Writing {}".format(renamed.id))
        write(renamed, opts.out_handle, opts.fmt_outfile)
コード例 #7
0
ファイル: drop_seqs.py プロジェクト: bsmith89/seq-div
def main():
    """Parse arguments, configure logging, and write processed records.

    All input records are loaded into a list (which must be non-empty),
    passed to ``rm_recs`` along with ``get_list(args.list_handle)``, and
    each resulting record is written to ``args.out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    loaded = list(parse(opts.in_handle, opts.fmt_infile))
    # An empty input is treated as an error.
    assert len(loaded) > 0

    drop_names = get_list(opts.list_handle)
    for kept in rm_recs(loaded, drop_names):
        write(kept, opts.out_handle, opts.fmt_outfile)
コード例 #8
0
ファイル: codonalign.py プロジェクト: bsmith89/seq-div
def main():
    """Parse arguments, configure logging, and write processed records.

    Records from ``args.align_handle`` are indexed by id; that index and
    the records parsed from ``args.in_handle`` are handed to
    ``backalign_recs`` and each result is written to ``args.out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    # Later records overwrite earlier ones that share an id.
    align_index = {}
    for aligned in parse(opts.align_handle, opts.fmt_align):
        align_index[aligned.id] = aligned

    records_in = parse(opts.in_handle, opts.fmt_infile)
    for out_rec in backalign_recs(records_in, align_index):
        write(out_rec, opts.out_handle, opts.fmt_outfile)
コード例 #9
0
ファイル: remove_seqs.py プロジェクト: bsmith89/rrnum
def main():
    """Copy FASTA records from stdin to stdout, skipping listed names.

    ``sys.argv[1]`` names a text file with one record name per line
    (surrounding whitespace is stripped).  Records whose ``name`` appears
    in that file are omitted from the output.
    """
    with open(sys.argv[1]) as names_handle:
        remove_names = {line.strip() for line in names_handle}

    # All surviving records are collected before anything is written.
    kept = [rec for rec in parse(sys.stdin, 'fasta')
            if rec.name not in remove_names]

    write(kept, sys.stdout, 'fasta')
コード例 #10
0
ファイル: fetch_seqs.py プロジェクト: bsmith89/seq-div
def main():
    """Parse arguments, configure logging, and write selected records.

    With ``args.match_order`` set, records come from ``get_recs`` and are
    written one at a time; otherwise ``get_rec_list`` is used and its
    result is written in a single call.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    records_in = parse(opts.in_handle, opts.fmt_infile)
    wanted = get_list(opts.list_handle)

    if not opts.match_order:
        selected = get_rec_list(records_in, wanted)
        write(selected, opts.out_handle, opts.fmt_outfile)
        return

    for rec in get_recs(records_in, wanted):
        write(rec, opts.out_handle, opts.fmt_outfile)
コード例 #11
0
ファイル: fasta.py プロジェクト: benjschiller/seriesoftubes
def permute_fasta(f):
    '''
    Read the FASTA file at path ``f`` and write ``f + '_permuted.fa'``,
    in which the letters of each sequence have been shuffled in place
    (each record independently, so its letter composition is unchanged).
    '''
    # Resolve the helpers up front, before any file is created.
    to_mutable = Bio.Seq.Seq.tomutable
    permute = random.shuffle
    out_path = f + '_permuted.fa'
    with open(out_path, 'w') as output, open(f, 'rU') as fobj:
        for record in parse(fobj, 'fasta'):
            # shuffle() needs a mutable sequence to reorder in place.
            record.seq = to_mutable(record.seq)
            permute(record.seq)
            write(record, output, 'fasta')
コード例 #12
0
ファイル: drop_missized.py プロジェクト: bsmith89/seq-div
def main():
    """Write only the records whose length equals the most common length.

    Records are parsed from ``args.in_handle``.  The modal length is
    computed over all of them (ties resolve to the length seen first);
    records of that length are written to ``args.out_handle`` in input
    order, and every other record triggers a ``cli.DropSequenceWarning``.
    Nothing is written when the input contains no records.
    """
    args = parse_args(sys.argv)
    logging.basicConfig(level=args.log_level)
    logger.debug(args)

    # Keep records and lengths in parallel lists rather than using the
    # records as dictionary keys: SeqRecord objects are not reliably
    # hashable (they define __eq__ without __hash__ on Python 3).
    recs = list(parse(args.in_handle, args.fmt_infile))
    if not recs:
        # No records means no modal length; there is nothing to emit.
        return
    lengths = [len(rec) for rec in recs]

    mode = Counter(lengths).most_common()[0][0]
    for rec, length in zip(recs, lengths):
        if length == mode:
            write(rec, args.out_handle, args.fmt_outfile)
        else:
            warn(cli.DropSequenceWarning(
                "{} had length {}, not {}".format(rec.id, length, mode)))
コード例 #13
0
ファイル: qtrim_reads.py プロジェクト: bsmith89/seq-div
def main():
    """Parse arguments, configure logging, and write trimmed FASTQ reads.

    Each FASTQ record from ``args.in_handle`` is passed to
    ``quality_trim``.  Results shorter than ``args.min_length`` raise a
    ``cli.DropSequenceWarning`` and are not written; the rest go to
    ``args.out_handle``.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    for raw in parse(opts.in_handle, 'fastq'):
        logger.debug(raw)
        trimmed = quality_trim(raw, opts.quality_threshold,
                               keep_columns=opts.keep_columns)
        n_bases = len(trimmed.seq)
        if n_bases < opts.min_length:
            template = ("Length of sequence {} less than threshold. "
                        "{} < {}. Dropping.")
            message = template.format(trimmed.id, n_bases, opts.min_length)
            warn(message, cli.DropSequenceWarning)
        else:
            write(trimmed, opts.out_handle, opts.fmt_outfile)
コード例 #14
0
def main():
    """Parse arguments, configure logging, and write selected records.

    The stripped lines of ``args.list_handle`` form the selection list.
    Depending on the flags, records are passed through ``exclude_iter``,
    ``fetch_iter``, or ``fetch_iter`` followed by ``order_iter``, and the
    result is written to standard output.

    Raises:
        ValueError: if both ``args.match_order`` and ``args.excluding``
            are set.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    if opts.match_order and opts.excluding:
        raise ValueError("--match-order and --excluding cannot both be set.")

    to_fetch = [line.strip() for line in opts.list_handle]
    fetch_set = set(to_fetch)

    rec_iter = parse(opts.in_handle, opts.fmt_infile)

    if opts.excluding:
        out_iter = exclude_iter(rec_iter, fetch_set)
    else:
        out_iter = fetch_iter(rec_iter, fetch_set)
        if opts.match_order:
            # The ordered list (not the set) drives the output order.
            out_iter = order_iter(out_iter, to_fetch)

    write(out_iter, sys.stdout, opts.fmt_outfile)
コード例 #15
0
ファイル: find_amplicon.py プロジェクト: bsmith89/seq-div
def main():
    """Parse arguments, configure logging, and write one amplicon per record.

    The hit table from ``args.table_handle`` gains a ``mis_sum`` column
    (``mis_start + mis_stop``) and is optionally filtered by
    ``args.max_mismatch`` and ``args.primer_set``.  ``get_amplicon`` is
    then called for every input record; when it reports no hit and
    ``args.drop`` is set, a ``cli.DropSequenceWarning`` is issued instead
    of writing the record.
    """
    opts = parse_args(sys.argv)
    logging.basicConfig(level=opts.log_level)
    logger.debug(opts)

    hits = read_table(opts.table_handle)
    hits['mis_sum'] = hits.mis_start + hits.mis_stop
    # NOTE(review): a max_mismatch of 0 is falsy, so it disables this
    # filter rather than requiring perfect matches -- confirm intent.
    if opts.max_mismatch:
        hits = hits[hits.mis_sum <= opts.max_mismatch]
    if opts.primer_set:
        hits = hits[hits.primer_set == opts.primer_set]

    for rec in parse(opts.in_handle, opts.fmt_infile):
        amplicon, hit_info = get_amplicon(rec, hits,
                                          trim_primers=opts.trim_primers)
        logger.debug(hit_info)
        if hit_info is None and opts.drop:
            warn(cli.DropSequenceWarning("No hit found for {rec.id}".format(rec=rec)))
            continue
        write(amplicon, opts.out_handle, opts.fmt_outfile)
コード例 #16
0
ファイル: make_fastq.py プロジェクト: bsmith89/seq-div
def main():
    """Combine a FASTA record and a QUAL record into FASTQ on stdout.

    The single record read from ``sys.argv[1]`` (FASTA) takes over the
    ``letter_annotations`` of the single record read from ``sys.argv[2]``
    (QUAL) and is then written in FASTQ format.
    """
    fasta_rec = read(sys.argv[1], 'fasta')
    qual_rec = read(sys.argv[2], 'qual')
    fasta_rec.letter_annotations = qual_rec.letter_annotations
    write(fasta_rec, sys.stdout, 'fastq')
コード例 #17
0
ファイル: fetch_frags.py プロジェクト: bsmith89/deep-pipeline
    hits.columns = ['orf', 'model', 'e_value', 'score', 'bias']
    hits = hits[hits.e_value < opts.e_value_cutoff]
    if len(hits) == 0:
        sys.stdout.write("")
        sys.exit()
    hits['read'], hits['start'], hits['stop'] = \
            np.array(hits.orf.str.\
                match('^(.*)\(([0-9]*)-([0-9]*)\)$').values.tolist()).T
    read_set = set(hits.read)
    out_recs = []
    for rec in parse(args[1], opts.format):
        if rec.name in read_set:
            read_hits = hits[hits.read == rec.name]
            read_set.remove(rec.name)
            orf_seq = Seq('')
            if len(read_hits) > 1:
                # If multiple orfs in one read then it's probably a frame
                # shift error or pseudo gene
                continue
            for index, hit in read_hits.sort('start').iterrows():
                # This for-loop is completely unnecessary thanks to the if
                # statement above.
                start = int(hit['start'])
                stop = int(hit['stop'])
                if start < stop:
                    orf_seq += rec.seq[(start - 1):stop]
                elif start > stop:
                    orf_seq = rec.seq[(stop - 1):start].reverse_complement() + orf_seq
                orf_rec = SeqRecord(id=hit['orf'], seq=orf_seq, description='')
                write(orf_rec, sys.stdout, 'fasta')
コード例 #18
0
ファイル: fasta.py プロジェクト: benjschiller/seriesoftubes
def writer(foo, iterable):
    '''
    writes SeqRecord objects from iterable to FASTA file foo.
    Warning: overwrites foo, does not append

    The output file is closed (and therefore flushed) before returning,
    including when writing raises.
    '''
    with open(foo, 'w') as handle:
        write(iterable, handle, 'fasta')
コード例 #19
0
ファイル: seq.py プロジェクト: bsmithers/hpf
 def __enter__(self):
     """Write ``self.records`` to a new temporary file and return its path.

     The temporary file is created with ``self.kwargs`` and kept on
     ``self.temp_file``; the records are written in ``self.format`` by
     reopening the file by name.
     """
     tmp = NamedTemporaryFile(**self.kwargs)
     self.temp_file = tmp
     with open(tmp.name, "w") as sink:
         write(self.records, sink, self.format)
     return tmp.name
コード例 #20
0
# Copy each assembly sequence that carries a good domain into its own
# FASTA file under assemblies/pseudoChromos/, one file per ORF id.
cas9Assemblies = listdir(assemblyDir)

# NOTE(review): `load` is presumably pickle.load; unpickling can execute
# arbitrary code, so these files must come from a trusted source.
# Each pickle is opened in a `with` block so its handle is closed promptly.
with open("pickles/%s_GoodDomainIDS.p" % (gene), "rb") as fh:
    goodDomIDS = load(fh)
with open("pickles/%s_GoodDomMap.p" % (gene), "rb") as fh:
    goodDomMap = load(fh)
with open("pickles/%s_HMM_Parsing_Results.p" % (gene), "rb") as fh:
    hmm_parser = load(fh)

print("All loaded")
#Copy unique nucleotide sequence
from Bio.SeqIO import index
nukSeqHash, protSeqHash = set(), set()
alreadyGotIt, count = 0, 0
for assembly in cas9Assemblies:
    # Drop the last six characters of the file name to get the base id.
    baseID = assembly[:-6]
    allAssemblySeqs = index(assemblyDir + assembly, "fasta")
    # Only records present in both the good-domain set and this assembly.
    overlap = goodDomIDS.intersection(allAssemblySeqs.keys())
    for recID in overlap:
        seq = str(allAssemblySeqs[recID].seq).upper()
        # Skip a sequence seen before only when it maps to a single ORF.
        if seq in nukSeqHash and len(goodDomMap[recID]) == 1:
            alreadyGotIt += 1
            continue
        nukSeqHash.add(seq)
        #There may be more than 1 protein on the pseudochromosome, save both as separate files
        if len(goodDomMap[recID]) > 1:
            print("%i Cas9s on %s %s" %
                  (len(goodDomMap[recID]), recID, baseID))
        for orfID in goodDomMap[recID]:
            with open("assemblies/pseudoChromos/%s.fasta" % (orfID),
                      "w") as fh:
                write(allAssemblySeqs[recID], fh, "fasta")
                count += 1
        # Progress indicator, checked once per record rather than per file.
        if count % 1000 == 0: print(count, end=" ")
コード例 #21
0
from sklearn.metrics import euclidean_distances
from random import randint
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from Bio.SeqIO import parse, write
from Bio.SeqRecord import SeqRecord
# Scan every GenBank file matched by the glob.  For each feature with a
# 'product' qualifier, append a tab-separated line (record id, start, end,
# product) to the descriptions file, and write the upper-cased record to
# the contigs FASTA file the first time its id is seen.
annotations = glob("mags/final.contigs.*/*.gbk")
seqs = set()
hypoCounter = 0
# Both outputs are managed by `with`, so they are flushed and closed even
# if parsing fails part-way through.
with open("assemblies/AnnotatedContigs.fa", "w") as annotatedRegions, \
        open("annotations/AnnotationDescripts.bed", "w") as geneDescripts:
    for fname in annotations:
        print(fname)
        for rec in parse(fname, 'genbank'):
            for feature in rec.features:
                try:
                    product = feature.qualifiers['product'][0]
                except (KeyError, IndexError):
                    # Features without a product annotation are skipped.
                    continue
                hypoCounter += int(product == "hypothetical protein")
                try:
                    geneDescripts.write("%s\t%i\t%i\t%s\n" %
                                        (rec.id, feature.location.start,
                                         feature.location.end, product))
                    if rec.id in seqs: continue
                    rec.seq = rec.seq.upper()
                    write(rec, annotatedRegions, "fasta")
                    seqs.add(rec.id)
                except Exception as exc:
                    # Stay best-effort, but report the failure instead of
                    # hiding it (and let KeyboardInterrupt propagate).
                    warnings.warn("Skipping feature on %s: %s" % (rec.id, exc),
                                  RuntimeWarning)
# hypoCounter is an integer count, so it is printed directly.
print(len(seqs), "Seqs with hypo prots:", hypoCounter)
コード例 #22
0
ファイル: translate.py プロジェクト: bsmith89/deep-pipeline
    parser = optparse.OptionParser(usage=usage)
    parser.add_option("-f", "--informat", dest="informat",
                      default=_DEFAULT_FORMAT,
                      help=("the format of the input file. "
                            "DEFAULT: {}").format(_DEFAULT_FORMAT))
    parser.add_option("-F", "--outformat", dest="outformat",
                      default=_DEFAULT_FORMAT,
                      help=("the format of the output. "
                            "DEFAULT: {}").format(_DEFAULT_FORMAT))
    parser.add_option("-u", "--ungap", dest="ungap", action="store_true",
                      default=_DEFAULT_UNGAP,
                      help=("should the input sequences have gap "
                            "characters ('-' and '.') removed before "
                            "translation? DEFAULT: {}").\
                           format(_DEFAULT_UNGAP))
    opts, args = parser.parse_args()

    if len(args) == 0:
        rec_iter = parse(sys.stdin, opts.informat)
    elif len(args) == 1:
        rec_iter = parse(args[0], opts.informat)
    else:
        parser.error("Too many arguments.")

    for rec in rec_iter:
        seq = rec.seq
        if opts.ungap:
            seq = seq.ungap('-').ungap('.')
        rec.seq = Seq(transl(seq))
        write(rec, sys.stdout, opts.outformat)
コード例 #23
0
                DESCS[name] = re.search(r": (.*) \[",
                                        gb_archive.description).group(1)
            rbs.type = "RBS"
            rbs.qualifiers["note"] = [DESCS[name], "color: #f58a5e"]

        # sort features by start location, source always first
        gb_archive.features.sort(key=lambda f: (-len(gb.seq)) *
                                 (f.type == "source") + f.location.start)

        # translate color from notes to ApEinfo
        for feature in gb_archive.features:
            translate_color(feature)

        # Fix the direct submission reference
        if (gb_archive.annotations["references"][-1].title ==
                "Direct Submission"):
            ref = gb_archive.annotations["references"][-1]
        else:
            ref = Reference()
            ref.title = "Direct Submission"
            gb_archive.annotations.append(ref)
        ref.authors = "Larralde M"
        ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo"

        # write the final record
        dst_dir = os.path.abspath(
            os.path.join(__file__, "..", "..", "moclo-cidar", "registry",
                         "cidar"))
        dst_file = os.path.join(dst_dir, "{}.gb").format(info["id"])
        write(gb_archive, dst_file, "gb")
コード例 #24
0
    def index_and_save( self, out_handle, format='fasta'):
        """Writes input sequences to out_handle in the specified format.
        Updates self.index simultaneously, to match the new file co-ordinates.

        Each record parsed from ``self.in_handle`` is written to
        ``out_handle``; the handle position immediately before the write is
        stored in ``self.indexes`` under the record's id, and the index is
        saved once all records have been written.

        Raises:
            IOError: if ``out_handle.tell()`` fails, i.e. the handle cannot
                report its position.
        """
        try:
            pos = out_handle.tell()
        except IOError as e:
            # `as` is valid on Python 2.6+ and Python 3; the older comma
            # form is a syntax error on Python 3.
            # not seekable. out_handle has no context of position.
            msg = "{0}\n    {1} has no knowledge of position."\
                            .format(e, out_handle)
            raise IOError(msg)

        from Bio.SeqIO import write
        idx = self.indexes
        for seq_record in self.parse(self.in_handle):
            write( seq_record, out_handle, format )
            # Record where this entry started, then advance to the position
            # at which the next entry will begin.
            idx[seq_record.id] = pos
            pos = out_handle.tell()
        idx.save()


    @classmethod
    def parse(cls, handle):
        """An iterator function that yield's SeqRecord objects from a readable
        file-like object full of fasta-formatted sequences.

        This serves the same purpose as BioPython's SeqIO.FastaIO.FastaIterator 
        function. It doesn't actually do any indexing, as is intended to be
        used when indexing an output file, as in `self.index_and_save(...)`
        """
        sequence = ''
コード例 #25
0
            translate_color(feature)

        # merge annotations
        annotations = copy.deepcopy(gba.annotations)
        annotations.update(gbd.annotations)

        # Fix the direct submission annotation
        ref = annotations["references"][-1]
        ref.authors = "Larralde M"
        ref.journal = "Distributed with the MoClo Python library\nhttps://github.com/althonos/moclo"

        # Add the YTK type to the record comments
        annotations["comment"] = ["YTK:{}".format(type_)]

        # create the final record
        final = CircularRecord(
            seq=gba.seq,
            id=info["id"],
            name=info["id"],
            description=info["name"],
            dbxrefs=gba.dbxrefs + gbd.dbxrefs,
            features=features,
            annotations=annotations,
        )

        # write the final record
        dst_dir = os.path.abspath(
            os.path.join(__file__, "..", "..", "moclo-ytk", "registry", "ptk"))
        dst_file = os.path.join(dst_dir, "{}.gb").format(info["id"])
        write(final, dst_file, "gb")
コード例 #26
0
ファイル: infer_frame.py プロジェクト: bsmith89/seq-div
def main():
    """Rewrite each FASTA record's sequence via ``inframe`` and print it.

    Records are read from the file named by ``sys.argv[1]``; each record's
    sequence is replaced by the return value of ``inframe`` before the
    record is written to standard output as FASTA.
    """
    source_path = sys.argv[1]
    for record in parse(source_path, 'fasta'):
        record.seq = inframe(record.seq)
        write(record, sys.stdout, 'fasta')