Example #1
File: ks.py Project: rrane/jcvi
def muscle_align_protein(recs, work_dir, outfmt="fasta", inputorder=True):
    """
    Align given proteins with muscle.
    recs are iterable of Biopython SeqIO objects
    """
    fasta_file = op.join(work_dir, "prot-start.fasta")
    align_file = op.join(work_dir, "prot.aln")
    SeqIO.write(recs, open(fasta_file, "w"), "fasta")

    muscle_cl = MuscleCommandline(
        cmd=MUSCLE_BIN("muscle"), input=fasta_file, out=align_file, seqtype="protein", clwstrict=True
    )
    stdout, stderr = muscle_cl()
    alignment = AlignIO.read(muscle_cl.out, "clustal")

    if inputorder:
        try:
            muscle_inputorder(muscle_cl.input, muscle_cl.out)
        except ValueError:
            return ""
        alignment = AlignIO.read(muscle_cl.out, "fasta")

    print("\tDoing muscle alignment: %s" % muscle_cl, file=sys.stderr)
    if outfmt == "fasta":
        return alignment.format("fasta")
    if outfmt == "clustal":
        return alignment.format("clustal")
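A minimal usage sketch (assuming the function above is importable, the muscle binary is installed, and the input path and work directory are illustrative and already exist):

from Bio import SeqIO

# Illustrative only: "proteins.fasta" and "/tmp/muscle_work" are made-up paths.
recs = list(SeqIO.parse("proteins.fasta", "fasta"))
alignment_text = muscle_align_protein(recs, "/tmp/muscle_work", outfmt="clustal")
print(alignment_text)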
Example #2
 def simple_check(self, filename, format, alphabet):
     id_list = [rec.id for rec in SeqIO.parse(open(filename), format, alphabet)]
     rec_dict = SeqIO.indexed_dict(filename, format, alphabet)
     self.assertEqual(set(id_list), set(rec_dict.keys()))
     # This is redundant, I just want to make sure len works:
     self.assertEqual(len(id_list), len(rec_dict))
     # Make sure boolean evaluation works
     self.assertEqual(bool(id_list), bool(rec_dict))
     for key in id_list:
         self.assert_(key in rec_dict)
         self.assertEqual(key, rec_dict[key].id)
         self.assertEqual(key, rec_dict.get(key).id)
     # Check non-existent keys.
     try:
         rec = rec_dict[chr(0)]
         raise ValueError("Accessing a non-existent key should fail")
     except KeyError:
         pass
     self.assertEqual(rec_dict.get(chr(0)), None)
     self.assertEqual(rec_dict.get(chr(0), chr(1)), chr(1))
     # Now check iteritems...
     for key, rec in rec_dict.iteritems():
         self.assert_(key in id_list)
         self.assert_(isinstance(rec, SeqRecord))
         self.assertEqual(rec.id, key)
     # Now check non-defined methods...
     self.assertRaises(NotImplementedError, rec_dict.values)
     self.assertRaises(NotImplementedError, rec_dict.popitem)
     self.assertRaises(NotImplementedError, rec_dict.pop, chr(0))
     self.assertRaises(NotImplementedError, rec_dict.pop, chr(0), chr(1))
     self.assertRaises(NotImplementedError, rec_dict.clear)
     self.assertRaises(NotImplementedError, rec_dict.__setitem__, "X", None)
     self.assertRaises(NotImplementedError, rec_dict.copy)
     self.assertRaises(NotImplementedError, rec_dict.fromkeys, [])
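A read-only, dict-like view like the one tested here is provided in current Biopython by SeqIO.index(); a minimal sketch (the file name is illustrative):

from Bio import SeqIO

# "Fasta/f002" is an illustrative path to a FASTA file.
rec_dict = SeqIO.index("Fasta/f002", "fasta")
first_key = next(iter(rec_dict))
print(len(rec_dict), first_key, rec_dict[first_key].id)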
Example #3
def load_examples_from_fasta(signal, org, data_path):
    """
    load examples from fasta file

    signal 
    """

    fn_pos = "%s/%s_sig_%s_example.fa" % (data_path, signal, "pos")
    fn_neg = "%s/%s_sig_%s_example.fa" % (data_path, signal, "neg")
    print "loading: \n %s \n %s" % (fn_pos, fn_neg)

    # parse file
    xt_pos = [str(rec.seq) for rec in SeqIO.parse(fn_pos, "fasta")]
    xt_neg = [str(rec.seq) for rec in SeqIO.parse(fn_neg, "fasta")]

    labels = [+1] * len(xt_pos) + [-1] * len(xt_neg)
    examples = xt_pos + xt_neg

    print (
        "organism: %s, signal %s,\t num_labels: %i,\t num_examples %i,\t num_positives: %i,\t num_negatives: %i"
        % (org, signal, len(labels), len(examples), len(xt_pos), len(xt_neg))
    )

    examples_shuffled, labels_shuffled = helper.coshuffle(examples, labels)
    ret = {"examples": numpy.array(examples_shuffled), "labels": numpy.array(labels_shuffled)}

    return ret
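An illustrative call, assuming the module above (with its numpy and helper imports) is importable; the signal, organism and data path are made up and the files must follow the naming scheme used in the function:

# Hypothetical arguments; data/splice must contain acc_sig_pos_example.fa and acc_sig_neg_example.fa.
data = load_examples_from_fasta("acc", "C_elegans", "data/splice")
print(data["examples"].shape, data["labels"][:5])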
Example #4
def fasta2karyotype(name, color, in_fname):
    genome = []

    for seq_rec in SeqIO.parse(in_fname, "fasta"):
        genome.append(str(seq_rec.seq))

    genome_size = sum([len(x) for x in genome])

    # write karyotype
    new_file = [" ".join(["chr -", name, name, "0", str(genome_size), color])]

    running_total = 0

    for contig_num, contig in enumerate(genome):
        contig_name = "contig%s" % (contig_num + 1,)

        new_file.append(
            " ".join(
                ["band", name, contig_name, contig_name, str(running_total), str(running_total + len(contig)), color]
            )
        )

        running_total += len(contig)

    with open(os.path.join(options.output_dir, "karyotype." + name + ".txt"), "w") as out_fp:
        out_fp.write("\n".join(new_file))

    # write FASTA
    seq_rec = SeqRecord(id=name, name=name, description="", seq=Seq("".join(genome)))
    SeqIO.write(seq_rec, os.path.join(options.output_dir, name + ".fasta"), "fasta")
Example #5
def main(args=sys.argv[1:]):
    """
    Parse a generic template and insert sequences from a FASTA file into the middle,
    separated by the appropriate XML tags.
    """

    parser = build_parser()
    a = parser.parse_args()
    patients = patient_dd()

    for datafile in a.datafiles:
        processFasta(datafile, patients)

    samples = [sample for p in patients.values() for sample in p.values()]
    dates = [s["date"] for s in samples]

    for pid, patient in patients.items():
        for sid, sample in patient.items():
            for gene, region in sample["regions"].items():
                if not a.dryrun:
                    print("{}  {} {}  {}".format(pid, gene, sid, sample["timepoint"]))
                filename = "{}_{}_{}.fa".format(pid, gene, sample["timepoint"])
                filename = os.path.join(a.outdir, filename)
                if a.dryrun:
                    print(filename)
                else:
                    with open(filename, "w") as fh:
                        SeqIO.write(region, fh, "fasta")
Example #6
def count_lengths(fasta, fastq):
    # setup
    if fasta is None and fastq is None:
        raise Exception("no input specified.")
    if fastq is None:
        fd = open(fasta, "r")
        seq = SeqIO.parse(fd, "fasta")
    else:
        fd = open(fastq, "r")
        seq = SeqIO.parse(fd, "fastq")

    total = 0.0
    lengths = {21: 0, 22: 0, 23: 0, 24: 0}
    for record in seq:
        try:
            lengths[len(record)] += 1
        except KeyError:
            pass
        finally:
            total += 1
            # potentially note
    for i in lengths:
        lengths[i] = lengths[i] / total
    fd.close()
    return lengths
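A short usage sketch (the FASTQ path is illustrative); it prints the fraction of reads of each length from 21 to 24 nt:

# "small_rna.fastq" is a made-up input file.
props = count_lengths(None, "small_rna.fastq")
for size in sorted(props):
    print(size, round(props[size], 3))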
Example #7
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    qual_truncate = truncation_expected(out_format)
    # We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename, in_format, alphabet))
        handle = StringIO()
        if qual_truncate:
            warnings.simplefilter("ignore", UserWarning)
        SeqIO.write(records, handle, out_format)
        if qual_truncate:
            warnings.filters.pop()
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    # Now do the conversion...
    try:
        handle2 = StringIO()
        if qual_truncate:
            warnings.simplefilter("ignore", UserWarning)
        SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        if qual_truncate:
            warnings.filters.pop()
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), "Different failures, parse/write:\n%s\nconvert:\n%s" % (err1, err2)
Example #8
    def __init__(self, file_name, padding, trim_flanking):

        """
        This class is created by instantiating with a file name and a padding value.
        These are used to locate the appropriate target file, and to select the
        amount of flanking sequence to be appended to exons.
        """
        """
        :param file_name: the location/identity of the target input file
        :param padding: the required amount of intronic padding
        """
        self.trim_flanking = trim_flanking
        self.exons = []
        self.cds = []
        self.mrna = []
        self.file_name = file_name
        try:
            self.transcriptdict = dict(
                transcripts={},
                input=SeqIO.to_dict(SeqIO.parse(file_name, "genbank")),
                pad=int(padding),
                pad_offset=int(padding) % 5,
            )
            self.transcriptdict["refseqname"] = self.transcriptdict["input"].keys()[0]
            self.is_matt_awesome = True
        except IOError as FileNotPresent:
            print "The specified file cannot be located: " + FileNotPresent.filename
            exit()

        assert self.transcriptdict["pad"] <= 2000, "Padding too large, please use a value below 2000 bases"
Example #9
def extractSnpOrfs(config, organism):
    """Extract ORFs out of SNP-fastas"""
    orf_handle = open(config["OUTPUT"]["folder"] + "orfs/" + organism["prefix"] + "-orfs.tsv")
    snp_handle = open(config["OUTPUT"]["folder"] + "snps/" + organism["prefix"] + "_snps.fasta")
    snp_contigs = SeqIO.to_dict(SeqIO.parse(snp_handle, "fasta"))
    snp_handle.close()
    orfs = orf_handle.readlines()
    orf_handle.close()
    sequences = {}
    for line in orfs:
        if line[0] != "#":
            line_array = line.split("\t")
            start = int(line_array[2])
            stop = int(line_array[3])
            print(line_array[0])
            print("start: " + str(start))
            print("stop: " + str(stop))
            if start > stop:
                sequences[line_array[0]] = snp_contigs[line_array[0]].reverse_complement()[stop : start + 1]
            else:
                sequences[line_array[0]] = snp_contigs[line_array[0]][start : stop + 1]
    snp_orf_handle = open(config["OUTPUT"]["folder"] + "orfs/" + organism["prefix"] + "-nt-snps.fasta", "w")
    out_sequences = []
    for key, value in sequences.items():
        out_sequences.append(SeqRecord(Seq(str(value.seq)), id=key, description=""))
    SeqIO.write(out_sequences, snp_orf_handle, "fasta")
    snp_orf_handle.close()
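The slicing and reverse_complement() calls above act on Biopython SeqRecord objects; a small sketch of that behaviour with toy data:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

contig = SeqRecord(Seq("ATGAAATAG"), id="contig1")
print(str(contig[0:3].seq))                        # forward slice -> "ATG"
print(str(contig.reverse_complement()[0:3].seq))   # reverse-complement, then slice -> "CTA"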
Example #10
def check_match(input, reference, output):
    in_record = open(input)
    ref_record = open(reference)
    fasta_list = []

    for in_record in SeqIO.parse(input, "fasta"):  # parse input and reference seqs
        # translate to peptide seq
        orf = trans(str(in_record.seq))
        written = 0

        for aa_seq in orf:
            if (written == 0) and (len(aa_seq) >= 140):  # shortest length of a ref seq
                for ref_record in SeqIO.parse(reference, "fasta"):
                    # pairwise alignment of input seq and each ref until a match found
                    # 1 point for match, -1 for mismatch, -0.5 for gap, -0.1 for gap extension.
                    # Can alter scoring for looser alignments
                    align = pairwise2.align.localms(aa_seq, ref_record.seq, 1, -1, -0.5, -0.1, score_only=True)

                    # scores equal to ref length (100% alignment)
                    if align == len(ref_record.seq):
                        fasta_list.append(">%s\n%s\n" % (in_record.description + " len:" + str(len(aa_seq)), aa_seq))
                        written = 1
                        break

    # write query descriptions and seqs that match ref
    with open(output + ".fna", "a") as file:
        file.write("\n".join(fasta_list))
Example #11
def fetch(queries, email, db="protein", rettype="fasta", retmode="text"):
    for qid in queries:
        handle = Entrez.efetch(db=db, rettype=rettype, retmode=retmode, id=qid)
        seq_record = SeqIO.read(handle, rettype)
        SeqIO.write(seq_record, sys.stdout, "fasta")
        print("%s..." % seq_record.description[:45], file=sys.stderr)
        handle.close()
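A hedged usage sketch; NCBI requires an email address, and since the function's email argument is not used in its body, Entrez.email is set directly. The accession number is illustrative:

from Bio import Entrez

Entrez.email = "you@example.org"   # required by NCBI; illustrative address
fetch(["NP_000509.1"], Entrez.email, db="protein")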
Example #12
def _do_water_alignment(seq1, seq2, out_fhand, gap_open=10.0, gap_extend=0.5, out_fmt="markx10", reverse2=False):
    seq1_fhand = NamedTemporaryFile()
    seq2_fhand = NamedTemporaryFile()

    SeqIO.write(seq1, seq1_fhand, "fasta")
    SeqIO.write(seq2, seq2_fhand, "fasta")
    seq1_fhand.flush()
    seq2_fhand.flush()
    cmd = [
        "water",
        "-asequence",
        seq1_fhand.name,
        "-bsequence",
        seq2_fhand.name,
        "-outfile",
        out_fhand.name,
        "-gapopen",
        str(gap_open),
        "-gapextend",
        str(gap_extend),
        "-aformat3",
        out_fmt,
    ]
    if reverse2:
        cmd.append("-sreverse2")
    stdout = open(os.devnull, "w")
    stderr = open(os.devnull, "w")
    subprocess.check_call(cmd, stdout=stdout, stderr=stderr)
Example #13
def splitFastaFile(infile, informat, outdir):
    for record in SeqIO.parse(open(infile), informat):
        iid = record.id
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        f_out = os.path.join(outdir, iid + ".fasta")
        SeqIO.write([record], open(f_out, "w"), "fasta")
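An illustrative call (paths are made up): every record in reads.fasta ends up in its own <record id>.fasta file under out_dir/.

splitFastaFile("reads.fasta", "fasta", "out_dir")   # hypothetical input file and output directory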
Example #14
 def write_fasta(self, outfile):
     """write fasta file"""
     fasta_out = open(outfile + ".fasta", "w")
     for i, is_set in enumerate(self.annotations):
         SeqIO.write(is_set.fasta(str(i + 1)), fasta_out, "fasta")
         fasta_out.write("\n")
     fasta_out.close()
Example #15
 def test_structured_comment_parsing(self):
     # GISAID_EpiFlu(TM)Data, HM138502.gbk has both 'comment' and 'structured_comment'
     record = SeqIO.read(path.join("GenBank", "HM138502.gbk"), "genbank")
     self.assertEqual(
         record.annotations["comment"],
         "Swine influenza A (H1N1) virus isolated during human swine flu\noutbreak of 2009.",
     )
     self.assertEqual(record.annotations["structured_comment"]["GISAID_EpiFlu(TM)Data"]["Lineage"], "swl")
     self.assertEqual(len(record.annotations["structured_comment"]["GISAID_EpiFlu(TM)Data"]), 3)
     # FluData structured comment
     record = SeqIO.read(path.join("GenBank", "EU851978.gbk"), "genbank")
     self.assertEqual(record.annotations["structured_comment"]["FluData"]["LabID"], "2008704957")
     self.assertEqual(len(record.annotations["structured_comment"]["FluData"]), 5)
     # Assembly-Data structured comment
     record = SeqIO.read(path.join("GenBank", "KF527485.gbk"), "genbank")
     self.assertEqual(
         record.annotations["structured_comment"]["Assembly-Data"]["Assembly Method"], "Lasergene v. 10"
     )
     self.assertEqual(len(record.annotations["structured_comment"]["Assembly-Data"]), 2)
     # No structured comment in NC_000932.gb, just a regular comment
     record = SeqIO.read(path.join("GenBank", "NC_000932.gb"), "genbank")
     self.assertFalse("structured_comment" in record.annotations)
     self.assertEqual(
         record.annotations["comment"],
         "REVIEWED REFSEQ: This record has been curated by NCBI staff. The\n"
         "reference sequence was derived from AP000423.\n"
         "COMPLETENESS: full length.",
     )
Example #16
def load_multi_database(gb_filename_or_handle, gb_filename_or_handle2):
    """Load two GenBank files into a new BioSQL database as different subdatabases.

    This is useful for running tests against a newly created database.
    """

    create_database()
    # now open a connection to load the database
    db_name = "biosql-test"
    db_name2 = "biosql-test2"
    server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB)
    db = server.new_database(db_name)

    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle, "gb")
    count = db.load(iterator)

    db = server.new_database(db_name2)

    # get the GenBank file we are going to put into it
    iterator = SeqIO.parse(gb_filename_or_handle2, "gb")
    # finally put it in the database
    count2 = db.load(iterator)
    server.commit()

    server.close()
    return count + count2
Example #17
def align(query_fasta, subject_fasta, ignore_ambiguous):
    global ignore_ambig
    ignore_ambig = ignore_ambiguous
    if ignore_ambig:
        sys.stderr.write("Ignoring ambiguous bases\n")

    refp = SeqIO.parse(subject_fasta, "fasta")
    ref = next(refp)

    fhq = SeqIO.parse(query_fasta, "fasta")

    total_seq = count_seqs(query_fasta)

    tcount = 1
    for seq in fhq:
        mutations = []
        sys.stderr.write("Gathering mutations for sequence %s of %s\n" % (tcount, total_seq))

        # mutations, r1 = run_blast( seq, subject_fasta )
        r1 = -1
        if r1 == -1:
            sys.stderr.write("%s failed to blast falling back to tcoffee. Blast output:\n%s" % (seq.id, r1))
            mutations, r2 = tcoffee_align(ref, seq)
            if r2 == -1:
                sys.stderr.write("%s failed to align with tcoffee as well. Tcoffee output:\n%s" % (seq.id, r2))
        print "%s: Total mutations: %s" % (seq.description, len(mutations))
        for m in mutations:
            print m
        tcount += 1
Example #18
 def loop(self, filename, format):
     original_records = list(SeqIO.parse(filename, format))
     # now open a connection to load the database
     server = BioSeqDatabase.open_database(driver=DBDRIVER, user=DBUSER, passwd=DBPASSWD, host=DBHOST, db=TESTDB)
     db_name = "test_loop_%s" % filename  # new namespace!
     db = server.new_database(db_name)
     count = db.load(original_records)
     self.assertEqual(count, len(original_records))
     server.commit()
     # Now read them back...
     biosql_records = [db.lookup(name=rec.name) for rec in original_records]
     # And check they agree
     self.assertTrue(compare_records(original_records, biosql_records))
     # Now write to a handle...
     handle = StringIO()
     SeqIO.write(biosql_records, handle, "gb")
     # Now read them back...
     handle.seek(0)
     new_records = list(SeqIO.parse(handle, "gb"))
     # And check they still agree
     self.assertEqual(len(new_records), len(original_records))
     for old, new in zip(original_records, new_records):
         # TODO - remove this hack because we don't write these (yet):
         for key in ["comment", "references", "db_source"]:
             if key in old.annotations and key not in new.annotations:
                 del old.annotations[key]
         self.assertTrue(compare_record(old, new))
     # Done
     handle.close()
     server.close()
Example #19
def fragment_FASTA_files(infiles, outdirname, fragsize):
    """Chops sequences of the passed files into fragments, returns filenames.

    - infiles - paths to each input sequence file
    - outdirname - path to output directory
    - fragsize - the size of sequence fragments

    Takes every sequence from every file in infiles, and splits them into
    consecutive fragments of length fragsize (with any trailing sequence
    included, even if shorter than fragsize), writing the resulting set of
    sequences to a correspondingly named "-fragments" file in the output
    directory. All fragments are named consecutively and uniquely (within a
    file) as fragNNNNN. Sequence description fields are retained.
    """
    outfnames = []
    for fname in infiles:
        outstem, outext = os.path.splitext(os.path.split(fname)[-1])
        outfname = os.path.join(outdirname, outstem) + "-fragments" + outext
        outseqs = []
        count = 0
        for seq in SeqIO.parse(fname, "fasta"):
            idx = 0
            while idx < len(seq):
                count += 1
                newseq = seq[idx : idx + fragsize]
                newseq.id = "frag%05d" % count
                outseqs.append(newseq)
                idx += fragsize
        outfnames.append(outfname)
        SeqIO.write(outseqs, outfname, "fasta")
    return outfnames, get_fraglength_dict(outfnames)
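The inner loop relies on the fact that slicing a SeqRecord returns a new SeqRecord for that window, keeping the description so only the fragment id needs to be reassigned; a small sketch with toy data:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec = SeqRecord(Seq("ACGTACGTACGT"), id="contig1", description="toy contig")
frag = rec[0:5]                 # new SeqRecord covering the first five bases
frag.id = "frag00001"
print(frag.id, frag.description, str(frag.seq))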
Example #20
    def saveFastaOfORFsInStrain(self, orfIdArray, strain, fn):
        """save a fasta file (filename fn) with the sequences for the orfs in the array OrfIdArray in strain"""

        strOrfs = SpringDb.convertNumPyArrayToSqlArray(orfIdArray)
        cursor = self.getCursor()
        cursor.execute(
            "select locus_name from orth_orf where orth_orf_id in " + strOrfs + " and genome_id=" + str(strain)
        )
        locusNameArray = cursor.fetchall()
        locusNameArray = np.array(locusNameArray)
        locusNameArrayFormatted = SpringDb.convertNumPyArrayToSqlStrArray(locusNameArray.reshape(locusNameArray.size))
        # for locusName in a:
        cursor.execute(
            "select seq, locus_name, gene_name, product_name from orf where locus_name in " + locusNameArrayFormatted
        )
        f = open(fn, "w")
        i = 0
        for locus in cursor:
            f.write(">" + locus[1] + "; " + locus[2] + "; " + locus[3] + "\n")
            f.write(locus[0] + "\n")
            i += 1
        f.close()
        # read and re-write to reformat nicely using SeqIO.write
        s = SeqIO.parse(fn, "fasta")
        SeqIO.write(s, fn + "_tmp", "fasta")
        shutil.move(fn + "_tmp", fn)
Example #21
    def standard_test_procedure(self, cline):
        """Standard testing procedure used by all tests."""
        self.assertTrue(str(eval(repr(cline))) == str(cline))
        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"), lambda rec: rec.id.replace(":", "_"))

        # Determine name of tree file
        if cline.newtree:
            tree_file = cline.newtree
        else:
            # Clustalw will name it based on the input file
            tree_file = os.path.splitext(cline.infile)[0] + ".dnd"

        # Mark generated files for later removal
        self.add_file_to_clean(cline.outfile)
        self.add_file_to_clean(tree_file)

        output, error = cline()
        self.assertTrue(output.strip().startswith("CLUSTAL"))
        self.assertTrue(error.strip() == "")

        # Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        # The length of the alignment will depend on the version of clustalw
        # (clustalw 2.1 and clustalw 1.83 are certainly different).
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        self.assertTrue(set(input_records.keys()) == set(output_records.keys()))
        for record in align:
            self.assertTrue(str(record.seq) == str(output_records[record.id].seq))
            self.assertTrue(str(record.seq).replace("-", "") == str(input_records[record.id].seq))

        # Check the DND file was created.
        # TODO - Try and parse this with Bio.Nexus?
        self.assertTrue(os.path.isfile(tree_file))
Example #22
def __main__():
    # Parse Command Line
    parser = optparse.OptionParser(
        usage="python %prog [options]\n\nProgram designed by Guillaume MARTIN : guillaume.martin@cirad.fr\n\n"
        "This program replace specified regions in the provided table file by X. These X will be used to split scaffold using SplitOnX.py"
        "The table file should be formated has in the example:"
        "scaffold83 93565 93586"
        "scaffold120 330181 330183"
        "scaffold120 380870 383428"
    )
    # Wrapper options.
    parser.add_option(
        "", "--table", dest="table", default="not_filled", help="The table file with region to convert to X"
    )
    parser.add_option("", "--fasta", dest="fasta", default="not_filled", help="The multifasta sequence file")
    parser.add_option(
        "", "--out", dest="out", default="X_converted.fasta", help="The output file name, [default: %default]"
    )
    (options, args) = parser.parse_args()

    if options.table == "not_filled":
        sys.exit("--table argument is missing")
    if options.fasta == "not_filled":
        sys.exit("--fasta argument is missing")

    # loading sequences
    record_dict = SeqIO.index(options.fasta, "fasta")
    file = open(options.table)
    dic = {}
    for line in file:
        data = line.split()
        if data:
            if data[0] in dic:
                if len(data) == 2:
                    dic[data[0]].add(int(data[1]) - 1)
                else:
                    i = int(data[1])
                    while i <= int(data[2]):
                        dic[data[0]].add(i - 1)
                        i += 1
            else:
                dic[data[0]] = set()
                if len(data) == 2:
                    dic[data[0]].add(int(data[1]) - 1)
                else:
                    i = int(data[1])
                    while i <= int(data[2]):
                        dic[data[0]].add(i - 1)
                        i += 1
    file.close()

    outfile = open(options.out, "w")
    for n in record_dict:
        if n in dic:
            sequence = list(str(record_dict[n].seq))
            for k in dic[n]:
                sequence[k] = "X"
            SeqIO.write(SeqRecord(Seq("".join(sequence), generic_dna), id=n, description=""), outfile, "fasta")
        else:
            SeqIO.write(SeqRecord(record_dict[n].seq, id=n, description=""), outfile, "fasta")
Example #23
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln:
        MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc:
        Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot:
        Protein sequences in FASTA format (.fasta)
    @return:
        Codon alignment in CLUSTAL format (.aln), suitable for codeml
    1"""
    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln, which
    # was reordered due to CLUSTALW2.  Note that the first protein in each of
    # these files remains the same as at the start, however; this first protein
    # is our original query protein.
    nuc_records = [record for record in SeqIO.parse(fname_nuc, "fasta")]
    prot_records = [record for record in SeqIO.parse(fname_prot, "fasta")]
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))
    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")
    fname_codon = "homologs.codon.aln"
    # TODO: use subprocess
    os.system("%s/pal2nal.pl %s %s -output paml > %s" % (bin_dir(), fname_aln, fname_nuc2, fname_codon))
    return fname_codon
Example #24
    def __format__(self, format_spec):
        """Returns the record as a string in the specified file format.

        This method supports the python format() function added in
        Python 2.6/3.0.  The format_spec should be a lower case string
        supported by Bio.SeqIO as an output file format. See also the
        SeqRecord's format() method.
        """
        if not format_spec:
            # Follow python convention and default to using __str__
            return str(self)
        from Bio import SeqIO

        if format_spec in SeqIO._BinaryFormats:
            # Return bytes on Python 3
            try:
                # This is in Python 2.6+, but we need it on Python 3
                from io import BytesIO

                handle = BytesIO()
            except ImportError:
                # Must be on Python 2.5 or older
                from StringIO import StringIO

                handle = StringIO()
        else:
            from StringIO import StringIO

            handle = StringIO()
        SeqIO.write(self, handle, format_spec)
        return handle.getvalue()
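With this __format__ method in place, the built-in format() renders a record through Bio.SeqIO; a small sketch with a toy record:

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec = SeqRecord(Seq("MKTAYIAKQR"), id="demo", description="toy protein")
print(format(rec, "fasta"))     # same text as rec.format("fasta")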
Example #25
 def test_genbank_to_embl(self):
     """Conversion of GenBank to EMBL."""
     filename = "GenBank/NC_005816.gb"
     old = SeqIO.read(filename, "gb")
     with open(filename) as handle:
         new = SeqIO.read(TogoWS.convert(handle, "genbank", "embl"), "embl")
     self.assertEqual(str(old.seq), str(new.seq))
Example #26
    def standard_test_procedure(self, cline):
        """Standard testing procedure used by all tests."""

        # Overwrite existing files.
        cline.force = True

        # Mark output files for later cleanup.
        self.add_file_to_clean(cline.outfile)
        if cline.guidetree_out:
            self.add_file_to_clean(cline.guidetree_out)

        input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
        self.assertEqual(str(eval(repr(cline))), str(cline))
        output, error = cline()
        self.assertTrue(not output or output.strip().startswith("CLUSTAL"))

        # Test if ClustalOmega executed successfully.
        self.assertTrue(
            error.strip() == ""
            or error.startswith("WARNING: Sequence type is DNA.")
            or error.startswith("WARNING: DNA alignment is still experimental.")
        )

        # Check the output...
        align = AlignIO.read(cline.outfile, "clustal")
        output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
        self.assertEqual(len(set(input_records.keys())), len(set(output_records.keys())))
        for record in align:
            self.assertEqual(str(record.seq), str(output_records[record.id].seq))

        # TODO - Try and parse this with Bio.Nexus?
        if cline.guidetree_out:
            self.assertTrue(os.path.isfile(cline.guidetree_out))
Example #27
 def test_fasta_out(self):
     """Check FASTQ to FASTA output"""
     records = SeqIO.parse("Quality/example.fastq", "fastq")
     h = StringIO()
     SeqIO.write(records, h, "fasta")
     with open("Quality/example.fasta") as expected:
         self.assertEqual(h.getvalue(), expected.read())
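The same FASTQ-to-FASTA conversion can also be written as a single SeqIO.convert call; a short sketch (output path illustrative):

from Bio import SeqIO

# "converted.fasta" is a made-up output path.
count = SeqIO.convert("Quality/example.fastq", "fastq", "converted.fasta", "fasta")
print(count, "records converted")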
Example #28
def write_seqrecords(seqs, fhand=None, file_format="fastq"):
    "It writes a stream of sequences to a file"
    if fhand is None:
        fhand = NamedTemporaryFile(suffix="." + file_format.replace("-", "_"))
    seqs = clean_seq_stream(seqs)
    SeqIO.write(seqs, fhand, file_format)
    return fhand
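An illustrative call, assuming the module's clean_seq_stream helper is importable alongside the function; an in-memory handle is passed here so nothing touches the disk:

from io import StringIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

recs = [SeqRecord(Seq("ACGT"), id="r1"), SeqRecord(Seq("GGCC"), id="r2")]
out = StringIO()
fhand = write_seqrecords(recs, fhand=out, file_format="fasta")
print(fhand.getvalue())         # FASTA text for the two toy records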
Example #29
def update_protein_db():
    proteins = get_protein_sequences()
    try:
        fasta_file = os.path.join(utils.PROTEIN_DB, "Proteins.fasta")
        count = SeqIO.write(proteins, fasta_file, "fasta")
    except OSError:
        print("creating proteins folder in correct place")
        utils.create_folders()
        fasta_file = os.path.join(utils.PROTEIN_DB, "Proteins.fasta")
        count = SeqIO.write(proteins, fasta_file, "fasta")
    if True:
        blast_db_command = [
            utils.BLAST_DIR + "makeblastdb",
            "-in",
            '"' + fasta_file + '"',
            "-dbtype",
            "prot",
            "-title",
            "Proteins",
            "-out",
            "%s" % fasta_file,
        ]
        print(blast_db_command)
    # else:
    #     blast_db_command = [BLAST_DIR + 'formatdb',
    #                 '-i', "\""+ fasta_file+ "\"",
    #                 '-o', 'T',
    #                 "-t", "Proteins"]
    #     print blast_db_command
    subprocess.check_call(blast_db_command)
Example #30
File: ks.py Project: rrane/jcvi
def run_mrtrans(align_fasta, recs, work_dir, outfmt="paml"):
    """Align nucleotide sequences with mrtrans and the protein alignment.
    """
    align_file = op.join(work_dir, "prot-align.fasta")
    nuc_file = op.join(work_dir, "nuc.fasta")
    output_file = op.join(work_dir, "nuc-align.mrtrans")

    # make the prot_align file and nucleotide file
    align_h0 = open(align_file + "0", "w")
    align_h0.write(str(align_fasta))
    align_h0.close()
    prot_seqs = {}
    i = 0
    for rec in SeqIO.parse(align_h0.name, "fasta"):
        prot_seqs[i] = rec.seq
        i += 1
    align_h = open(align_file, "w")
    for i, rec in enumerate(recs):
        if len(rec.id) > 30:
            rec.id = rec.id[:28] + "_" + str(i)
            rec.description = ""
        print(">{0}\n{1}".format(rec.id, prot_seqs[i]), file=align_h)
    align_h.close()
    SeqIO.write(recs, open(nuc_file, "w"), "fasta")

    # run the program
    cl = MrTransCommandline(align_file, nuc_file, output_file, outfmt=outfmt)
    r, e = cl.run()
    if e is None:
        print >>sys.stderr, "\tpal2nal:", cl
        return output_file
    elif e.read().find("could not translate") >= 0:
        print >>sys.stderr, "***pal2nal could not translate"
        return None