Example #1
def mergeMSAs(msa1, msa2, full_ref):
    """
    Given two MultipleSeqAlignment objects sharing a first (reference) sequence,
    merge them on the reference. Returns a MultipleSeqAlignment containing all
    the sequences from each alignment, in the alignment induced by the shared
    reference sequence.
    
    Also needs access to the full reference SeqRecord in case it needs bases to
    fill in a gap.
    
    The first sequence may actually be only a subrange in either MSA, and either
    MSA may be on either strand of it.
    
    Either MSA may be None, in which case the other MSA is returned.
    
    >>> ref = SeqRecord(Seq("ATATATATGCATATATAT"), "first")
    >>> ref.annotations = {"strand": 1, "start": 0, "size": 18, "srcSize": 18}
    >>> ref1 = SeqRecord(Seq("AT-ATATAT"), "first")
    >>> ref1.annotations = {"strand": 1, "start": 0, "size": 8, "srcSize": 18}
    >>> alt1 = SeqRecord(Seq("ATAATATAT"), "second")
    >>> alt1.annotations = {"strand": -1, "start": 0, "size": 9, "srcSize": 9}
    >>> ref2 = SeqRecord(Seq("ATATATAT--"), "first")
    >>> ref2.annotations = {"strand": -1, "start": 0, "size": 8, "srcSize": 18}
    >>> alt2 = SeqRecord(Seq("ATATGG--AT"), "third")
    >>> alt2.annotations = {"strand": 1, "start": 0, "size": 8, "srcSize": 8}
    
    >>> msa1 = Align.MultipleSeqAlignment([ref1, alt1])
    >>> msa2 = Align.MultipleSeqAlignment([ref2, alt2])
    >>> merged = mergeMSAs(msa1, msa2, ref)
    >>> print(merged)
    Alphabet() alignment with 3 rows and 21 columns
    AT-ATATATGC--ATATATAT first
    ATAATATAT------------ second
    -----------AT--CCATAT third
    >>> pprint.pprint(merged[0].annotations)
    {'size': 18, 'srcSize': 18, 'start': 0, 'strand': 1}
    >>> pprint.pprint(merged[1].annotations)
    {'size': 9, 'srcSize': 9, 'start': 0, 'strand': -1}
    >>> pprint.pprint(merged[2].annotations)
    {'size': 8, 'srcSize': 8, 'start': 0, 'strand': -1}
    
    
    >>> ref3 = SeqRecord(Seq("ATGCAT"), "first")
    >>> ref3.annotations = {"strand": 1, "start": 6, "size": 6, "srcSize": 18}
    >>> alt3 = SeqRecord(Seq("ATCCAT"), "fourth")
    >>> alt3.annotations = {"strand": 1, "start": 5, "size": 6, "srcSize": 15}
    >>> msa3 = Align.MultipleSeqAlignment([ref3, alt3])
    
    >>> merged2 = mergeMSAs(merged, msa3, ref)
    >>> print(merged2)
    Alphabet() alignment with 4 rows and 21 columns
    AT-ATATATGC--ATATATAT first
    ATAATATAT------------ second
    -----------AT--CCATAT third
    -------ATCC--AT------ fourth
    >>> pprint.pprint(merged2[0].annotations)
    {'size': 18, 'srcSize': 18, 'start': 0, 'strand': 1}
    >>> pprint.pprint(merged2[1].annotations)
    {'size': 9, 'srcSize': 9, 'start': 0, 'strand': -1}
    >>> pprint.pprint(merged2[2].annotations)
    {'size': 8, 'srcSize': 8, 'start': 0, 'strand': -1}
    >>> pprint.pprint(merged2[3].annotations)
    {'size': 6, 'srcSize': 15, 'start': 5, 'strand': 1}
    """

    if msa1 is None:
        # No merging to do.
        return msa2

    if msa2 is None:
        # No merging to do this way either.
        return msa1

    if msa1[0].annotations["strand"] == -1:
        # MSA 1 needs to be on the + strand of the reference
        msa1 = reverse_msa(msa1)

    if msa2[0].annotations["strand"] == -1:
        # MSA 2 also needs to be on the + strand of the reference
        msa2 = reverse_msa(msa2)

    if msa2[0].annotations["start"] < msa1[0].annotations["start"]:
        # msa2 starts before msa1. We want msa1 to start first, so we need to
        # flip them.
        msa1, msa2 = msa2, msa1

    logging.debug("Zipping {}bp/{} sequence and {}bp/{} sequence  reference "
                  "alignments".format(msa1[0].annotations["size"], len(msa1),
                                      msa2[0].annotations["size"], len(msa2)))

    # Make sure we are joining on the right sequence.
    assert (msa1[0].id == msa2[0].id)

    logging.debug("Merging")

    logging.debug(msa1)
    logging.debug(msa1[0].annotations)
    logging.debug(msa2)
    logging.debug(msa2[0].annotations)

    # Compute the offset: number of extra reference columns that msa2 needs in
    # front of it. This will always be positive or 0.
    msa2_leading_offset = (msa2[0].annotations["start"] -
                           msa1[0].annotations["start"])

    logging.debug("{}bp between left and right alignment starts".format(
        msa2_leading_offset))

    # It would be nice if we could shortcut by adjoining compatible alignments,
    # but the IDs wouldn't match up at all.

    # Make lists for each sequence we are going to build: those in msa1, and
    # then those in msa2 (except the duplicate reference).
    merged = [list() for i in range(len(msa1) + len(msa2) - 1)]

    # Start at the beginning of both alignments.
    msa1Pos = 0
    msa2Pos = 0

    # How many reference characters have been used?
    refChars = 0

    while refChars < msa2_leading_offset and msa1Pos < len(msa1[0]):
        # Until we're to the point that MSA 2 might have anything to say, we
        # just copy MSA 1.

        for i, character in enumerate(msa1[:, msa1Pos]):
            # For each character in the first alignment in this column

            # Put that character as the character for the appropriate
            # sequence.
            merged[i].append(character)

        for i in range(len(msa1), len(msa1) + len(msa2) - 1):
            # For each of the alignment rows that come from msa2, put a gap.
            merged[i].append("-")

        if msa1[0, msa1Pos] != "-":
            # We consumed a reference character.
            refChars += 1

        # We used some of MSA1
        msa1Pos += 1

    logging.debug("Used {}/{} offset".format(refChars, msa2_leading_offset))

    while refChars < msa2_leading_offset:
        # We have a gap between the first MSA and the second, and we need to
        # fill it with reference sequence.

        # We know we are refChars after the beginning of the first reference, so
        # we use that to know what base to put here.
        merged[0].append(full_ref[msa1[0].annotations["start"] + refChars])

        for i in range(1, len(msa1) + len(msa2) - 1):
            # And gap out all the other sequences.
            merged[i].append("-")

        # We consumed (or made up) a reference character
        refChars += 1

    while msa1Pos < len(msa1[0]) and msa2Pos < len(msa2[0]):
        # Until we hit the end of both sequences

        if refChars % 10000 == 0:
            logging.debug("Now at {} in alignment 1, {} in alignment 2, {} in "
                          "reference".format(msa1Pos, msa2Pos, refChars))

        if (msa1[0, msa1Pos] == "-"):
            # We have a gap in the first reference. Put this column from the
            # first alignment alongside a gap for every sequence in the second
            # alignment.
            for i, character in enumerate(msa1[:, msa1Pos]):
                # For each character in the first alignment in this column

                # Put that character as the character for the appropriate
                # sequence.
                merged[i].append(character)

            for i in range(len(msa1), len(msa1) + len(msa2) - 1):
                # For each of the alignment rows that come from msa2, put a gap.
                merged[i].append("-")

            # Advance in msa1. We'll keep doing this until it doesn't have a gap
            # in its reference.
            msa1Pos += 1

        elif (msa2[0, msa2Pos] == "-"):
            # We have a letter in the first reference but a gap in the second.
            # Gap out the merged reference and all the columns from alignment 1,
            # and take the non-reference characters from alignment 2.

            for i in range(len(msa1)):
                # For the reference and all the sequences in msa1, add gaps
                merged[i].append("-")

            for i, character in zip(
                    range(len(msa1),
                          len(msa1) + len(msa2) - 1), msa2[1:, msa2Pos]):

                # For each of the alignment rows that come from msa2, put the
                # character from that row.
                merged[i].append(character)

            # Advance in msa2. We'll keep doing this until both msa1 and msa2
            # have a non-gap character in their references. We make it an
            # invariant that this will always be the same character.
            msa2Pos += 1

        else:
            # Neither has a gap. They both have real characters.

            if (msa1[0, msa1Pos] != msa2[0, msa2Pos]):
                logging.error(msa1)
                logging.error(msa2)
                raise RuntimeError("{} in reference 1 does not match {} "
                                   "in reference 2".format(
                                       msa1[0, msa1Pos], msa2[0, msa2Pos]))

            for i, character in enumerate(msa1[:, msa1Pos]):
                # Copy all the characters from msa1's column
                merged[i].append(character)

            for character, i in zip(
                    msa2[1:, msa2Pos],
                    range(len(msa1),
                          len(msa1) + len(msa2) - 1)):
                # Copy all the characters from msa2's column, except its
                # reference
                merged[i].append(character)

            # Advance both alignments
            msa1Pos += 1
            msa2Pos += 1

            # Say we used a reference character
            refChars += 1

        for otherMerged in merged[1:]:
            # Make sure we aren't dropping characters anywhere.
            assert (len(otherMerged) == len(merged[0]))

    logging.debug("At {}/{} of msa2, {}/{} of msa1".format(
        msa2Pos, len(msa2[0]), msa1Pos, len(msa1[0])))

    # By here, we must have finished one of the MSAs. Only one can have anything
    # left.
    assert (msa1Pos == len(msa1[0]) or msa2Pos == len(msa2[0]))

    while msa1Pos < len(msa1[0]):
        # MSA2 finished first and now we have to finish up with the tail end of
        # MSA1

        for i, character in enumerate(msa1[:, msa1Pos]):
            # For each character in the first alignment in this column

            # Put that character as the character for the appropriate
            # sequence.
            merged[i].append(character)

        for i in range(len(msa1), len(msa1) + len(msa2) - 1):
            # For each of the alignment rows that come from msa2, put a gap.
            merged[i].append("-")

        # Advance in msa1, until we finish it.
        msa1Pos += 1

    while msa2Pos < len(msa2[0]):
        # MSA1 finished first and now we have to finish up with the tail end of
        # MSA2

        # For the reference, put whatever it has in MSA2
        merged[0].append(msa2[0][msa2Pos])

        for i in range(1, len(msa1)):
            # For all the sequences in msa1, add gaps
            merged[i].append("-")

        for i, character in zip(range(len(msa1),
                                      len(msa1) + len(msa2) - 1),
                                msa2[1:, msa2Pos]):

            # For each of the alignment rows that come from msa2, put the
            # character from that row.
            merged[i].append(character)

        # Advance in msa2, until we finish it.
        msa2Pos += 1

    # Now we have finished populating these aligned lists. We need to make a
    # MultipleSeqAlignment from them.

    # What names do the sequences in this alignment have? All the ones from
    # msa1, and then all the ones from msa2 except the first (which is the
    # reference)
    seqNames = [record.id
                for record in msa1] + [record.id for record in msa2[1:]]

    # Make a SeqRecord for each list of properly gapped-out characters, with the
    # appropriate name.
    seqRecords = [
        SeqRecord(Seq("".join(alignedList)), name)
        for alignedList, name in zip(merged, seqNames)
    ]

    # Make the records into a proper MSA
    merged = Align.MultipleSeqAlignment(seqRecords)

    # Do the annotations for the reference
    merged[0].annotations.update(msa1[0].annotations)
    # Calculate the total reference bases used. It will be the distance between
    # the rightmost alignment end and the start of msa1, along the reference.
    merged[0].annotations["size"] = (
        max(msa2[0].annotations["start"] + msa2[0].annotations["size"],
            msa1[0].annotations["start"] + msa1[0].annotations["size"]) -
        msa1[0].annotations["start"])

    for i in range(1, len(msa1)):
        # Copy over annotations from MSA1
        merged[i].annotations.update(msa1[i].annotations)

    for i in range(len(msa1), len(msa1) + len(msa2) - 1):
        # Copy over annotations from MSA2, starting after the reference.
        merged[i].annotations.update(msa2[i - len(msa1) + 1].annotations)

    # The merged reference should be at least as long as either input reference.
    #assert(len(merged[0]) >= len(msa1[0]))
    #assert(len(merged[0]) >= len(msa2[0]))

    # Give back the merged MSA
    return merged
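Note: mergeMSAs calls a reverse_msa helper that is not part of this example. A minimal sketch of what such a helper might look like, assuming the MAF-style annotations (strand, start, size, srcSize) used in the doctests above:

from Bio import Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def reverse_msa(msa):
    # Hypothetical helper: flip an alignment onto the opposite strand.
    # Reverse-complementing every row (gaps are preserved) also reverses
    # the column order, so the rows stay aligned with each other.
    flipped = []
    for record in msa:
        rc = SeqRecord(record.seq.reverse_complement(), id=record.id)
        annotations = dict(record.annotations)
        annotations["strand"] = -annotations["strand"]
        # MAF starts are strand-relative: recompute from the other end.
        annotations["start"] = (annotations["srcSize"] -
                                annotations["start"] -
                                annotations["size"])
        rc.annotations = annotations
        flipped.append(rc)
    return Align.MultipleSeqAlignment(flipped)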
Example #2
    def __next__(self):
        try:
            line = self._header
            del self._header
        except AttributeError:
            line = self.handle.readline()
        if not line:
            # Empty file - just give up.
            raise StopIteration
        if line.strip() != '# STOCKHOLM 1.0':
            raise ValueError("Did not find STOCKHOLM header")

        # Note: If this file follows the PFAM conventions, there should be
        # a line containing the number of sequences, e.g. "#=GF SQ 67"
        # We do not check for this - perhaps we should, and verify that
        # if present it agrees with our parsing.

        seqs = {}
        ids = []
        gs = {}
        gr = {}
        gf = {}
        passed_end_alignment = False
        while True:
            line = self.handle.readline()
            if not line:
                break  # end of file
            line = line.strip()  # remove trailing \n
            if line == '# STOCKHOLM 1.0':
                self._header = line
                break
            elif line == "//":
                # The "//" line indicates the end of the alignment.
                # There may still be more meta-data
                passed_end_alignment = True
            elif line == "":
                # blank line, ignore
                pass
            elif line[0] != "#":
                # Sequence
                # Format: "<seqname> <sequence>"
                assert not passed_end_alignment
                parts = [x.strip() for x in line.split(" ", 1)]
                if len(parts) != 2:
                    # This might be someone attempting to store a zero length sequence?
                    raise ValueError("Could not split line into identifier " +
                                     "and sequence:\n" + line)
                id, seq = parts
                if id not in ids:
                    ids.append(id)
                seqs.setdefault(id, '')
                seqs[id] += seq.replace(".", "-")
            elif len(line) >= 5:
                # Comment line or meta-data
                if line[:5] == "#=GF ":
                    # Generic per-File annotation, free text
                    # Format: #=GF <feature> <free text>
                    feature, text = line[5:].strip().split(None, 1)
                    # Each feature key could be used more than once,
                    # so store the entries as a list of strings.
                    if feature not in gf:
                        gf[feature] = [text]
                    else:
                        gf[feature].append(text)
                elif line[:5] == '#=GC ':
                    # Generic per-Column annotation, exactly 1 char per column
                    # Format: "#=GC <feature> <exactly 1 char per column>"
                    pass
                elif line[:5] == '#=GS ':
                    # Generic per-Sequence annotation, free text
                    # Format: "#=GS <seqname> <feature> <free text>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gs:
                        gs[id] = {}
                    if feature not in gs[id]:
                        gs[id][feature] = [text]
                    else:
                        gs[id][feature].append(text)
                elif line[:5] == "#=GR ":
                    # Generic per-Sequence AND per-Column markup
                    # Format: "#=GR <seqname> <feature> <exactly 1 char per column>"
                    id, feature, text = line[5:].strip().split(None, 2)
                    # if id not in ids:
                    #    ids.append(id)
                    if id not in gr:
                        gr[id] = {}
                    if feature not in gr[id]:
                        gr[id][feature] = ""
                    gr[id][feature] += text.strip()  # append to any previous entry
                    # TODO - Should we check the length matches the alignment length?
                    #       For interlaced sequences the GR data can be split over
                    #       multiple lines
            # Next line...

        assert len(seqs) <= len(ids)
        # assert len(gs)   <= len(ids)
        # assert len(gr)   <= len(ids)

        self.ids = ids
        self.sequences = seqs
        self.seq_annotation = gs
        self.seq_col_annotation = gr

        if ids and seqs:

            if (self.records_per_alignment is not None
                    and self.records_per_alignment != len(ids)):
                raise ValueError(
                    "Found %i records in this alignment, told to expect %i" %
                    (len(ids), self.records_per_alignment))

            alignment_length = len(list(seqs.values())[0])
            records = []  # Alignment obj will put them all in a list anyway
            for id in ids:
                seq = seqs[id]
                if alignment_length != len(seq):
                    raise ValueError(
                        "Sequences have different lengths, or repeated identifier"
                    )
                name, start, end = self._identifier_split(id)
                record = SeqRecord(Seq(seq, self.alphabet),
                                   id=id,
                                   name=name,
                                   description=id,
                                   annotations={"accession": name})
                # Accession will be overridden by _populate_meta_data if an explicit
                # accession is provided:
                record.annotations["accession"] = name

                if start is not None:
                    record.annotations["start"] = start
                if end is not None:
                    record.annotations["end"] = end

                self._populate_meta_data(id, record)
                records.append(record)
            alignment = MultipleSeqAlignment(records, self.alphabet)

            # TODO - Introduce an annotated alignment class?
            # For now, store the annotation a new private property:
            alignment._annotations = gr

            return alignment
        else:
            raise StopIteration
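For reference, an iterator like this one backs Biopython's Stockholm support; a minimal usage sketch through the public Bio.AlignIO API (the alignment text is illustrative):

from io import StringIO

from Bio import AlignIO

stockholm_text = """\
# STOCKHOLM 1.0
seq_a ACDEF-HIK
seq_b ACDEF-HLK
//
"""

# "." gap characters are normalized to "-" during parsing, as above.
alignment = AlignIO.read(StringIO(stockholm_text), "stockholm")
for record in alignment:
    print(record.id, record.seq)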
Example #3
def cal_tm_bond(tri_seq, temp_seq, C_Na, C_Mg, C_Strand):
    # Melting temperature of the triplex-forming oligo against the template,
    # and of the template against itself (the trailing 0.00008 is presumably
    # a concentration parameter expected by TmDeltaG.calTm).
    tm1 = TmDeltaG.calTm(tri_seq, temp_seq, C_Na, C_Mg, C_Strand, 0.00008)
    tm2 = TmDeltaG.calTm(temp_seq, temp_seq, C_Na, C_Mg, C_Strand, 0.00008)
    # Secondary-structure summary: the first whitespace-separated field of the
    # SecStructures string representation.
    bond = str(SecStructures_jf4.SecStructures(SeqRecord(Seq(tri_seq)),
                                               SeqRecord(Seq(temp_seq)))).split()[0]
    return [tm1, tm2, bond]
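A hedged usage sketch; TmDeltaG and SecStructures_jf4 are project-local modules, and the sequences and concentration values below are purely illustrative:

# Hypothetical call. The result unpacks as the oligo/template Tm, the
# template self-Tm, and the secondary-structure summary field.
tm1, tm2, bond = cal_tm_bond("AGGAAGGAGG",  # triplex-forming oligo
                             "CCTCCTTCCT",  # template strand
                             0.05,          # C_Na
                             0.002,         # C_Mg
                             0.00000025)    # C_Strand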
Example #4
def adapter_find(reference_database, reads, threads, max_intron_length,
                 working_dir, verbose):
    subset_fasta = reads + "subset.10000.fasta"
    with open(subset_fasta, "w") as fh:
        for rec in SeqIO.parse(reads, "fasta"):
            if int(rec.id) < 10000:
                SeqIO.write(rec, fh, "fasta")

    bam = mapping.minimap(reference_database, subset_fasta, threads,
                          max_intron_length, working_dir, verbose)
    #soft_clip_regions = soft_clip(bam)
    fasta_gz = bam + ".fasta"
    cmd = "extractSoftclipped %s | zcat | fastqToFa /dev/stdin %s" % (bam,
                                                                      fasta_gz)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    extract_clip = subprocess.Popen(cmd, cwd=working_dir, shell=True)
    extract_clip.communicate()

    list_short = []
    list_long = []
    dict_uniq = {}
    with open(fasta_gz, "r") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            name_seq = str(rec.id)
            name = name_seq.split("_")[0]
            if name in dict_uniq:
                if len(dict_uniq[name].seq) > len(rec.seq):
                    list_long.append(dict_uniq[name])
                    list_short.append(rec)
                else:
                    list_short.append(dict_uniq[name])
                    list_long.append(rec)
            else:
                dict_uniq[name] = rec
    long_file = fasta_gz + ".long.fasta"
    with open(long_file, "w") as fh:
        SeqIO.write(list_long, fh, "fasta")
    short_file = fasta_gz + ".short.fasta"
    with open(short_file, "w") as fh:
        SeqIO.write(list_short, fh, "fasta")
    list_file_clip = [(long_file, "long"), (short_file, "short")]
    for clip_file in list_file_clip:
        kmer_start = 21
        list_kmer = []
        while kmer_start < 120:
            cmd = "jellyfish count -s 10000000 -m %s -o %s.%s.kmer %s" % (
                kmer_start, kmer_start, clip_file[1], clip_file[0])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_count = subprocess.Popen(cmd, cwd=working_dir, shell=True)
            jelly_count.communicate()
            cmd = "jellyfish dump -L 2 -ct %s.%s.kmer | sort -k2n | tail -n 1" % (
                kmer_start, clip_file[1])
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % cmd)
            jelly_dump = subprocess.Popen(cmd,
                                          cwd=working_dir,
                                          stdout=subprocess.PIPE,
                                          shell=True)
            out_dump = jelly_dump.communicate()[0].decode('utf-8')
            mer = out_dump.split("\t")[0]
            a_count = mer.count("A")
            t_count = mer.count("T")
            if a_count > t_count:
                bias_count = a_count
            else:
                bias_count = t_count
            data_kmer = (kmer_start, mer, GC(mer),
                         (bias_count / kmer_start) * 100,
                         (bias_count / kmer_start) * 100 - GC(mer))
            list_kmer.append(data_kmer)
            kmer_start += 5
        value_adapter = 0
        kmer_done = ""
        for i in list_kmer:
            # Keep the k-mer whose A/T bias most exceeds its GC content.
            if i[4] > value_adapter:
                value_adapter = i[4]
                kmer_done = i[1]

    adapter_file = os.path.join(working_dir, "adapter.fasta")
    if value_adapter > 0:
        with open(adapter_file, "w") as fh:
            record = SeqRecord(Seq(str(kmer_done)), id="adapter")
            SeqIO.write(record, fh, "fasta")
    return adapter_file
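A hedged invocation sketch; it assumes the project's mapping module and the external extractSoftclipped and jellyfish binaries are installed, and all paths are illustrative:

# Hypothetical call: scans soft-clipped read ends for an over-represented,
# A/T-biased k-mer and writes it to adapter.fasta under working_dir.
adapter_fasta = adapter_find(reference_database="genome.fasta",
                             reads="reads.fasta",
                             threads=4,
                             max_intron_length=70000,
                             working_dir="run_dir",
                             verbose=True)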
Example #5
def __generate_sequence_profiles_old():
    mtx_dir_name = 'pssm_deltablast'
    DB_INDEX = SeqIO.index('data/scop40_structural_alignment.fasta', 'fasta')
    records = {}
    for i in DB_INDEX:
        domkey = i.split('&')[0]
        records[domkey] = SeqRecord(DB_INDEX[i].seq.ungap('-'),
                                    id=domkey,
                                    name='',
                                    description='')
    with Path('data/scop40_scopdom_pdbatom_seq.fasta').open('w') as f:
        SeqIO.write(records.values(), f, 'fasta')

    DB_INDEX = SeqIO.index('data/scop40_scopdom_pdbatom_seq.fasta', 'fasta')
    for sid in tqdm(list(DB_INDEX)):
        mtx_dir = Path(f'data/{mtx_dir_name}/{sid[2:4]}')
        mtx_dir.mkdir(exist_ok=True, parents=True)
        mtx_file = mtx_dir / f'{sid}.mtx'
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            SeqIO.write(DB_INDEX[sid], f'{sid}.fasta', 'fasta')
            NcbipsiblastCommandline(query=f'{sid}.fasta',
                                    db='uniref90',
                                    num_threads=int(os.cpu_count()),
                                    num_iterations=3,
                                    out_ascii_pssm=mtx_file.as_posix(),
                                    save_pssm_after_last_round=True)()
        except Exception as e:
            logging.exception(e)
            continue
        finally:
            if Path(f'{sid}.fasta').exists():
                Path(f'{sid}.fasta').unlink()

    logging.info('')
    for sid in tqdm(
            pickle.load(Path('data/one_domain_superfamily.pkl').open('rb'))):
        mtx_dir = Path(f'data/{mtx_dir_name}/{sid[2:4]}')
        mtx_dir.mkdir(exist_ok=True, parents=True)
        mtx_file = mtx_dir / f'{sid}.mtx'
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            tmalign = TMalignCommandLine(f'data/scop_e/{sid[2:4]}/{sid}.ent',
                                         f'data/scop_e/{sid[2:4]}/{sid}.ent')
            tmalign.run()
            assert str(tmalign.alignment[0].seq).find('-') == -1
            SeqIO.write(tmalign.alignment[0], f'{sid}.fasta', 'fasta')
            NcbipsiblastCommandline(query=f'{sid}.fasta',
                                    db='uniref90',
                                    num_threads=int(os.cpu_count()),
                                    num_iterations=3,
                                    out_ascii_pssm=mtx_file.as_posix(),
                                    save_pssm_after_last_round=True)()
        except Exception as e:
            logging.error(f'sid={sid}')
            logging.exception(e)
            continue
        finally:
            if Path(f'{sid}.fasta').exists():
                Path(f'{sid}.fasta').unlink()
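The function above assumes module-level imports along these lines; TMalignCommandLine is a project-local wrapper around TM-align, not part of Biopython:

import logging
import os
import pickle
from pathlib import Path

from tqdm import tqdm

from Bio import SeqIO
from Bio.Blast.Applications import NcbipsiblastCommandline
from Bio.SeqRecord import SeqRecord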
Example #6
# python3
import argparse
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from dnachisel import *
# input parameters
ap = argparse.ArgumentParser()
ap.add_argument("-fa", "--fasta", required=True, help="input single or multi fasta file")
ap.add_argument("-org","--organism", required=True, help="organism to input(use either the names of the genomes avaliable on dnachisel or use the taxid of the organisms in http://www.kazusa.or.jp/codon/)")
ap.add_argument("-opt","--optimized", required=True, help="optimized fasta file")
args = vars(ap.parse_args())
# main
optimized_seqs = [] # setup an empty list
for record in SeqIO.parse(args['fasta'], "fasta"):
    problem = DnaOptimizationProblem(
        sequence=str(record.seq),
        constraints=[EnforceTranslation()],
        objectives=[CodonOptimize(species=args['organism'])])
    problem.optimize()
    # add this record to the list
    optimized_seqs.append(SeqRecord(Seq(problem.sequence),id=record.id,description=""))
# export to fasta
SeqIO.write(optimized_seqs, args['optimized'], "fasta")
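A hedged command-line sketch for this script (the script and file names are illustrative):

# python codon_optimize.py -fa input.fasta -org e_coli -opt optimized.fasta
# -org takes a genome name known to dnachisel (e.g. e_coli) or a Kazusa taxid.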

Example #7
def get_structure_seqrecords(model):
    """Get a dictionary of a PDB file's sequences.

    Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - HETATMs. Currently written as an "X", or unknown amino acid.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords

    """

    structure_seq_records = []

    # Loop over each chain of the PDB
    for chain in model:
        tracker = 0
        chain_seq = ''
        chain_resnums = []

        # Loop over the residues
        for res in chain.get_residues():
            # NOTE: you can get the residue number too
            res_id = res.id
            res_num = res_id[1]
            res_icode = res_id[2]

            # Double-check that the residue is a standard amino acid.
            # If it is not (e.g. a selenomethionine HETATM), it will be
            # filled in with an X on the next iteration.
            if Polypeptide.is_aa(res, standard=True):
                end_tracker = res_num
                res_aa_one = Polypeptide.three_to_one(res.get_resname())

                # Tracker to fill in X's
                if end_tracker != (tracker + 1):
                    if res_icode != ' ':
                        chain_seq += res_aa_one
                        chain_resnums.append(res_num)
                        tracker = end_tracker + 1
                        continue
                    else:
                        multiplier = (end_tracker - tracker - 1)
                        chain_seq += 'X' * multiplier
                        # Residue numbers for unresolved or nonstandard residues are Infinite
                        chain_resnums.extend([float("Inf")] * multiplier)

                chain_seq += res_aa_one
                chain_resnums.append(res_num)
                tracker = end_tracker

            else:
                continue

        chain_seq_record = SeqRecord(Seq(chain_seq, IUPAC.protein),
                                     id=chain.get_id())
        chain_seq_record.letter_annotations[
            'structure_resnums'] = chain_resnums
        structure_seq_records.append(chain_seq_record)

    return structure_seq_records
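A hedged usage sketch with Bio.PDB (the file name is illustrative):

from Bio.PDB import PDBParser

parser = PDBParser(QUIET=True)
structure = parser.get_structure("9LPR", "9lpr.pdb")

# One SeqRecord per chain of the first model, with residue numbers kept
# in letter_annotations["structure_resnums"].
for rec in get_structure_seqrecords(structure[0]):
    print(rec.id, rec.seq)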
Example #8
    part_list = []
    for kid in data.kids[rec.id]:
        for sub in data.ids[kid]:
            #assert sub.strand == rec.strand
            if sub.type == required_type: part_list.append(sub)

    #sort into order by start bp
    part_list.sort(key=lambda sub: sub.start)

    #assemble exons
    seq = ''
    for sub in part_list:
        seq += str(seq_dict[rec.seqid].seq[sub.start:sub.end])

    #ignore if length zero
    if len(seq) == 0: continue
    if seq.upper().count('N') == len(seq): continue

    seq = Seq(seq)
    if rec.strand == '-': seq = seq.reverse_complement()
    if conf.protein:
        seq = translate(seq)
        if len(seq) == 0: continue
    newrec = SeqRecord(seq, id=rec.id, description='')

    #write out to file
    SeqIO.write(newrec, fout, "fasta")

if conf.out != 'STDOUT':
    fout.close()
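The fragment above runs inside a loop over GFF-style parent records; a hedged sketch of the context it assumes (everything here is hypothetical except the Biopython imports — data, rec, conf, required_type, and the translate helper come from the full script):

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# seq_dict maps sequence ids to genome records; fout receives the output.
seq_dict = SeqIO.to_dict(SeqIO.parse("genome.fasta", "fasta"))
fout = open("features.fasta", "w")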
Example #9
    if args.subsample < len(alignments):
        alignments = random.sample(alignments, args.subsample)
    else:
        raise Exception('Number to subsample must be smaller than number of loci available!')

# now make output
try:
    os.makedirs(outname)
except OSError:
    pass

if args.complete:  # add missing species to each alignment
    all_taxa = []
    for alignment in alignments:
        # first get a list of all taxa across alignments
        all_taxa.extend([record.id for record in alignment])
    all_taxa = set(all_taxa)

    for alignment in alignments:
        this_taxa = set([record.id for record in alignment])
        missing_taxa = all_taxa - this_taxa
        al_len = alignment.get_alignment_length()

        if missing_taxa:
            sys.stderr.write('Adding ' + str(len(missing_taxa)) + ' missing taxa\n')
            seqrecs = [SeqRecord(Seq('N' * al_len, IUPACAmbiguousDNA()), id=tx)
                       for tx in missing_taxa]
            seqs_to_add = MultipleSeqAlignment(seqrecs)
            alignment.extend(seqs_to_add)

for i, alignment in enumerate(alignments):
    AlignIO.write(alignment, outname + '/' + outname + '_' + str(i) + ".nex", "nexus")
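This fragment likewise assumes module-level imports along these lines (IUPACAmbiguousDNA comes from the legacy Bio.Alphabet module, which was removed in Biopython 1.78):

import os
import random
import sys

from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Alphabet.IUPAC import IUPACAmbiguousDNA  # legacy API, pre-1.78
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord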
Example #10
def build_target_info(
    base_dir,
    info,
    all_index_locations,
    defer_HA_identification=False,
    offtargets=False,
):
    ''' info should have keys:
            sgRNA_sequence
            amplicon_primers
        optional keys:
            donor_sequence
            nonhomologous_donor_sequence
            extra_sequences
            effector
    '''
    genome = info['genome']
    if info['genome'] not in all_index_locations:
        print(f'Error: can\'t locate indices for {genome}')
        sys.exit(1)
    else:
        index_locations = all_index_locations[genome]

    base_dir = Path(base_dir)

    name = info['name']

    donor_info = info.get('donor_sequence')
    if donor_info is None:
        donor_name = None
        donor_seq = None
    else:
        donor_name, donor_seq = donor_info
        if donor_name is None:
            donor_name = f'{name}_donor'

    if donor_seq is None:
        has_donor = False
    else:
        has_donor = True

    if info['donor_type'] is None:
        donor_type = None
    else:
        _, donor_type = info['donor_type']

    nh_donor_info = info.get('nonhomologous_donor_sequence')
    if nh_donor_info is None:
        nh_donor_name = None
        nh_donor_seq = None
    else:
        nh_donor_name, nh_donor_seq = nh_donor_info
        if nh_donor_name is None:
            nh_donor_name = f'{name}_NH_donor'

    if nh_donor_seq is None:
        has_nh_donor = False
    else:
        has_nh_donor = True

    target_dir = base_dir / 'targets' / name
    target_dir.mkdir(parents=True, exist_ok=True)

    protospacer, *other_protospacers = info['sgRNA_sequence']
    primers_name, primers = info['amplicon_primers']
    primers = primers.split(';')

    if primers_name is None:
        target_name = name
    else:
        target_name = primers_name

    protospacer_dir = target_dir / 'protospacer_alignment'
    protospacer_dir.mkdir(exist_ok=True)
    fastq_fn = protospacer_dir / 'protospacer.fastq'
    STAR_prefix = protospacer_dir / 'protospacer_'
    bam_fn = protospacer_dir / 'protospacer.bam'

    STAR_index = index_locations['STAR']

    gb_fns = {
        'target': target_dir / f'{target_name}.gb',
        'donor': target_dir / f'{donor_name}.gb',
        'nh_donor': target_dir / f'{nh_donor_name}.gb',
    }

    # Make a fastq file with a single read containing the protospacer sequence.
    protospacer_name, protospacer_seq = protospacer

    with fastq_fn.open('w') as fh:
        quals = fastq.encode_sanger([40] * len(protospacer_seq))
        read = fastq.Read('protospacer', protospacer_seq, quals)
        fh.write(str(read))

    # Align the protospacer to the reference genome.
    mapping_tools.map_STAR(fastq_fn,
                           STAR_index,
                           STAR_prefix,
                           mode='guide_alignment',
                           bam_fn=bam_fn,
                           sort=False)

    with pysam.AlignmentFile(bam_fn) as bam_fh:
        perfect_als = [
            al for al in bam_fh
            if not al.is_unmapped and sam.total_edit_distance(al) == 0
        ]
        imperfect_als = [al for al in bam_fh if not al.is_unmapped]

    region_fetcher = genomes.build_region_fetcher(index_locations['fasta'])

    def evaluate_candidate(al):
        results = {
            'location':
            f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
        }

        full_window_around = 5000

        full_around = region_fetcher(
            al.reference_name, al.reference_start - full_window_around,
            al.reference_end + full_window_around).upper()

        if sam.get_strand(al) == '+':
            ps_seq = protospacer_seq
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(protospacer_seq)
            ps_strand = -1

        ps_start = full_around.index(ps_seq)

        protospacer_locations = [(protospacer_name, ps_seq, ps_start,
                                  ps_strand)]

        for other_protospacer_name, other_protospacer_seq in other_protospacers:

            # Initial G may not match genome.
            if other_protospacer_seq.startswith('G'):
                other_protospacer_seq = other_protospacer_seq[1:]

            if other_protospacer_seq in full_around:
                ps_seq = other_protospacer_seq
                ps_strand = 1
            else:
                ps_seq = utilities.reverse_complement(other_protospacer_seq)
                if ps_seq not in full_around:
                    results[
                        'failed'] = f'protospacer {other_protospacer_seq} not present near protospacer {protospacer_seq}'
                    return results
                ps_strand = -1

            ps_start = full_around.index(ps_seq)
            protospacer_locations.append(
                (other_protospacer_name, ps_seq, ps_start, ps_strand))

        if 'effector' in info:
            effector_type = info['effector']
        else:
            if donor_type == 'pegRNA':
                effector_type = 'SpCas9H840A'
            else:
                effector_type = 'SpCas9'

        effector = target_info.effectors[effector_type]

        for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
            PAM_pattern = effector.PAM_pattern

            if (ps_strand == 1 and effector.PAM_side
                    == 3) or (ps_strand == -1 and effector.PAM_side == 5):
                PAM_offset = len(ps_seq)
                PAM_transform = utilities.identity
            else:
                PAM_offset = -len(PAM_pattern)
                PAM_transform = utilities.reverse_complement

            PAM_start = ps_start + PAM_offset
            PAM = PAM_transform(full_around[PAM_start:PAM_start +
                                            len(PAM_pattern)])
            pattern, *matches = Bio.SeqUtils.nt_search(PAM, PAM_pattern)

            if 0 not in matches and not offtargets:
                # Note: this could incorrectly fail if there are multiple exact matches for an other_protospacer
                # in full_around.
                results[
                    'failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
                return results

        if primers[0] in full_around:
            leftmost_primer = primers[0]
            rightmost_primer = utilities.reverse_complement(primers[1])
            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            leftmost_primer_name = 'forward_primer'
            rightmost_primer_name = 'reverse_primer'

        else:
            leftmost_primer = primers[1]
            rightmost_primer = utilities.reverse_complement(primers[0])

            if leftmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[0]} not present near protospacer'
                return results

            leftmost_primer_name = 'reverse_primer'
            rightmost_primer_name = 'forward_primer'

        leftmost_start = full_around.index(leftmost_primer)
        rightmost_start = full_around.index(rightmost_primer)

        if leftmost_start >= rightmost_start:
            results['failed'] = "primers don't flank protospacer"
            return results

        # Now that primers have been located, redefine the target sequence to include a fixed
        # window on either side of the primers.

        final_window_around = 500

        offset = leftmost_start - final_window_around

        final_start = leftmost_start - final_window_around
        final_end = rightmost_start + len(
            rightmost_primer) + final_window_around

        target_seq = full_around[final_start:final_end]

        leftmost_location = FeatureLocation(leftmost_start - offset,
                                            leftmost_start - offset +
                                            len(leftmost_primer),
                                            strand=1)
        rightmost_location = FeatureLocation(rightmost_start - offset,
                                             rightmost_start - offset +
                                             len(rightmost_primer),
                                             strand=-1)

        colors = {
            'HA_1': '#c7b0e3',
            'HA_RT': '#c7b0e3',
            'HA_2': '#85dae9',
            'HA_PBS': '#85dae9',
            'forward_primer': '#75C6A9',
            'reverse_primer': '#9eafd2',
            'sgRNA': '#c6c9d1',
            'donor_specific': '#b1ff67',
            'PCR_adapter_1': '#F8D3A9',
            'PCR_adapter_2': '#D59687',
            'protospacer': '#ff9ccd',
            'scaffold': '#b7e6d7',
        }

        target_features = [
            SeqFeature(
                location=leftmost_location,
                id=leftmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': leftmost_primer_name,
                    'ApEinfo_fwdcolor': colors[leftmost_primer_name],
                },
            ),
            SeqFeature(
                location=rightmost_location,
                id=rightmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': rightmost_primer_name,
                    'ApEinfo_fwdcolor': colors[rightmost_primer_name],
                },
            ),
        ]

        if leftmost_primer_name == 'forward_primer':
            start = leftmost_start - offset
            start_location = FeatureLocation(start, start + 5, strand=1)
        else:
            start = rightmost_start - offset + len(rightmost_primer) - 5
            start_location = FeatureLocation(start, start + 5, strand=-1)

        target_features.extend([
            SeqFeature(
                location=start_location,
                id='sequencing_start',
                type='misc_feature',
                qualifiers={
                    'label': 'sequencing_start',
                },
            ),
            SeqFeature(
                location=start_location,
                id='anchor',
                type='misc_feature',
                qualifiers={
                    'label': 'anchor',
                },
            ),
        ])

        sgRNA_features = []
        for sgRNA_i, (ps_name, ps_seq, ps_start,
                      ps_strand) in enumerate(protospacer_locations):
            sgRNA_feature = SeqFeature(
                location=FeatureLocation(ps_start - offset,
                                         ps_start - offset + len(ps_seq),
                                         strand=ps_strand),
                id=f'sgRNA_{ps_name}',
                type=f'sgRNA_{effector.name}',
                qualifiers={
                    'label': f'sgRNA_{ps_name}',
                    'ApEinfo_fwdcolor': colors['sgRNA'],
                },
            )
            target_features.append(sgRNA_feature)
            sgRNA_features.append(sgRNA_feature)

        results['gb_Records'] = {}

        if has_donor:
            if not defer_HA_identification:
                # If multiple sgRNAs are given, the edited one must be listed first.
                sgRNA_feature = sgRNA_features[0]

                cut_after_offset = [
                    offset for offset in effector.cut_after_offset
                    if offset is not None
                ][0]

                if sgRNA_feature.strand == 1:
                    # sgRNA_feature.end is the first nt of the PAM
                    cut_after = sgRNA_feature.location.end + cut_after_offset
                else:
                    # sgRNA_feature.start - 1 is the first nt of the PAM
                    cut_after = sgRNA_feature.location.start - 1 - cut_after_offset - 1

                if donor_type == 'pegRNA':
                    HA_info = identify_pegRNA_homology_arms(
                        donor_seq, target_seq, cut_after, protospacer_seq,
                        colors)
                else:
                    HA_info = identify_homology_arms(donor_seq, donor_type,
                                                     target_seq, cut_after,
                                                     colors)

                if 'failed' in HA_info:
                    results['failed'] = HA_info['failed']
                    return results

                donor_Seq = Seq(HA_info['possibly_flipped_donor_seq'])
                donor_features = HA_info['donor_features']
                target_features.extend(HA_info['target_features'])

            else:
                donor_Seq = Seq(donor_seq)
                donor_features = []

            donor_Record = SeqRecord(donor_Seq,
                                     name=donor_name,
                                     features=donor_features,
                                     annotations={'molecule_type': 'DNA'})
            results['gb_Records']['donor'] = donor_Record

        target_Seq = Seq(target_seq)
        target_Record = SeqRecord(target_Seq,
                                  name=target_name,
                                  features=target_features,
                                  annotations={'molecule_type': 'DNA'})
        results['gb_Records']['target'] = target_Record

        if has_nh_donor:
            nh_donor_Seq = Seq(nh_donor_seq)
            nh_donor_Record = SeqRecord(nh_donor_Seq,
                                        name=nh_donor_name,
                                        annotations={'molecule_type': 'DNA'})
            results['gb_Records']['nh_donor'] = nh_donor_Record

        return results

    good_candidates = []
    bad_candidates = []

    for al in perfect_als:
        results = evaluate_candidate(al)
        if 'failed' in results:
            bad_candidates.append(results)
        else:
            good_candidates.append(results)

    if len(good_candidates) == 0:
        if len(bad_candidates) == 0:
            print(
                f'Error building {name}: no perfect matches to sgRNA {protospacer} found in {genome}'
            )
            print(imperfect_als)
            return

        else:
            print(
                f'Error building {name}: no valid genomic locations for {name}'
            )

            for results in bad_candidates:
                print(f'\t{results["location"]}: {results["failed"]}')

            return

    elif len(good_candidates) > 1:
        print(f'Warning: multiple valid genomic locations for {name}:')
        for results in good_candidates:
            print(f'\t{results["location"]}')
        best_candidate = good_candidates[0]
        print(f'Arbitrarily choosing {best_candidate["location"]}')
    else:
        best_candidate = good_candidates[0]

    truncated_name_i = 0
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=BiopythonWarning)

        for which_seq, Record in best_candidate['gb_Records'].items():
            try:
                Bio.SeqIO.write(Record, gb_fns[which_seq], 'genbank')
            except ValueError:
                # locus line too long, can't write genbank file with BioPython
                old_name = Record.name

                truncated_name = f'{Record.name[:11]}_{truncated_name_i}'
                Record.name = truncated_name
                Bio.SeqIO.write(Record, gb_fns[which_seq], 'genbank')

                Record.name = old_name

                truncated_name_i += 1

    manifest_fn = target_dir / 'manifest.yaml'

    sources = [target_name]
    if has_donor:
        sources.append(donor_name)

    extra_Records = []
    if info.get('extra_sequences') is not None:
        for extra_seq_name, extra_seq in info['extra_sequences']:
            sources.append(extra_seq_name)

            extra_Records.append(SeqRecord(extra_seq,
                                           name=extra_seq_name,
                                           annotations={'molecule_type': 'DNA'}))

    manifest = {
        'sources': sources,
        'target': target_name,
    }
    if has_donor:
        manifest['donor'] = donor_name
        manifest['donor_specific'] = 'donor_specific'
        if donor_type is not None:
            manifest['donor_type'] = donor_type

    if has_nh_donor:
        manifest['nonhomologous_donor'] = nh_donor_name

    manifest['features_to_show'] = [
        [target_name, 'forward_primer'],
        [target_name, 'reverse_primer'],
    ]

    if has_donor:
        if donor_type == 'pegRNA':
            manifest['features_to_show'].extend([
                [donor_name, 'scaffold'],
                [donor_name, 'protospacer'],
                [donor_name, 'HA_RT'],
                [donor_name, 'HA_PBS'],
                [target_name, 'HA_RT'],
                [target_name, 'HA_PBS'],
            ])
        else:
            manifest['features_to_show'].extend([
                [donor_name, 'HA_1'],
                [donor_name, 'HA_2'],
                [donor_name, 'donor_specific'],
                [donor_name, 'PCR_adapter_1'],
                [donor_name, 'PCR_adapter_2'],
                [target_name, 'HA_1'],
                [target_name, 'HA_2'],
            ])

    manifest['genome_source'] = genome

    manifest_fn.write_text(yaml.dump(manifest, default_flow_style=False))

    gb_records = list(best_candidate['gb_Records'].values()) + extra_Records
    ti = target_info.TargetInfo(base_dir, name, gb_records=gb_records)
    ti.make_references()
    ti.make_protospacer_fastas()
    ti.map_protospacers(genome)
    ti.identify_degenerate_indels()

    shutil.rmtree(protospacer_dir)
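A hedged invocation sketch; the info keys follow the docstring at the top of the function, and every value below is illustrative:

# Hypothetical input: sgRNA_sequence holds (name, sequence) pairs and
# amplicon_primers a (name, "fwd;rev") pair, matching the unpacking above.
info = {
    'name': 'example_target',
    'genome': 'hg38',
    'sgRNA_sequence': [('sg1', 'GACGTGAACAGCGAGCACCA')],
    'amplicon_primers': ('example_primers',
                         'ACCTGACTGGATCTGGAC;TGGTTCAGGGTTCTGGAG'),
    'donor_sequence': None,
    'donor_type': None,
    'nonhomologous_donor_sequence': None,
}

build_target_info('base_dir', info,
                  all_index_locations={'hg38': {'STAR': 'indexes/hg38/STAR',
                                                'fasta': 'indexes/hg38/fasta'}})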
    def evaluate_candidate(al):
        results = {
            'location':
            f'{al.reference_name} {al.reference_start:,} {sam.get_strand(al)}',
        }

        full_window_around = 5000

        full_around = region_fetcher(
            al.reference_name, al.reference_start - full_window_around,
            al.reference_end + full_window_around).upper()

        if sam.get_strand(al) == '+':
            ps_seq = protospacer_seq
            ps_strand = 1
        else:
            ps_seq = utilities.reverse_complement(protospacer_seq)
            ps_strand = -1

        ps_start = full_around.index(ps_seq)

        protospacer_locations = [(protospacer_name, ps_seq, ps_start,
                                  ps_strand)]

        for other_protospacer_name, other_protospacer_seq in other_protospacers:

            # Initial G may not match genome.
            if other_protospacer_seq.startswith('G'):
                other_protospacer_seq = other_protospacer_seq[1:]

            if other_protospacer_seq in full_around:
                ps_seq = other_protospacer_seq
                ps_strand = 1
            else:
                ps_seq = utilities.reverse_complement(other_protospacer_seq)
                if ps_seq not in full_around:
                    results[
                        'failed'] = f'protospacer {other_protospacer_seq} not present near protospacer {protospacer_seq}'
                    return results
                ps_strand = -1

            ps_start = full_around.index(ps_seq)
            protospacer_locations.append(
                (other_protospacer_name, ps_seq, ps_start, ps_strand))

        if 'effector' in info:
            effector_type = info['effector']
        else:
            if donor_type == 'pegRNA':
                effector_type = 'SpCas9H840A'
            else:
                effector_type = 'SpCas9'

        effector = target_info.effectors[effector_type]

        for ps_name, ps_seq, ps_start, ps_strand in protospacer_locations:
            PAM_pattern = effector.PAM_pattern

            if (ps_strand == 1 and effector.PAM_side
                    == 3) or (ps_strand == -1 and effector.PAM_side == 5):
                PAM_offset = len(ps_seq)
                PAM_transform = utilities.identity
            else:
                PAM_offset = -len(PAM_pattern)
                PAM_transform = utilities.reverse_complement

            PAM_start = ps_start + PAM_offset
            PAM = PAM_transform(full_around[PAM_start:PAM_start +
                                            len(PAM_pattern)])
            pattern, *matches = Bio.SeqUtils.nt_search(PAM, PAM_pattern)

            if 0 not in matches and not offtargets:
                # Note: this could incorrectly fail if there are multiple exact matches for an other_protospacer
                # in full_around.
                results[
                    'failed'] = f'bad PAM: {PAM} next to {ps_seq} (strand {ps_strand})'
                return results

        if primers[0] in full_around:
            leftmost_primer = primers[0]
            rightmost_primer = utilities.reverse_complement(primers[1])
            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            leftmost_primer_name = 'forward_primer'
            rightmost_primer_name = 'reverse_primer'

        else:
            leftmost_primer = primers[1]
            rightmost_primer = utilities.reverse_complement(primers[0])

            if leftmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[1]} not present near protospacer'
                return results

            if rightmost_primer not in full_around:
                results[
                    'failed'] = f'primer {primers[0]} not present near protospacer'
                return results

            leftmost_primer_name = 'reverse_primer'
            rightmost_primer_name = 'forward_primer'

        leftmost_start = full_around.index(leftmost_primer)
        rightmost_start = full_around.index(rightmost_primer)

        if leftmost_start >= rightmost_start:
            results['failed'] = f'primers don\'t flank protospacer'
            return results

        # Now that primers have been located, redefine the target sequence to include a fixed
        # window on either side of the primers.

        final_window_around = 500

        offset = leftmost_start - final_window_around

        final_start = leftmost_start - final_window_around
        final_end = rightmost_start + len(
            rightmost_primer) + final_window_around

        target_seq = full_around[final_start:final_end]

        leftmost_location = FeatureLocation(leftmost_start - offset,
                                            leftmost_start - offset +
                                            len(leftmost_primer),
                                            strand=1)
        rightmost_location = FeatureLocation(rightmost_start - offset,
                                             rightmost_start - offset +
                                             len(rightmost_primer),
                                             strand=-1)

        colors = {
            'HA_1': '#c7b0e3',
            'HA_RT': '#c7b0e3',
            'HA_2': '#85dae9',
            'HA_PBS': '#85dae9',
            'forward_primer': '#75C6A9',
            'reverse_primer': '#9eafd2',
            'sgRNA': '#c6c9d1',
            'donor_specific': '#b1ff67',
            'PCR_adapter_1': '#F8D3A9',
            'PCR_adapter_2': '#D59687',
            'protospacer': '#ff9ccd',
            'scaffold': '#b7e6d7',
        }

        target_features = [
            SeqFeature(
                location=leftmost_location,
                id=leftmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': leftmost_primer_name,
                    'ApEinfo_fwdcolor': colors[leftmost_primer_name],
                },
            ),
            SeqFeature(
                location=rightmost_location,
                id=rightmost_primer_name,
                type='misc_feature',
                qualifiers={
                    'label': rightmost_primer_name,
                    'ApEinfo_fwdcolor': colors[rightmost_primer_name],
                },
            ),
        ]

        if leftmost_primer_name == 'forward_primer':
            start = leftmost_start - offset
            start_location = FeatureLocation(start, start + 5, strand=1)
        else:
            start = rightmost_start - offset + len(rightmost_primer) - 5
            start_location = FeatureLocation(start, start + 5, strand=-1)

        target_features.extend([
            SeqFeature(
                location=start_location,
                id='sequencing_start',
                type='misc_feature',
                qualifiers={
                    'label': 'sequencing_start',
                },
            ),
            SeqFeature(
                location=start_location,
                id='anchor',
                type='misc_feature',
                qualifiers={
                    'label': 'anchor',
                },
            ),
        ])

        sgRNA_features = []
        for sgRNA_i, (ps_name, ps_seq, ps_start,
                      ps_strand) in enumerate(protospacer_locations):
            sgRNA_feature = SeqFeature(
                location=FeatureLocation(ps_start - offset,
                                         ps_start - offset + len(ps_seq),
                                         strand=ps_strand),
                id=f'sgRNA_{ps_name}',
                type=f'sgRNA_{effector.name}',
                qualifiers={
                    'label': f'sgRNA_{ps_name}',
                    'ApEinfo_fwdcolor': colors['sgRNA'],
                },
            )
            target_features.append(sgRNA_feature)
            sgRNA_features.append(sgRNA_feature)

        results['gb_Records'] = {}

        if has_donor:
            if not defer_HA_identification:
                # If multiple sgRNAs are given, the edited one must be listed first.
                sgRNA_feature = sgRNA_features[0]

                cut_after_offset = [
                    offset for offset in effector.cut_after_offset
                    if offset is not None
                ][0]

                if sgRNA_feature.strand == 1:
                    # sgRNA_feature.end is the first nt of the PAM
                    cut_after = sgRNA_feature.location.end + cut_after_offset
                else:
                    # sgRNA_feature.start - 1 is the first nt of the PAM
                    cut_after = sgRNA_feature.location.start - 1 - cut_after_offset - 1

                if donor_type == 'pegRNA':
                    HA_info = identify_pegRNA_homology_arms(
                        donor_seq, target_seq, cut_after, protospacer_seq,
                        colors)
                else:
                    HA_info = identify_homology_arms(donor_seq, donor_type,
                                                     target_seq, cut_after,
                                                     colors)

                if 'failed' in HA_info:
                    results['failed'] = HA_info['failed']
                    return results

                donor_Seq = Seq(HA_info['possibly_flipped_donor_seq'])
                donor_features = HA_info['donor_features']
                target_features.extend(HA_info['target_features'])

            else:
                donor_Seq = Seq(donor_seq)
                donor_features = []

            donor_Record = SeqRecord(donor_Seq,
                                     name=donor_name,
                                     features=donor_features,
                                     annotations={'molecule_type': 'DNA'})
            results['gb_Records']['donor'] = donor_Record

        target_Seq = Seq(target_seq)
        target_Record = SeqRecord(target_Seq,
                                  name=target_name,
                                  features=target_features,
                                  annotations={'molecule_type': 'DNA'})
        results['gb_Records']['target'] = target_Record

        if has_nh_donor:
            nh_donor_Seq = Seq(nh_donor_seq)
            nh_donor_Record = SeqRecord(nh_donor_Seq,
                                        name=nh_donor_name,
                                        annotations={'molecule_type': 'DNA'})
            results['gb_Records']['nh_donor'] = nh_donor_Record

        return results
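
A minimal, self-contained sketch of the record-building pattern used above (FeatureLocation plus SeqFeature qualifiers gathered into a SeqRecord whose molecule_type annotation permits GenBank output); the name, coordinates, and color below are hypothetical:

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, FeatureLocation

# Hypothetical 60 bp target with one annotated primer site.
target = SeqRecord(Seq("ATGC" * 15), name="target",
                   annotations={'molecule_type': 'DNA'})
target.features.append(SeqFeature(
    location=FeatureLocation(5, 25, strand=1),
    type='misc_feature',
    qualifiers={'label': 'forward_primer', 'ApEinfo_fwdcolor': '#75C6A9'},
))
SeqIO.write(target, 'target.gb', 'genbank')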
Beispiel #12
0
        genesh = open(workdir + "genes.fasta", "w")
        # pseudo = open(workdir + "proteins.fasta","w")
        for genebank_gz in tqdm(glob(workdir + "*.gz")):
            assembly = genebank_gz.split(workdir)[1].split(
                "_genomic.gbff.gz")[0]
            for contig in tqdm(bpio.parse(gzip.open(genebank_gz), "gb")):
                for f in contig.features:
                    if f.type == "CDS":

                        if "translation" in f.qualifiers:
                            locus_tag = f.qualifiers["locus_tag"][0]
                            desc = f.qualifiers["protein_id"][
                                0] + " " + assembly
                            seq = SeqRecord(
                                id=locus_tag,
                                seq=Seq(f.qualifiers["translation"][0]),
                                description=desc,
                                name=locus_tag)
                            bpio.write(seq, proth, "fasta")
                            seq = f.extract(contig)
                            seq.id = locus_tag
                            seq.name = locus_tag
                            seq.description = desc
                            bpio.write(seq, genesh, "fasta")

    finally:
        proth.close()
        genesh.close()

    #cd-hit -c 0.9 -i proteins.fasta -o proteins_90.fasta  -g 1 -aS 0.8 -p 1
    #cd-hit -c 0.5 -i proteins.fasta -o proteins_50.fasta  -g 1 -aS 0.8 -p 1
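
The fragment above relies on a `proth` protein handle and a try-block opened earlier in the original script. A self-contained sketch of the same extraction (paths are hypothetical):

import gzip
from Bio import SeqIO

with gzip.open("assembly_genomic.gbff.gz", "rt") as gz, \
        open("proteins.fasta", "w") as proth:
    for contig in SeqIO.parse(gz, "genbank"):
        for f in contig.features:
            if f.type == "CDS" and "translation" in f.qualifiers:
                # one FASTA record per translated CDS, keyed by locus tag
                proth.write(">%s\n%s\n" % (f.qualifiers["locus_tag"][0],
                                           f.qualifiers["translation"][0]))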
Beispiel #13
0
if args.gff:
    with open(args.gbk, 'r') as fh_gbk, open(basename + '.gff', 'w') as fh_gff:
        GFF.write(SeqIO.parse(fh_gbk, 'genbank'), fh_gff)

with open(args.gbk, 'r') as fh_gbk:
    if args.contigs:
        fh_fna = open(basename + '.fna', 'w')
    if args.aminoacids:
        fh_faa = open(basename + '.faa', 'w')
    if args.nucleotides:
        fh_fcn = open(basename + '.fcn', 'w')

    for seq_record_gbk in SeqIO.parse(fh_gbk, 'genbank'):
        if args.contigs:
            seq_record_fna = SeqRecord(seq_record_gbk.seq)
            # seq_record_fna.seq.alphabet = IUPAC.extended_dna
            seq_record_fna.id = seq_record_gbk.id
            seq_record_fna.description = seq_record_gbk.description
            SeqIO.write(seq_record_fna, fh_fna, 'fasta')

        if args.aminoacids or args.nucleotides:
            for feature in seq_record_gbk.features:
                if feature.type == 'CDS':
                    if('pseudo' in feature.qualifiers) or ('pseudogene' in feature.qualifiers)\
                            or ('translation' not in feature.qualifiers):
                        continue
                    seq_record_faa = SeqRecord(Seq(feature.qualifiers['translation'][0], IUPAC.extended_protein))
                    if args.aminoacids:
                        seq_record_faa.id = feature.qualifiers['protein_id'][0]
                        if 'product' in feature.qualifiers:
                            # the source is truncated here; presumably the
                            # product string becomes the record description
                            seq_record_faa.description = feature.qualifiers['product'][0]
Beispiel #14
0
def gapMismatches(alignment):
    """
    Given an alignment (an MSA with just a reference and a query), replace any
    mismatches with gaps in each sequence.
    
    Return the processed alignment.
    """

    # Make lists of characters that we will join into the new reference and
    # query sequences.
    gappedReference = []
    gappedQuery = []

    # How many mismatches did we gap?
    mismatches_gapped = 0
    # How many aligned bases did we check?
    bases_checked = 0

    # Where are we in the alignment?
    for column in xrange(len(alignment[0])):
        # Just go through all the columns in the alignment's reference.

        # Pull out the reference and query characters at this position.
        refChar = alignment[0, column]
        queryChar = alignment[1, column]

        bases_checked += 1

        if "-" in [refChar, queryChar] or refChar == queryChar:
            # We have a gap or a match. Pass it through to both sequences.
            gappedReference.append(refChar)
            gappedQuery.append(queryChar)
        else:
            # We have a mismatch. Gap one and then the other.
            gappedReference.append("-")
            gappedQuery.append(queryChar)

            gappedReference.append(refChar)
            gappedQuery.append("-")

            mismatches_gapped += 1

    # Now we need to manufacture the MultipleSeqAlignment to return from these
    # lists of characters.

    # What names do the sequences in this alignment have?
    seqNames = [record.id for record in alignment]

    # Make a SeqRecord for each list of properly gapped-out characters, with the
    # appropriate name.
    seqRecords = [
        SeqRecord(Seq("".join(alignedList)), name)
        for alignedList, name in zip([gappedReference, gappedQuery], seqNames)
    ]

    for i in xrange(len(seqRecords)):
        # Set annotations on all the new records
        seqRecords[i].annotations = alignment[i].annotations

    if float(mismatches_gapped) / bases_checked > 0.5 and bases_checked > 100:
        # If this gets too high, it means we have a bad offset somewhere. Yell
        # at the user.
        logging.warning("{}/{} bases gapped due to mismatch".format(
            mismatches_gapped, bases_checked))

    # Make the records into a proper MSA and return it.
    return Align.MultipleSeqAlignment(seqRecords)
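
A usage sketch for gapMismatches, assuming the definition above is in scope (under Python 3, replace xrange with range):

from Bio import Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

msa = Align.MultipleSeqAlignment([SeqRecord(Seq("ACGTACGT"), id="ref"),
                                  SeqRecord(Seq("ACCTACGT"), id="qry")])
# The G/C mismatch at column 2 becomes two columns, each gapped in one row,
# so the result has 9 columns instead of 8.
print(gapMismatches(msa))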
Beispiel #15
0
def convert_genbank(genbank_tuple):
    genbank_path, db_directory, error_fname, do_protein = genbank_tuple
    record_list = []
    seq_record = next(SeqIO.parse(open(genbank_path), "genbank"))
    print(seq_record.annotations)
    accession = seq_record.id
    organism = seq_record.annotations['organism'].replace(' ', '_')
    err_log = []
    gc_list = []  # no need for this right now, but leaving in
    # loop over the genbank file
    for fnum, feature in enumerate(seq_record.features):
        err_flag = False
        error_in_field = False
        if feature.type == 'CDS':
            #print dir(feature.location)
            try:
                start = int(feature.location.start)
                stop = int(feature.location.end)
            except:
                # location could not be parsed; skip this feature entirely
                error_in_field = True
                continue

            strand = feature.strand
            dna_seq = seq_record.seq[start:stop]
            #print "dna_seq", type(dna_seq), dna_seq
            gc = GC(dna_seq)
            gc_list.append(gc)
            gc = "%3.2f" % gc

            try:
                locus = feature.qualifiers['locus_tag'][0]
            except:
                try:
                    locus = feature.qualifiers['gene'][0]
                except:
                    locus = 'error'
                    print(("Error in the organism %s with NC # %s" %
                           (organism, accession)))
                    err_flag = True
                    err_log.append([organism, accession])

            if do_protein:
                #seq = seq.translate()
                #print type(seq)
                #print feature.qualifiers.keys()
                #seq = dir(feature)
                try:
                    if 'translation' in list(feature.qualifiers.keys()):
                        # prot_seq = Seq(''.join(feature.qualifiers['translation']), IUPAC.protein)
                        prot_seq = Seq(''.join(
                            feature.qualifiers['translation']))
                        #print "prot_seq", type(prot_seq), prot_seq

                        if 'gene' in feature.qualifiers:
                            gene = feature.qualifiers['gene'][0]
                            #record_list.append(SeqRecord(prot_seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''), description = ''))
                            seq_rec_to_store = SeqRecord(prot_seq,
                                                         id='|'.join([
                                                             accession,
                                                             organism, locus,
                                                             gene,
                                                             str(start),
                                                             str(stop),
                                                             str(strand), gc
                                                         ]).replace(' ', ''),
                                                         description='')
                        else:
                            #record_list.append(SeqRecord(prot_seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''),description = ''))
                            seq_rec_to_store = SeqRecord(prot_seq,
                                                         id='|'.join([
                                                             accession,
                                                             organism, locus,
                                                             'unknown',
                                                             str(start),
                                                             str(stop),
                                                             str(strand), gc
                                                         ]).replace(' ', ''),
                                                         description='')
                            #print prot_seq
                    else:
                        print("This was not a protein sequence")
                        error_in_field = True
                        #print "This was not a protein sequence"
                except:
                    print(
                        "Error in function convert_genbank(genbank_tuple) from the format_db.py script, unhandled error in the genbank parse."
                    )
                    error_in_field = True
            else:
                # put something in here that will deal with RNA later, if we plan to go that route.
                pass
            if not error_in_field:
                record_list.append(seq_rec_to_store)
            else:
                print("a record was omitted")
            '''        
            #print len(seq)
            if len(seq) < 2:
                #pass
                print "len seq", len(seq)
            
            elif do_protein:
                if 'gene' in feature.qualifiers:
                    gene = feature.qualifiers['gene'][0]
                    record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''),
                       description = ''))
                else:
                    record_list.append( 
                      SeqRecord(seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''),
                       description = ''))
            
            
            else:
                if 'gene' in feature.qualifiers:
                    gene = feature.qualifiers['gene'][0]
                    record_list.append(SeqRecord(seq, id = '|'.join([accession, organism, locus, gene, str(start), str(stop), str(strand), gc]).replace(' ', ''),
                       description = ''))
                else:
                    record_list.append( 
                      SeqRecord(seq, id = '|'.join([accession, organism, locus, 'unknown', str(start), str(stop), str(strand), gc]).replace(' ', ''),
                       description = ''))
                       '''
    #if os.path.isfile(gc_outfile):
    #    os.remove(gc_outfile)
    #GCAnalysis(accession, organism, gc_list, seq_record.seq, gc_outfile)
    handle = open(error_fname, 'a')
    for i in err_log:
        handle.write('\t'.join(i) + '\n')
    handle.close()
    if not err_flag:
        outpath = db_directory + os.path.splitext(
            os.path.basename(genbank_path))[0] + '.ffc'
        #print outpath
        out_handle = open(outpath, "w")
        SeqIO.write(record_list, out_handle, "fasta")
        out_handle.close()

    if do_protein:
        cmd = "makeblastdb -in %s -dbtype prot" % (outpath)
        #print "got here"
    else:
        cmd = "makeblastdb -in %s -dbtype nucl" % (outpath)
    os.system(cmd)
    #print "Passed main loop"

    return outpath, err_flag
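
A usage sketch, assuming the definition above; convert_genbank takes a single tuple, presumably so it can be mapped over a worker pool (paths are hypothetical):

work = ("genomes/NC_000913.gbk", "blast_db/", "errors.log", True)
outpath, err_flag = convert_genbank(work)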
Beispiel #16
0
 
            for number in ec_number:
                print >> ec_out, number

            qualifiers['EC_number'] = ec_number

            start = int(temp.loc[hit, 'start'])
            end = int(temp.loc[hit, 'end'])

            location = SeqFeature.FeatureLocation(start, end)

            new_feature = SeqFeature.SeqFeature(type = 'CDS', qualifiers = qualifiers)
            new_feature.location = location
            features.append(new_feature)

        new_record = SeqRecord(Seq('nnnn', alphabet = IUPAC.ambiguous_dna), id = genome, name = genome, features = features)
        SeqIO.write(new_record, open(pathos_output_dir + name + '.' + genome + '.gbk', 'w'), 'genbank')

        print >> all_genetic_elements, 'ID' + '\t' + genome
        print >> all_genetic_elements, 'NAME' + '\t' + genome
        print >> all_genetic_elements, 'TYPE' + '\t' + ':CHRSM'
        print >> all_genetic_elements, 'CIRCULAR?' + '\t' + 'Y'
        print >> all_genetic_elements, 'ANNOT-FILE' + '\t' + name + '.' + genome + '.gbk'
        print >> all_genetic_elements, '//'

## Now create the organism-params.dat file.

with open(pathos_output_dir + 'organism-params.dat', 'w') as all_organism_params:
    print >> all_organism_params, 'ID' + '\t' + name
    print >> all_organism_params, 'Storage' + '\t' + 'File'
    print >> all_organism_params, 'Name' + '\t' + name
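
# Note: the `print >> handle, ...` statements above are Python 2 syntax; the
# Python 3 equivalent is e.g. print('ID' + '\t' + name, file=all_organism_params).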
def create_euk_files(d):
    
    ## First create a df mapping protein id to SwissProt accession number.
    
    columns = ['prot_id', 'swissprot', 'description']
    spt = pd.read_csv(ref_dir_domain + 'refseq/' + d + '/swissprot.gff3', index_col = 0, comment = '#', names = columns, usecols = [0,8,10], sep = ';|\t', engine = 'python')
    spt['swissprot'] = spt['swissprot'].str.replace('Name=Swiss-Prot:', '')
    spt['description'] = spt['description'].str.replace('Description=Swiss-Prot:', '')
    
    ## Create empty list to hold gene features pulled from cds.fa.
    
    features = []
    
    ## Artificial start, stops are needed.
    
    combined_length = 1
    
    print('generating genbank format files for', d + '...')
    
    ## Some directory names differ from the accession number.  Rename these
    ## directories to match the accession number.
    
    for f in os.listdir(ref_dir_domain + 'refseq/' + d):
        if f.endswith('.pep.fa'):
            a = f.split('.pep.fa')[0]
            
    if a != d:
        os.rename(ref_dir_domain + 'refseq/' + d, ref_dir_domain + 'refseq/' + a)
        print('directory', d, 'is now', a)
    
    for record in SeqIO.parse(ref_dir_domain + 'refseq/' + a + '/' + a + '.pep.fa', 'fasta'):
            
        ## The swissprot annotations are indexed by MMETSP record locator, not
        ## by the actual record.id.
        
        sprot_name = str(record.description).split('NCGR_PEP_ID=')[1]
        sprot_name = sprot_name.split(' /')[0]

        try:
            temp_spt = spt.loc[sprot_name, 'swissprot']
        except KeyError:
            continue

        temp_sprot = sprot_df[sprot_df.index.isin(list(temp_spt))]

        ecs = list(set(temp_sprot.ec))
        descriptions = list(set(temp_sprot.name))

        ## Embed all information necessary to create the Genbank file as qualifiers, then
        ## append to this list of records for that genome.

        qualifiers = {'protein_id':sprot_name, 'locus_tag':str(record.id), 'EC_number':ecs, 'product':descriptions, 'translation':str(record.seq)}
        new_feature = SeqFeature.SeqFeature(type = 'CDS', qualifiers = qualifiers)
        new_feature.location = SeqFeature.FeatureLocation(combined_length, combined_length + len(str(record.seq)))
        features.append(new_feature)
        
        combined_length = combined_length + len(str(record.seq))
        
    ## Write the records in Genbank format.  Even though you will ultimately
    ## want to use the gbk extension, to match the (silly) Genbank convention
    ## use gbff.
        
    new_record = SeqRecord(Seq('nnnn'), id = a, name = a, features = features)
    new_record.annotations['molecule_type'] = 'DNA'
    SeqIO.write(new_record, open(ref_dir_domain + 'refseq/' + a + '/' + a + '.gbff', 'w'), 'genbank')
Beispiel #18
0
        return hairpins_string

    #end def

    def _format_hairpins(self, hairpins, seq):
        hairpins_string = ''
        if not hairpins: return '0'
        #print header
        hairpins_string += self._format_hairpins_header(hairpins, seq)
        #print hairpins
        hairpins_string += self._format_hairpin(hairpins[0], seq)
        return hairpins_string

    #end def


#end class
if __name__ == "__main__":
    structs = SecStructures(SeqRecord(Seq('GATCTGATGCATGAGATCGCATCAGATC')))
    print 'seq1 self dimer', str(structs).split()[0]
    print 'seq1 hairpin', str(structs).split()[1]

#	for ET_record in SeqIO.parse('example.txt', "fasta"):
#		structs = SecStructures(SeqRecord(Seq('GATCTGATGCATGAGATCGCATCAGATC')))
#		print 'seq1 self dimer',str(structs).split()[0]
#		print 'seq1 hairpin',str(structs).split()[1]
#	for PT_record in SeqIO.parse('example2.txt', "fasta"):
#		structs = SecStructures(PT_record)
#	structs = SecStructures(ET_record,PT_record)
#	print 'set3 cross dimer',str(structs)
Beispiel #19
0
    def _record_formatter(self, trim, name):
        """return a string formatted as a biopython sequence record"""
        return SeqRecord(Seq(trim, Gapped(IUPAC.ambiguous_dna, "-?")),
                         id=name,
                         name=name,
                         description=name)
    def analyze(self, blueThresh, redThresh, bluePeakThresh, redPeakThresh):
        self.blueThresh = blueThresh
        self.redThresh = redThresh
        self.bluePeakThresh = bluePeakThresh
        self.redPeakThresh = redPeakThresh
        if self.seqplotlist != [[]]:
            for x in self.seqplotlist:
                self.p1.getRoiPlot().removeItem(x)
                self.p1.getRoiPlot().autoRange()

        if self.firingplotlist != []:
            for x in self.firingplotlist:
                self.p1.getRoiPlot().removeItem(x)
                self.p1.getRoiPlot().autoRange()

        self.seqplotlist = [[]]
        self.firingplotlist = []

        dntpnames = ["dCTP", "dATP", "dGTP", "dTTP"]
        cdf = pd.DataFrame({'a': [], 't': [], 'g': [], 'c': []})
        dntps = [[], [], [], []]
        zpro = [[], [], [], []]
        n = len(dntps)

        dntpdirec = r'C:\Users\Noah PC\PycharmProjects\ZMW analysis\ZMW\02082017'
        fn = 'dntpss.h5'
        hfile = h5py.File(os.path.join(dntpdirec, fn))
        for i, x in enumerate(dntpnames):
            fn = 'dntps.h5'
            zpro[i] = np.array(hfile[x]).astype(float)
            zpro[i] -= zpro[i].mean()
            zpro[i] /= zpro[i].std()

        composite = zpro[0] + zpro[1] + zpro[2] + zpro[3]

        ly, lx = composite.shape

        p0 = [composite.mean(1).max(), (ly / 2) - 4, (ly / 2) + 4, 1.]
        coeff1, var_matrix1 = curve_fit(dubgauss,
                                        np.linspace(0, ly - 1, ly),
                                        composite.mean(1),
                                        p0=p0)
        p0 = [self.zpro.mean(1).max(), (ly / 2) - 4, (ly / 2) + 4, 1.]
        coeff2, var_matrix2 = curve_fit(dubgauss,
                                        np.linspace(0, ly - 1, ly),
                                        self.zpro.mean(1),
                                        p0=p0)
        shifty = np.mean((coeff2[1], coeff2[2])) - np.mean(
            (coeff1[1], coeff1[2]))

        p0 = [composite.mean(1).max(), lx / 2, 1.]
        coeff1, var_matrix1 = curve_fit(gauss,
                                        np.linspace(0, lx - 1, lx),
                                        composite.mean(0),
                                        p0=p0)
        p0 = [self.zpro.mean(1).max(), lx / 2, 1.]
        coeff2, var_matrix2 = curve_fit(gauss,
                                        np.linspace(0, lx - 1, lx),
                                        self.zpro.mean(0),
                                        p0=p0)
        shiftx = coeff2[1] - coeff1[1]

        self.czpro = [[], [], [], []]
        for i, x in enumerate(dntps):
            self.czpro[i] = zpro[i]
            self.czpro[i] = ird.transform_img(self.czpro[i],
                                              tvec=[shifty, shiftx])

        seqdf = self.peakdetection(blueThresh, redThresh, bluePeakThresh,
                                   redPeakThresh)
        predictedseq = seqdf.base.str.cat()

        fn = self.datafilename[:-3] + '_seq.fasta'
        predictedseq = Seq.Seq(predictedseq, generic_dna)
        predictedseq = SeqRecord(predictedseq, id=os.path.split(fn)[1])
        SeqIO.write(predictedseq, fn, "fasta")
Beispiel #21
0
names = set()


def fasta_record(rec, cnt):
    name = '{}_fragment_{}.fasta'.format(file_input.split('.')[0], cnt)
    names.add(name)
    print(name)
    SeqIO.write(rec, name, "fasta")


seq_count = 0
records = []
cnt = 0

for record in SeqIO.parse(file_input, 'fasta'):
    cnt += 1
    seq_count += 1
    # append before checking the limit so no record is dropped
    records.append(
        SeqRecord(seq=record.seq,
                  id=record.id,
                  description=record.description))

    if seq_count >= SEQ_LIMIT:
        fasta_record(records, cnt)
        records = []
        seq_count = 0

if records:
    fasta_record(records, cnt)
print('The number of records: ', cnt)
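
A compact alternative with the same goal (split a FASTA into chunks of SEQ_LIMIT records); this sketch uses itertools.islice for batching and numbers the output files by chunk rather than by record count:

from itertools import islice
from Bio import SeqIO

def write_chunks(path, limit):
    reader = SeqIO.parse(path, "fasta")
    for n, first in enumerate(reader, start=1):
        # pull the record that started the chunk plus up to limit - 1 more
        chunk = [first] + list(islice(reader, limit - 1))
        SeqIO.write(chunk, "{}_fragment_{}.fasta".format(path.split(".")[0], n),
                    "fasta")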
Beispiel #22
0
#!/usr/bin/env python
import sys
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

new_sequences = []
gbinput = str(sys.argv[1])

if len(sys.argv) < 3:
    faout = gbinput.split(".")[0] + ".fasta"
else:
    faout = str(sys.argv[2])

#faout = gbinput.split(".")[0] + ".fasta"

for i in SeqIO.parse(gbinput, "gb"):
    for f in i.features:
        if (f.type == "CDS" and f.qualifiers['gene'] == ['pol']):
            new_sequences.append(
                SeqRecord(Seq(*(f.qualifiers['translation'])),
                          id="_".join(
                              [i.description, *(f.qualifiers['protein_id'])]),
                          description=""))
#            print(">{0}_{1}\n{2}".format(
#                i.description, *(f.qualifiers['protein_id']), *(f.qualifiers['translation'])), #end="\n")
if len(new_sequences) == 0:
    print("No sequences were converted")
else:
    SeqIO.write(new_sequences, faout, format="fasta")
Beispiel #23
0
        #Now try setting it afterwards to a bad value...
        rec = SeqRecord(Seq("ACGT", generic_dna),
                        id="Test",
                        name="Test",
                        description="Test")
        try:
            rec.letter_annotations = {"test": [1, 2, 3]}
            self.assertTrue(False,
                            "Changing to bad letter_annotations should fail!")
        except (TypeError, ValueError), e:
            pass
        #Now try setting it at creation time to a bad value...
        try:
            rec = SeqRecord(Seq("ACGT", generic_dna),
                            id="Test",
                            name="Test",
                            description="Test",
                            letter_annotations={"test": [1, 2, 3]})
            self.assertTrue(False,
                            "Wrong length letter_annotations should fail!")
        except (TypeError, ValueError), e:
            pass


class SeqRecordMethods(unittest.TestCase):
    """Test SeqRecord methods."""
    def setUp(self):
        f0 = SeqFeature(FeatureLocation(0, 26),
                        type="source",
                        qualifiers={"mol_type": ["fake protein"]})
        f1 = SeqFeature(FeatureLocation(0, ExactPosition(10)))
Beispiel #24
0
def mkdir(directory):
    try:
        os.stat(directory)
    except:
        os.mkdir(directory)


fi = sys.argv[1]  # fasta file with sequences
diri = sys.argv[2]  # input directory
diro = sys.argv[3]  # output directory

mkdir(diro + "/plasmids/")
mkdir(diro + "/chromosomes/")

for ix, record in enumerate(
        SeqIO.parse(diri + "/" + fi + "/" + fi + ".fna", "fasta")):
    gid = record.description
    seq = record.seq

    if "plasmid" in gid:
        gtype = "plasmids"
    else:
        gtype = "chromosomes"

    nrecord = SeqRecord(record.seq,
                        id=record.id,
                        name='',
                        description=record.description)
    SeqIO.write([nrecord],
                open(diro + "/" + gtype + '/' + fi + "_" + str(ix) + '.fasta',
                     'w'), 'fasta')
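
# Usage sketch (hypothetical paths): the script expects a genome name plus
# input and output directories, and reads <diri>/<name>/<name>.fna, e.g.
#     python split_replicons.py GCF_000005845 ./genomes ./out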
Beispiel #25
0
def _get_codon_rec(pro, nucl, span_mode, alphabet, gap_char="-",
                   codon_table=default_codon_table, complete_protein=False,
                   max_score=10):
    """Generate codon alignment based on regular re match (PRIVATE)

    span_mode is a tuple returned by _check_corr. The first element
    is the span of a re search, and the second element is the mode
    for the match.

    mode
        - 0: direct match
        - 1: mismatch (no indels)
        - 2: frameshift

    """
    import re
    from Bio.Seq import Seq

    nucl_seq = nucl.seq.ungap(gap_char)
    codon_seq = ""
    span = span_mode[0]
    mode = span_mode[1]
    aa2re = _get_aa_regex(codon_table)
    if mode in (0, 1):
        if len(pro.seq.ungap(gap_char)) * 3 != (span[1] - span[0]):
            raise ValueError("Protein Record {0} and Nucleotide Record {1} "
                             "do not match!".format((pro.id, nucl.id)))
        aa_num = 0
        for aa in pro.seq:
            if aa == "-":
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq._data[span[0]:span[0] + 3]
                if not re.search(_codons2re[codon_table.start_codons],
                                 this_codon.upper()):
                    max_score -= 1
                    warnings.warn("start codon of {0} ({1} {2}) does not "
                                  "correspond to {3} "
                                  "({4})".format(pro.id, aa, aa_num,
                                                 nucl.id, this_codon),
                                  BiopythonWarning)
                if max_score == 0:
                    raise RuntimeError("max_score reached for {0}! Please "
                                       "raise up the tolerance to get an "
                                       "alignment in anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
            else:
                this_codon = nucl_seq._data[(span[0] + 3 * aa_num):
                                            (span[0] + 3 * (aa_num + 1))]
                if not str(Seq(this_codon.upper()).translate(table=codon_table)) == aa:
                    max_score -= 1
                    warnings.warn("%s(%s %d) does not correspond to %s(%s)"
                                  % (pro.id, aa, aa_num, nucl.id, this_codon),
                                  BiopythonWarning)
                if max_score == 0:
                    raise RuntimeError("max_score reached for {0}! Please "
                                       "raise up the tolerance to get an "
                                       "alignment in anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(CodonSeq(codon_seq, alphabet=alphabet), id=nucl.id)
    elif mode == 2:
        from collections import deque
        shift_pos = deque([])
        shift_start = []
        match = span_mode[2]
        m_groupdict = list(match.groupdict().keys())
        # backward frameshift
        for i in m_groupdict:
            shift_pos.append(match.span(i))
            shift_start.append(match.start(i))
        rf_table = []
        i = match.start()
        while True:
            rf_table.append(i)
            i += 3
            if i in shift_start and \
                    m_groupdict[shift_start.index(i)].isupper():
                shift_index = shift_start.index(i)
                shift_val = 6 - (shift_pos[shift_index][1] -
                            shift_pos[shift_index][0])
                rf_table.append(i)
                rf_table.append(i + 3 - shift_val)
                i = shift_pos[shift_index][1]
            elif i in shift_start and \
                    m_groupdict[shift_start.index(i)].islower():
                i = shift_pos[shift_start.index(i)][1]
            if i >= match.end():
                break
        aa_num = 0
        for aa in pro.seq:
            if aa == "-":
                codon_seq += "---"
            elif complete_protein and aa_num == 0:
                this_codon = nucl_seq._data[rf_table[0]:rf_table[0] + 3]
                if not re.search(_codons2re[codon_table.start_codons],
                                 this_codon.upper()):
                    max_score -= 1
                    warnings.warn("start codon of {0}({1} {2}) does not "
                                  "correspond to {3}({4})".format(
                                      pro.id, aa, aa_num, nucl.id, this_codon),
                                  BiopythonWarning)
                codon_seq += this_codon
                aa_num += 1
            else:
                if aa_num < len(pro.seq.ungap('-')) - 1 and \
                        rf_table[aa_num + 1] - rf_table[aa_num] - 3 < 0:
                    max_score -= 1
                    start = rf_table[aa_num]
                    end = start + (3 - shift_val)
                    ngap = shift_val
                    this_codon = nucl_seq._data[start:end] + '-' * ngap
                elif rf_table[aa_num] - rf_table[aa_num - 1] - 3 > 0:
                    max_score -= 1
                    start = rf_table[aa_num - 1] + 3
                    end = rf_table[aa_num]
                    ngap = 3 - (rf_table[aa_num] - rf_table[aa_num - 1] - 3)
                    this_codon = nucl_seq._data[start:end] + '-' * ngap + \
                            nucl_seq._data[rf_table[aa_num]:rf_table[aa_num] + 3]
                else:
                    start = rf_table[aa_num]
                    end = start + 3
                    this_codon = nucl_seq._data[start:end]
                    if not str(Seq(this_codon.upper()).translate(table=codon_table)) == aa:
                        max_score -= 1
                        warnings.warn("Codon of {0}({1} {2}) does not "
                                      "correspond to {3}({4})".format(
                                          pro.id, aa, aa_num, nucl.id,
                                          this_codon),
                                      BiopythonWarning)
                if max_score == 0:
                    raise RuntimeError("max_score reached for {0}! Please "
                                       "raise up the tolerance to get an "
                                       "alignment in anyway".format(nucl.id))
                codon_seq += this_codon
                aa_num += 1
        return SeqRecord(CodonSeq(codon_seq, alphabet=alphabet,
                         rf_table=rf_table), id=nucl.id)
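
# Note: `nucl_seq._data` is a private attribute of Seq; in current Biopython
# the public equivalent of these codon slices is str(nucl_seq)[start:end], and
# CodonSeq (Bio.codonalign.codonseq) no longer accepts an alphabet argument.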
Beispiel #26
0
def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False):
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqFeature import SeqFeature, FeatureLocation

    blast_records = NCBIXML.parse(blastxml)
    for idx_record, record in enumerate(blast_records):
        # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
        match_type = {  # Currently we can only handle BLASTN, BLASTP
            'BLASTN': 'nucleotide_match',
            'BLASTP': 'protein_match',
        }.get(record.application, 'match')

        recid = record.query
        if ' ' in recid:
            recid = recid[0:recid.index(' ')]

        rec = SeqRecord(Seq("ACTG"), id=recid)
        for idx_hit, hit in enumerate(record.alignments):
            for idx_hsp, hsp in enumerate(hit.hsps):
                qualifiers = {
                    "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
                    "source": "blast",
                    "score": hsp.expect,
                    "accession": hit.accession,
                    "hit_id": hit.hit_id,
                    "length": hit.length,
                    "hit_titles": hit.title.split(' >'),
                }
                if include_seq:
                    qualifiers.update({
                        'blast_qseq': hsp.query,
                        'blast_sseq': hsp.sbjct,
                        'blast_mseq': hsp.match,
                    })

                for prop in ('score', 'bits', 'identities', 'positives',
                             'gaps', 'align_length', 'strand', 'frame',
                             'query_start', 'query_end', 'sbjct_start',
                             'sbjct_end'):
                    qualifiers['blast_' + prop] = getattr(hsp, prop, None)

                desc = hit.title.split(' >')[0]
                qualifiers['description'] = desc[desc.index(' '):]

                # This required a fair bit of sketching out to figure out
                # the first time.
                #
                # the match_start location must account for queries and
                # subjects that start at locations other than 1
                parent_match_start = hsp.query_start - hsp.sbjct_start
                # The end is the start + hit.length because the match itself
                # may be longer than the parent feature, so we use the supplied
                # subject/hit length to calculate the real ending of the target
                # protein.
                parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')

                # If we trim the left end, we need to trim without losing information.
                used_parent_match_start = parent_match_start
                if trim:
                    if parent_match_start < 1:
                        used_parent_match_start = 0

                if trim or trim_end:
                    if parent_match_end > hsp.query_end:
                        parent_match_end = hsp.query_end + 1

                # The ``match`` feature will hold one or more ``match_part``s
                top_feature = SeqFeature(
                    FeatureLocation(used_parent_match_start, parent_match_end),
                    type=match_type, strand=0,
                    qualifiers=qualifiers
                )

                # Unlike the parent feature, ``match_part``s have sources.
                part_qualifiers = {
                    "source": "blast",
                }
                top_feature.sub_features = []
                for idx_part, (start, end, cigar) in \
                        enumerate(generate_parts(hsp.query, hsp.match,
                                                 hsp.sbjct,
                                                 ignore_under=min_gap)):
                    part_qualifiers['Gap'] = cigar
                    part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)

                    # Otherwise, we have to account for the subject start's location
                    match_part_start = parent_match_start + hsp.sbjct_start + start - 1

                    # We used to use hsp.align_length here, but that includes
                    # gaps in the parent sequence
                    #
                    # Furthermore align_length will give calculation errors in weird places
                    # So we just use (end-start) for simplicity
                    match_part_end = match_part_start + (end - start)

                    top_feature.sub_features.append(
                        SeqFeature(
                            FeatureLocation(match_part_start, match_part_end),
                            type="match_part", strand=0,
                            qualifiers=copy.deepcopy(part_qualifiers))
                    )

                rec.features.append(top_feature)
        rec.annotations = {}
        yield rec
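
A usage sketch, assuming the definition above, a BLAST XML report produced with -outfmt 5, and the bcbio-gff package (BCBio.GFF) for serialization; note that SeqFeature.sub_features, which this generator populates, was removed from Biopython in release 1.68, so it targets older Biopython versions:

from BCBio import GFF

with open("blast_results.xml") as xml_in, \
        open("blast_results.gff3", "w") as gff_out:
    GFF.write(blastxml2gff3(xml_in), gff_out)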
Beispiel #27
0

if __name__ == '__main__':
   #define file name without extensions. This is usually the virus family name
   file_path = os.path.splitext(os.path.basename(sys.argv[1]))[0]
   handle = open(sys.argv[1], 'r')
   for record in SeqIO.parse(handle, "fasta"):
     #only 1% of Ns is allowed
     if (float(str(record.seq).count('N'))/float(len(str(record.seq)))*100) <= 1:
       #only sequences with more than 2000 bp will be chopped 
       if len(str(record.seq)) > 2000:
          #length (about 1000 bp) of each block is defined by the following formula: int(round(len(str(record.seq))/round(len(str(record.seq))/1000)))
          for pos, block in enumerate(blocks(str(record.seq), int(round(len(str(record.seq))/round(len(str(record.seq))/1000))))):    
              #save only blocks with >=800bp length
              if len(block) >= 800:
                 block_record = SeqRecord(Seq(block, record.seq.alphabet),id=record.id, name=record.name, description=record.description)
                 outfile = "%s.%s-%d_fasta" % (file_path,record.id,pos)
                 SeqIO.write(block_record, open(outfile, 'w'), "fasta")
          #reverse-complement and do the same as above
          for pos, block in enumerate(blocks(str(record.seq.reverse_complement()), int(round(len(str(record.seq.reverse_complement()))/round(len(str(record.seq.reverse_complement()))/1000))))):    
              if len(block) >= 800:
                 block_record = SeqRecord(Seq(block, record.seq.alphabet),id=record.id, name=record.name, description=record.description)
                 outfile = "%s.%s-%drev_fasta" % (file_path,record.id,pos)
                 SeqIO.write(block_record, open(outfile, 'w'), "fasta")
       else:
          #if length is less than 2000bp then just save both strands
          outfile = "%s.%s-1_fasta" % (file_path,record.id)
          SeqIO.write(record, open(outfile, 'w'), "fasta")
          outfile = "%s.%s-1rev_fasta" % (file_path,record.id)
          reverse_record = SeqRecord(Seq(str(record.seq.reverse_complement()), record.seq.alphabet),id=record.id, name=record.name, description=record.description)
          SeqIO.write(reverse_record, open(outfile, 'w'), "fasta")
Beispiel #28
0
    min_frag_len = max_frag_len - 1
    # Get the number of sets that will have the maximum length sequences to
    # balance the distribution per set
    big_sets = max_len % num_sets
    small_sets = num_sets - big_sets
    # Minimum string length for the given number of sets (with zero-filling)
    num_zeros = len(str(num_sets))
    # Generate all the set ids and their corresponding starting site
    first_range = big_sets * max_frag_len
    frag_list = [('cset{}'.format(str(i).zfill(num_zeros)), value)
                 for i, value in enumerate(range(0, first_range,
                                                 max_frag_len), 1)]
    frag_list += [('cset{}'.format(str(i).zfill(num_zeros)), value)
                  for i, value in enumerate(range(first_range, max_len,
                                                  min_frag_len), big_sets + 1)]
    set_dict = {}
    for record in iter(record_list) :
        for set_id, start in frag_list :
            if ( len(record) < start ) :
                # The current sequence can't be divided into more sets
                break
            else : # len(record) >= start
                end = start + max_frag_len
                frag_record = SeqRecord(record.seq[start:end], id=record.id,
                                        name=record.name, description=set_id)
                set_dict.setdefault(set_id, []).append(frag_record)
    return ( set_dict )


#-------------------------------------------------------------------------------
Beispiel #29
0
    continue
# clustering
fieldrange = [int(bedfield[1]), int(bedfield[2])]
# parse all exons
exonlen = [int(x) for x in bedfield[10][:-1].split(',')]
exonstart = [int(x) + fieldrange[0] for x in bedfield[11][:-1].split(',')]
if not bedfield[0] in refkeys:
    print('Warning: ' + bedfield[0] + ' not in the reference. Ignore...',
          file=sys.stderr)
    continue
if bedfield[0] != prevchr:
    print('Switching to %s ...' % bedfield[0], file=sys.stderr)
    prevchr = bedfield[0]
    previndex = seqref[bedfield[0]]
# extract sequences
thisseq = SeqRecord('')
for i in range(len(exonlen)):
    thisseq += previndex[exonstart[i]:(exonstart[i] + exonlen[i])]
if forcelength:
    if sum(exonlen) < readlength:
        thisseq += filledseq * (readlength - sum(exonlen))
thisseq.id = bedfield[3]
thisseq.description = ''
# mutation
nmut = numpy.random.poisson(errrate)
if nmut > 0:
    newseq = thisseq.seq
    for n in range(nmut):
        if len(posweight) == 0:
            # uniform distrib
            modifyposition = random.choice(range(len(newseq)))
Beispiel #30
0
def smart_adjoin(msa1, msa2, sequence_source):
    """
    Given two Multiple Sequence Alignments (MSAs) on the same source sequences,
    with correct annotations, concatenate them together, with the intervening
    sequences unaligned.
    
    Either MSA may be None, in which case the other is returned.
    
    Requires a function that, when passed a sequence ID, returns the SeqRecord
    for the full sequence.
    
    Requires that there be a valid way to attach the two sequences together
    (i.e. the same sequence doesn't run in different directions in the two
    blocks).
    
    Raises a RuntimeError if the two MSAs cannot be adjoined.
    
    """

    if msa1 is None:
        # Nothing plus something equals that thing.
        return msa2

    if msa2 is None:
        # Nothing plus something equals that thing.
        return msa1

    logging.debug("Adjoining {}bp and {}bp reference alignments".format(
        msa1[0].annotations["size"], msa2[0].annotations["size"]))

    for seq1, seq2 in itertools.izip(msa1, msa2):
        # Check all the sequences

        if seq1.annotations["strand"] != seq2.annotations["strand"]:
            # These alignments are to opposite reference strands and cannot be
            # adjoined.
            raise RuntimeError("Can't adjoin alignments on opposite strands")

    if msa2[0].annotations["start"] < msa1[0].annotations["start"]:
        # Whatever strand we're on for the first sequence, alignment 2 needs to
        # happen first.
        msa2, msa1 = msa1, msa2

    # We're going to get the sequence needed to go from the end of MSA1 to the
    # start of MSA2.
    intervening_sequences = []

    for seq1, seq2 in itertools.izip(msa1, msa2):
        # For each pair of sequence pieces, we need the sequence from #1 to #2,
        # on the appropriate strand.

        # Where does the intervening sequence start along the strand in
        # question? Remember MAF coordinates are 0-based.
        intervening_start = seq1.annotations["start"] + seq1.annotations["size"]

        # And where does it end? (1 past the end)
        intervening_end = seq2.annotations["start"]

        if intervening_end < intervening_start:
            # We're always going up in strand-local coordinates.
            raise RuntimeError("Sequence is trying to go backwards!")

        if seq1.annotations["strand"] == -1:
            # Convert to the correct strand.

            intervening_start = seq1.annotations["srcSize"] - intervening_start
            intervening_end = seq1.annotations["srcSize"] - intervening_end

            intervening_start, intervening_end = (intervening_end,
                                                  intervening_start)

        # Go get and clip out the intervening sequence.
        intervening_sequence = sequence_source(
            seq1.id)[intervening_start:intervening_end]

        if seq1.annotations["strand"] == -1:
            # Make sure it is on the correct strand
            intervening_sequence = intervening_sequence.reverse_complement()

        # Put the clipped-out, correctly-oriented unaligned sequence in the
        # list.
        intervening_sequences.append(intervening_sequence)

    # We'll tack these additional alignments onto msa1
    to_return = msa1

    for i in xrange(len(intervening_sequences)):
        # Now for each intervening sequence, I need an MSA consisting of that
        # sequence in its correct row and gaps in all the other rows.

        # Make all the rows for this bit of unaligned sequence, as SeqRecords.
        alignment_rows = [
            SeqRecord(Seq("-" * len(intervening_sequences[i])))
            if j != i else intervening_sequences[i]
            for j in xrange(len(intervening_sequences))
        ]

        # Make them into an alignment and stick it on
        to_return = to_return + Align.MultipleSeqAlignment(alignment_rows)

    # Now stick on msa2
    to_return = to_return + msa2

    for i in xrange(len(to_return)):
        # Do the annotations for each record in the alignment

        # Set the ID
        to_return[i].id = msa1[i].id

        # Start with the annotations from msa1, so start is correct
        to_return[i].annotations.update(msa1[i].annotations)

        # Compute the actual sequence length that ought to be used here.
        to_return[i].annotations["size"] = (msa2[i].annotations["start"] +
                                            msa2[i].annotations["size"] -
                                            msa1[i].annotations["start"])

        # Make sure the size is correct.
        assert (len(str(to_return[i].seq).replace(
            "-", "")) == to_return[i].annotations["size"])

    # Give back the final adjoined alignment
    return to_return
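
A usage sketch for smart_adjoin, assuming the definition above (Python 2, since it uses itertools.izip and xrange); the source sequence and MAF-style coordinates are hypothetical:

from Bio import Align
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

source = SeqRecord(Seq("ACGTACGTACGT"), id="chr1")

left = SeqRecord(Seq("ACGT"), id="chr1")
left.annotations = {"strand": 1, "start": 0, "size": 4, "srcSize": 12}
right = SeqRecord(Seq("ACGT"), id="chr1")
right.annotations = {"strand": 1, "start": 8, "size": 4, "srcSize": 12}

joined = smart_adjoin(Align.MultipleSeqAlignment([left]),
                      Align.MultipleSeqAlignment([right]),
                      lambda seq_id: source)
print(joined)  # 12 columns: ACGT + the unaligned middle ACGT + ACGT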