예제 #1
0
    def _get_dna_records(self, records, name):
        """Look up the cDNA sequence for each input protein record.

        The backend is chosen from ``self._db_source``: an HDF5 OMA
        database (``'h5'``), a local FASTA index (``'fa'``), or the OMA
        REST API (``'REST_api'``).

        :param records: iterable of SeqRecord-like objects; ``record.id``
            is used as the lookup key.
        :param name: suffix appended (after an underscore) to each output
            record id.
        :return: list of SeqRecord objects with the cDNA sequences, in the
            same order as ``records``.
        """
        cdna_records = []
        for record in records:
            rec = None
            if 'h5' in self._db_source:
                entry_nr = self._db_id_map.omaid_to_entry_nr(record.id)
                raw_cdna = self._db.get_cdna(entry_nr).decode("utf-8")
                rec = SeqRecord.SeqRecord(Seq.Seq(raw_cdna),
                                          id=record.id + "_" + name,
                                          description="")
            elif 'fa' in self._db_source:
                rec = self._db[record.id]
            elif 'REST_api' in self._db_source:
                response = requests.get(API_URL + "/protein/" + record.id + "/")
                payload = response.json()
                rec = SeqRecord.SeqRecord(Seq.Seq(payload['cdna']),
                                          id=record.id + "_" + name,
                                          description="")

            # Ambiguity characters ('X') are cleaned out before use.
            if 'X' in str(rec.seq):
                rec.seq = self._clean_DNA_seq(rec)

            cdna_records.append(rec)

        return cdna_records
예제 #2
0
def extract_seq(fasta_file,
                fasta_header_name,
                start,
                stop,
                id="16S",
                header="",
                rev=False):
    """Extract the slice [start:stop) of one named entry of a FASTA file.

    :param fasta_file: path (or handle) of the FASTA file to search.
    :param fasta_header_name: name of the entry to slice.
    :param start: 0-based start of the slice (converted with ``int``).
    :param stop: end of the slice, exclusive (converted with ``int``).
    :param id: id given to the returned record.
    :param header: description given to the returned record.
    :param rev: if True, return the reverse complement of the slice.
    :return: a ``SeqRecord`` with the requested slice, or None (implicitly)
        when no entry named ``fasta_header_name`` exists.
    """
    # BUG FIX: the unused ``Bio.Alphabet.IUPAC`` import fails on
    # Biopython >= 1.78 (Bio.Alphabet was removed); the unused ``Seq``
    # import and the dead ``seq.id`` / ``seq.name`` attribute writes on the
    # Seq object are dropped as well.
    from Bio import SeqRecord

    for entry in SeqIO.parse(fasta_file, "fasta"):
        if entry.name != fasta_header_name:
            continue
        seq = entry[int(start):int(stop)].seq
        if rev:
            # NOTE(review): historically the reverse-complement record kept
            # name="extract" while the forward record was renamed "baba";
            # that asymmetry is preserved here.
            return SeqRecord.SeqRecord(seq.reverse_complement(),
                                       id=id,
                                       name="extract",
                                       description=header)
        return SeqRecord.SeqRecord(seq,
                                   id=id,
                                   name="baba",
                                   description=header)
예제 #3
0
def aln_parse(aln_filename, fastq_filename):
    """Convert an ART alignment output to a BioPython one.

    Parses the ``.aln`` file produced by the ART read simulator together
    with the matching FASTQ file.  Reverse-strand records are skipped.

    :param aln_filename: path to the ART ``.aln`` alignment file.
    :param fastq_filename: path to the FASTQ file of simulated reads.
    :return: tuple ``(reads, aligns, start_positions)``: the FASTQ
        SeqRecords, ``[clean_record, dirty_record]`` pairs, and the
        reference start position of each kept read.
    """
    reads = []
    aligns = []
    start_positions = []
    fastqs = SeqIO.parse(open(fastq_filename), "fastq")

    with open(aln_filename, 'r') as fp:
        # Skip the first two lines - no info we want there
        fp.readline()
        fp.readline()
        line = fp.readline()
        # Deal with headers and get sequence length
        header_array = line.split('\t')
        if header_array[0].rstrip() != '@SQ':
            # BUG FIX: Python 2 print statements / .next() converted to
            # Python 3 syntax throughout.
            print("ART file header corrupted: ", header_array[0])
            sys.exit(2)
        sequence_length = int(header_array[2])
        while line:
            # Find the next record
            while not line.startswith(">"):
                line = fp.readline().rstrip()

            name_array = line.split('\t')
            strand = name_array[3].rstrip()
            name = name_array[1]
            start_pos = int(name_array[2])

            # Each record owns one FASTQ entry and two alignment lines.
            # BUG FIX: these must be consumed even when the record is
            # skipped ('-' strand); the original `continue`d without
            # advancing, re-reading the same header forever.
            read = next(fastqs)
            clean_line = fp.readline().rstrip()
            dirty_line = fp.readline().rstrip()

            line = fp.readline()

            if strand == '-':
                continue

            clean_seq = Seq.Seq(clean_line, generic_dna)
            dirty_seq = Seq.Seq(dirty_line, generic_dna)

            align = [SeqRecord.SeqRecord(clean_seq, id=read.name,
                                         name=read.name, description=""),
                     SeqRecord.SeqRecord(dirty_seq, id=read.name,
                                         name=read.name, description="")]

            reads.append(read)
            aligns.append(align)
            start_positions.append(start_pos)

    return reads, aligns, start_positions
예제 #4
0
def align(sequences):
    """Translate, then align, then back-translate the sequences.

    :param sequences: open handle of a nucleotide FASTA file; the handle
        is closed before this function returns.
    :return: open temporary file handle, positioned at the start, holding
        the codon-aware nucleotide alignment in FASTA format.

    NOTE(review): requires the ``clustalo`` executable on the PATH
    (invoked through ``ClustalOmegaCommandline``).
    """
    # First, start a new tempfile
    translated = tempfile.NamedTemporaryFile(prefix='Translated_',
                                             suffix='.fasta',
                                             mode='w+t')
    # And write the translated sequences into it
    nuc_seqs = {}
    for seq in SeqIO.parse(sequences, 'fasta'):
        s = SeqRecord.SeqRecord(seq.seq.translate(), id=seq.id, description='')
        SeqIO.write(s, translated, 'fasta')
        # Keep the original nucleotides for the back-translation step.
        nuc_seqs[s.id] = str(seq.seq)
    # Seek to the beginning to read them with Clustal Omega
    translated.seek(0)
    # Open a temp file for clustal output
    aligned = tempfile.NamedTemporaryFile(prefix='Aligned_',
                                          suffix='.fasta',
                                          mode='w+t')
    # And align them
    co_cmd = ClustalOmegaCommandline(infile=translated.name,
                                     outfile=aligned.name,
                                     seqtype='protein',
                                     force=True,
                                     iterations=10,
                                     distmat_full=True,
                                     distmat_full_iter=True)
    co_cmd()
    # Close the translated unaligned handle. We are done with it
    translated.close()
    # Then, we want to back-translate the sequences
    backtrans = tempfile.NamedTemporaryFile(prefix='Backtranslated_',
                                            suffix='.fasta',
                                            mode='w+t')
    aligned.seek(0)
    aln = SeqIO.parse(aligned.name, 'fasta')
    for prot_seq in aln:
        bt = ''
        codon = 0
        nuc = nuc_seqs[prot_seq.id]
        for aa in prot_seq:
            # A gap in the protein alignment becomes a gapped codon;
            # otherwise emit the next codon from the original nucleotides.
            if aa == '-':
                bt += '---'
            else:
                bt += nuc[codon * 3:(codon * 3) + 3]
                codon += 1
        # Make it a SeqRecord to write into the output file
        b = SeqRecord.SeqRecord(Seq.Seq(bt), id=prot_seq.id, description='')
        SeqIO.write(b, backtrans, 'fasta')
    # Again, seek to the beginning so we can read it later
    backtrans.seek(0)
    # Close the aligned and unaligned handles; we are done with them
    aligned.close()
    sequences.close()
    return backtrans
예제 #5
0
    def cut_sequence(self, ref_seq):
        """Split this sequence in two at the largest gap between its BLAST
        hits against ``ref_seq``.

        :param ref_seq: reference sequence whose BLAST hits (from
            ``self.blasts_dict``) are inspected.
        :return: None when no sufficiently large gap exists;
            ``[self, None]`` when a gap exists but no cut point inside the
            query could be derived; otherwise a list with the two new
            Sequence halves.
        """
        logger.info('cutting sequence {}'.format(self.id))
        # Keep only confident hits, ordered along the reference.
        raw_blast_list = self.blasts_dict[ref_seq.id]
        blast_list = []
        for blast in raw_blast_list:
            if blast.similarity > similarity_threshold:
                blast_list.append(blast)
        blast_list.sort(key=lambda match: match.ref_start)
        # Scan consecutive hit pairs for the widest reference gap.
        # NOTE(review): the acceptance bar is ``max_diff + 1% of the
        # reference length`` and max_diff grows with each new winner, so the
        # bar escalates during the scan -- confirm this is intended rather
        # than a plain ``diff > max_diff`` comparison.
        max_diff = ref_seq.length * 0.01
        max_diff_cand = None
        for i in range(0, len(blast_list) - 1):
            actual_blast = blast_list[i]
            next_blast = blast_list[i + 1]
            start_next_blast = next_blast.ref_start
            end_actual_blast = actual_blast.ref_end
            diff = abs(start_next_blast - end_actual_blast)
            if diff > max_diff + ref_seq.length * 0.01:
                max_diff = diff
                max_diff_cand = i
        if max_diff_cand is None:
            return None
        #A gap has been found
        # Hits left of the gap go to the first half; the cut point in the
        # query is the right-most query_end among them (bounded by length).
        new_dict = {}
        new_dict[ref_seq.id] = []
        cut_dtce = 0
        for j in range(0, max_diff_cand + 1):
            if cut_dtce < blast_list[j].query_end < self.length:
                cut_dtce = blast_list[j].query_end
            new_dict[ref_seq.id].append(blast_list[j])
        if cut_dtce == 0:
            return [self, None]
        new_seq_1 = Sequence(SeqRecord.SeqRecord(self.seq[:cut_dtce],
                                                 id=self.id + '_1'),
                             ref_seq=self.ref_seq)
        new_seq_2 = Sequence(SeqRecord.SeqRecord(self.seq[cut_dtce:],
                                                 id=self.id + '_2'),
                             ref_seq=self.ref_seq)
        new_seq_1.ref_start = blast_list[0].ref_start
        new_seq_1.ref_end = blast_list[max_diff_cand].ref_end
        new_seq_1.blasts_dict = new_dict

        # Hits right of the gap belong to the second half.
        other_dict = {}
        other_dict[ref_seq.id] = []
        new_seq_2.ref_start = blast_list[max_diff_cand + 1].ref_start
        for j in range(max_diff_cand + 1, len(blast_list)):
            other_dict[ref_seq.id].append(blast_list[j])
        new_seq_2.ref_end = blast_list[-1].ref_end
        new_seq_2.blasts_dict = other_dict
        logger.info(
            '2 new sequences: {} ({} - {}) w/ length {} and {} ({} - {}) w/ length {}'
            .format(new_seq_1.id, new_seq_1.ref_start, new_seq_1.ref_end,
                    new_seq_1.length, new_seq_2.id, new_seq_2.ref_start,
                    new_seq_2.ref_end, new_seq_2.length))
        return [new_seq_1, new_seq_2]
예제 #6
0
def get_blast_matched_ids(query,
                          blast_db_name,
                          word_size='5',
                          max_seq='6000',
                          evalue=10.0,
                          search_id='',
                          threads=None,
                          identity_cutoff='0'):
    """Run a blastn search for ``query`` and return the matched ids.

    :param query: nucleotide query sequence (plain string).
    :param blast_db_name: name of the BLAST database to search.
    :param word_size: blastn word size (string, passed through).
    :param max_seq: maximum number of target sequences (string).
    :param evalue: e-value threshold.
    :param search_id: prefix for the temporary query/result files so
        concurrent searches do not collide.
    :param threads: number of threads; defaults to ``settings.CORES``.
    :param identity_cutoff: minimum percent identity (string).
    :return: the matched ids as returned by ``run_blast_search``.
    """
    query_file = settings.BLAST_TMP_DIR + search_id + '_query.fasta'
    result_file = settings.BLAST_TMP_DIR + search_id + '_blast_result.txt'
    with open(query_file, "w") as output_handle:
        my_rec = SeqRecord.SeqRecord(seq=Seq.Seq(query),
                                     id='query',
                                     description='')
        SeqIO.write([my_rec], output_handle, 'fasta')

    # blastn-short is tuned for primer-sized queries.
    task = 'blastn-short' if len(query) <= 15 else 'blastn'

    if not threads:
        threads = settings.CORES

    try:
        matched_ids = run_blast_search(query_file, blast_db_name, result_file,
                                       threads, word_size, max_seq, evalue,
                                       task, identity_cutoff)
    finally:
        # Clean up temporary files even if the search raises.  Trying the
        # remove and ignoring a missing file replaces the original
        # check-then-remove race (and the conditional-expression-as-
        # statement anti-idiom).
        for tmp_file in (result_file, query_file):
            try:
                os.remove(tmp_file)
            except OSError:
                pass

    return matched_ids
예제 #7
0
def ammend_fasta():
    """Append spliced-gene products to the unspliced FASTA file.

    Copies every record of the module-level ``fname`` to
    ``fname_unspliced`` and, for the spliced genes (M2, NS2), builds a
    joined sequence by overlap-aligning the two splice products and filling
    gaps in the first with the second.
    """
    def _strip_prefix(text, prefix):
        # BUG FIX: str.lstrip strips *characters*, not a prefix -- e.g.
        # "M1".lstrip("M1") also eats a leading "1M1M"; do real prefix
        # removal instead.
        return text[len(prefix):] if text.startswith(prefix) else text

    with open(fname_unspliced, 'w') as ofile:
        strain_by_protein = defaultdict(dict)
        for seq in SeqIO.parse(fname, 'fasta'):
            SeqIO.write(seq, ofile, 'fasta')
            prot = seq.name.split('_')[0]
            seq_name = seq.description.split('|')[1]
            strain_by_protein[prot][seq_name] = seq

        joined_seqs = defaultdict(dict)
        splice_pairs = [('M', 'M1', 'M2'), ('NS', 'NS1', 'NS2')]
        #splice_pairs = [('M', 'M1', 'BM2')]
        from seqanpy import align_overlap
        for c, a, b in splice_pairs:
            for strain in strain_by_protein[a]:
                seq1 = strain_by_protein[a][strain]
                new_id = c + _strip_prefix(seq1.id, a)
                new_description = (c + _strip_prefix(seq1.description, a)).replace(a, c)
                new_name = (c + _strip_prefix(seq1.name, a)).replace(a, c)
                try:
                    # KeyError here means the strain has no partner product.
                    # BUG FIX: the original bare ``except:`` swallowed every
                    # error (including bugs in the array code below).
                    partner = strain_by_protein[b][strain]
                except KeyError:
                    print(seq1.name, "doesn't have a partner")
                    continue
                score, ali1, ali2 = align_overlap(seq1.seq, partner.seq,
                                                  score_gapopen=-20,
                                                  score_gapext=0)
                # BUG FIX: np.fromstring is deprecated for text and
                # "".join over an 'S1' byte array fails on Python 3; build
                # the char arrays explicitly and join as bytes.
                ali_array = np.array([list(x) for x in [ali1, ali2]],
                                     dtype='S1')
                tmpseq = np.copy(ali_array[0])
                gaps = ali_array[0] == b'-'
                tmpseq[gaps] = ali_array[1][gaps]
                joined = b"".join(tmpseq).decode()
                joined_seqs[c][strain] = SeqRecord.SeqRecord(
                    seq=Seq.Seq(joined), id=new_id,
                    description=new_description, name=new_name)
                SeqIO.write(joined_seqs[c][strain], ofile, 'fasta')
예제 #8
0
def cctmr_fasta2ref_fasta(fsta_fh, cctmr):
    """
    Converts concatamer sequence to monomer fasta.

    Slices the first monomer out of the first record of the input file and
    writes it to ``<input-stem>_cctmr1.fasta``.

    :param fsta_fh: path to fasta file
    :param cctmr: 1-based (start, end) codon positions of the first monomer
        within the concatamer
    :return: path to the fasta file holding the first monomer
    """
    # BUG FIX: dropped the unused ``cds2aas`` import and the
    # ``Bio.Alphabet.IUPAC`` import/alphabet argument -- Bio.Alphabet was
    # removed in Biopython 1.78, and Seq() without an alphabet works on
    # both old and new versions.
    from os.path import splitext
    from Bio import SeqIO, Seq, SeqRecord

    fsta_cctmr1_fh = "%s_cctmr1.fasta" % (splitext(fsta_fh)[0])
    with open(fsta_fh, 'r') as fsta_data:
        for fsta_record in SeqIO.parse(fsta_data, "fasta"):
            fsta_id = fsta_record.id
            fsta_seq = str(fsta_record.seq)
            # Codon coordinates -> nucleotide slice of the first monomer.
            fsta_cctmr1_seq = fsta_seq[(cctmr[0] - 1) * 3:(cctmr[1] - 1) * 3]
            break
    out_record = SeqRecord.SeqRecord(Seq.Seq(fsta_cctmr1_seq),
                                     id=fsta_id,
                                     description='')
    # Context manager guarantees the output handle is closed on error.
    with open(fsta_cctmr1_fh, "w") as fsta_cctmr1_f:
        SeqIO.write(out_record, fsta_cctmr1_f, "fasta")
    return fsta_cctmr1_fh
예제 #9
0
    def setUp(self):
        """Prepare a BED12 row, the matching transcript sequence and a
        minimal FASTA index (id -> SeqRecord) for the tests below."""

        # BED12 line (tab-separated fields) describing an ORF on the
        # transcript; ``.split()`` + ``"\t".join`` normalises whitespace.
        self.bed_row = "\t".join("TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1	0	3539	TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1|m.13	0	+	2	2969	0	1	3539	0".split())
        # Raw transcript sequence, wrapped FASTA-style; newlines are
        # stripped before building the SeqRecord below.
        self.sequence = """ATCGAGCAGATTGGCCGCAACCTACAACTCCCACGGCCCAAGCACTCTCTCTCTCTCTTTCCCTCTCACC
CTCGCCTCCGCTCCCCCATTTCCGAAGTACTCGCGAGCCAGCGGCCTCCAGCTCACCACCGTTTCCGCCG
CGCGCAGATCCGCCCAATCCGTGCAGCCTCAGGCCACCGCTCTGGTTCCGTGACATGTGGCGAGGTGGTG
GCGCAGACGCTGATGCAGGAGGCGCTCGCGAGGCTGAGGAGCACAACAATGTCGAGGAAGAGGAAGGGAG
TGAGGATGGAGATCGGGACCTGCAGAATAAACGTCCTAAAGTGGGTGCTTTTGGCGAAGAAAGCTCTGGT
GTTAATGCATCCTTCTTTGGATATGAAGCACCACATTTGCATGCTTTTGCTGAACATGACCATTTGAAGC
TGTCACATGGTCCAGAAAATGAATTGGATTTTGGTTTGTCGCTTATCTCAAATGATGGTGGGAATGATAT
TCCAAGGGAGACCAACAGTCATGGTGTCTGTGATGTAGAAAGATCAGGTGGAACAAATGCAGAAGATCTT
GAAATAAGAATGGACCTATCTGATGATCTCTTGCACCTGATATTCTCCTTCTTATGCCAGAAGGATTTAT
GTAGAGCAGGGGCTGCCTGCAAACAGTGGCAGTCTGCTAGTATGCATGAGGATTTCTGGAAATATTTGAA
GTTTGAGAACACCAGAATATCTCTGCAGAACTTTGTTAATATTTGCCACCGTTATCAGAATGTGACAAAT
CTCAATTTGTCTGGTGTCTTAAGTGCAGAAAGCCTAGTGATTGAAGCAATAACATTCTTAAGGCATCTTA
AGACCTTGATAATGGGCAAGGGACAACTGGGAGAAACATTTTTTCAGGCTTTGGCTGAATGCCCATTGTT
AAATACTTTAACAGTCAGTGATGCATCCCTTGGTAGTGGCATTCAAGAGGTAACTGTTAATCATGATGGA
TTGCATGAACTTCAAATTGTGAAGTGTCGTGCACTCAGAGTATCTATCAGATGCCACCAACTTCGAATAC
TGTCTCTGAGGAGAACTGGCATGGCTCATGTATCACTCAATTGTCCTCAGTTGCTTGAATTGGATTTTCA
GTCCTGCCATAAGCTTTCTGACACTGCAATTCGTCAAGCAGCGACAGCCTGTCCACTGTTAGCGTCACTA
GATATGTCATCCTGCTCGTGTGTTACTGATGAGACATTGCGTGAGATAGCTAATGCATGTCAAAATCTTT
CTGTTCTTGATGCATCTAACTGCCCCAACATTTCTTTCGAGTCGGTAAAGCTTCCAATGTTGGTAGACTT
GAGACTATCAAGTTGTGAGGGAATCACATCTGCTTCAATGGGTGCAGTATGTTTTAGTCGTATACTTGAG
GCGTTGCAACTTGATAATTGTAGCCTGTTGACATCTGTGTCTTTGGATCTGCCACATCTCAAGAATATTA
GTCTTGTACACCTCCGCAAGTTTGCTGATTTAAATCTGCGAAGCCCTGTGCTTTCTTACATAAAAGTTTC
CAGATGCTCAGCACTTCGTTGTGTTACCATAACATCAAATGCTCTTAAGAAACTGGTGCTTCAAAAACAA
GAGAGCCTATGTAATTTATCATTGCAATGCCACAATTTAATTGATGTTGATCTTAGTGATTGCGAGTCAT
TGACAAATGAGATCTGCAAAGTTCTCAGTGACGGAGGGGGTTGCCCCATGCTCAGGTCATTAATTCTTGA
TAATTGTGAGAGTTTGAGTGTCGTGGAACTGAATAATAGTTCTTTGGTTAATCTCTCACTTGCTGGTTGC
CGTTCCATGACATTCCTGAAACTTGCATGCCCAAAGCTTCAAGTGGTGATTCTTGATGGTTGTGATCATC
TTGAAAGAGCATCATTTTGCCCGGTTGGTCTTGAATCCCTAAACCTTGGAATTTGTCCAAAGTTGAGTGT
TCTACGCATAGAGGCCCCAAATATGTCTATATTGGAGCTGAAGGGCTGTGGTGTCCTTTCTGAGGCTTCA
ATTAATTGTCCTTGCTTGATATCTTTAGATGCCTCTTTCTGCAGACAGTTTATGGATGATTCGCTGTCCC
AAACAGCAGAAGCATGCCCTCTTATTGAACATCTTATATTGTCTTCATGTTTATCCATTGACGTCCGTGG
ATTGTCTTCTCTGCATTGCCTTCAGAAGCTGGCCTTGCTTGACCTATCATATACATTTTTGATGAACTTG
AAGCCGGTTTTTGACAGTTGTCTGCAGTTGAAGGTCTTGAAACTTTCAGCTTGCAAGTATCTCAGTGATT
CATCTTTGGAACCACTCTACAGAGAGGGTGCTCTACCGATGCTCGTTGAGCTAGATCTGTCCTACTCGTC
CATTGGGCAGACTGCAATAGAAGAGCTTCTCGCGTGCTGTACAAATTTGGTTAATGTGAACCTAAACGGA
TGTACGAACTTGCATGAATTGGTATGTGGATCAGACTATTGCCGGTCCGGTGACATGCCAATTGATGCTT
TCCCCCCTGATTCTGCACCAGACAAGACCAAAGAGATCAGGGAGAGTTCGGATTGTCAGCTTGAAGTTCT
CAGTTGTACTGGCTGTCCAAATATTAAGAAAGTTGTTATTCCTTCAACGGCCAACTATCTGAATTTGTCT
AAGATCAACCTTAATTTGTCTGCAAACTTGAAGGAAGTAGATTTGAAGTGCTCCAATCTTTACAATTTAA
ATTTGAGCAATTGTAACTCACTGGAGATTCTGAAGCTTGATTGCCCAAGATTGGCTAACCTCCAACTTTT
GGCATGCACAATGTTGCAAGAGGATGAACTGAAATCTGCACTATCCTTTTGCGGTGCATTGGAGATCCTC
AATGTGCACTCTTGTCCACAAATAAACACGCTGGATTTTGGCAGGCTACAGGCTGTTTGCCCAACTCTTA
AGCGCATCCAGAGCAGCCCCATCGCATAGTATGAAGGATTCTGGTCTTCTTAATGGACTCGAGTAAATAG
TCCAGATTTGAAACAGAAAAGGCCATGTCGTACTCTTGTACATATGCAGCACCGCCAATATATTGTATGG
CTGCATGTATTAGGGAGCCAGGGCTGACATGAAACCTGTTCTTCCAATCGATTTCTTGTGTTGAATCTAG
TTGAAACATGGAAACCGCACTTCCTAGTTTGTATTTGCTTTTGAGGTGCAGTGATGGAGTAAGCAGATCT
GTATTTATATGAATGAATAACCATCTTGTTTGGATCGTCGATGTTGTATGCTTCATTGATGACATGGGGT
GCTAAGTTTGACTGAAATTACACCAGGTTCTATGGTTCTCTCATAAGGTGCAGTGATTCTGCGGTCTTTA
TTAATCTGTCTCAACTGTGACGATGCAACTGAGACGTTTCCATCTGCCGGCTGCTGATGCTGTGAACTCT
TGGTAAAAAACCTGGTGTACTTGATCCAAGAGCATTCGTTGGGTCACTTGTATCCTTGAAAATTGAGTAA
CTAATAAATGCTGTTGTGTAAAAAAAAGGGGCTTTCTTT"""

        # Continuous (unwrapped) sequence as a SeqRecord.
        self.seq = SeqRecord.SeqRecord(Seq.Seq(self.sequence.replace("\n", "")),
                                       id="TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1")

        # Minimal FASTA index: record id -> record.
        self.index = dict()
        self.index["TRIAE_CS42_1AL_TGACv1_000002_AA0000030.1"] = self.seq
예제 #10
0
def convert_from_subtype_to_hxb2(working_dir, position, orientation, subtype):
    """
    Convert a position number in HXB2 to the equivalent in another subtype.

    NOTE(review): despite the summary above, the loop below counts
    ``subtype_pos`` up to ``position`` and returns ``hxb2_pos`` -- i.e. it
    appears to map a *subtype* coordinate onto HXB2; confirm against
    callers.

    Args:
        working_dir: working folder in which to place temporary files
        position: coordinate position to convert
        orientation: "reverse" to align the reverse complements of both
            sequences instead of the forward strands
        subtype: subtype position to convert to

    Returns:
        The mapped position, or None (implicitly) when ``position`` lies
        beyond the aligned region.
    """

    sequences = [subtype_sequence(subtype), HXB2()]
    if orientation == "reverse":
        sequences = [SeqRecord.SeqRecord(Seq.reverse_complement(s.seq),
                                         id = s.id, name = s.name) for s in sequences]

    alignment = wrappers.mafft(working_dir, sequences)

    # Walk the alignment columns, advancing each counter only on
    # non-gap characters, until the requested position is reached.
    hxb2_pos = 0
    subtype_pos = 0
    for i in range(len(alignment[0])):
        if subtype_pos == position:
            return hxb2_pos
        if alignment[0][i] != "-":
            subtype_pos += 1
        if alignment[1][i] != "-":
            hxb2_pos += 1
    def test_file_to_seqbias(self):
        """Round-trip check: records fed to the Director keep their id and
        sequence after the bias analysis has run."""
        seq_list = [
            [
                "test1",
                "IKAAESFLPSPVLRTDVMFLVPALKYNPLHRLLIQILGGHETMIQIGHAETATVKFEERLVERIFDKRAGTSSLILIQIDYDEIQIWPGYSILRLGMPEKDEIQIAIITEMKRGAPHIQIQILDFGPAISFKESWLDCVMGNCYNDIASEIKVRGSDLNKVGVRARKECGVATSPINAFINRLLSATYSVGVNFLAVIQISTGIDKVHTNYDKA"
            ],
            [
                "test2",
                "TTNIISELRCTQTCGNAMDNWMGEVLDGTPAFHFGVHCGDTAGPASKRFLLVCLEFSLRGYDLLVRLLLIKDEDANDVHCNQKCSQCCQKCMAHLALGPVTCSSSFNVHYSPGIGALWACIQTCEIDYCIQPCKACVQSCEERSLKVIKADGITAKSFAPMPNGAVDPSTVEYMVKTLIVCLQTCYDENRTVRRFPEKAL"
            ],
            [
                "test3",
                "YPSSALQGGSMSRFLSPTMLRVRASLGFLGINLLPWTLFVIAALPSKSDAQLSSTQPLSAMGMEFIRANTESEINFVDKIHYAYHNLVVDPRKVDSEIAKERCKLLKSIVQVGSVTFATVPGDSYIGISSRSLMFVSEKNTGRELGNKCSAEQDDSSDQKNSGTAECGKLYSYEQWESTREGVDIIRKKTAVTHSNRQIPSVADHPLFLADAHEG"
            ]
        ]

        path_in = "/Users/coltongarelli/SequenceAnalyzer/PAM/Tests/Resources/SequenceBiasIOTestFile"

        director = Director()
        director.file_in_path = path_in
        director.start_up()
        director.master_list = [
            SeqRecord.SeqRecord(id=name, seq=sequence)
            for name, sequence in seq_list
        ]
        director.run_bias_analysis()
        # Each record must come back unchanged, in order.
        for record, (name, sequence) in zip(director.master_list, seq_list):
            self.assertEqual(record.seq, sequence)
            self.assertEqual(record.id, name)
    def fetch(self,
              accessions: Iterable[str]) -> Iterator[SeqRecord.SeqRecord]:
        """Fetch genbank records.

        Args:
            accessions (Iterable[str]): An iterable object that have accession numbers.

        Yields:
            Iterator[SeqRecord.SeqRecord]: An SeqRecord iterator.
        """
        for use_accs in utils.split_per_n(accessions, self.n_once):
            use_accs = tuple(use_accs)
            queue = deque()
            queue.append(use_accs)
            # If the download fails, halve the query and retry.
            while queue:
                query = queue.popleft()
                try:
                    for r in self._efetch(query):
                        yield r
                except Exception:
                    # BUG FIX: ``if n := len(query) > 1`` bound the
                    # *boolean* to n (walrus binds after the comparison),
                    # so query[:n // 2] was always empty and the queue
                    # never shrank.  Parenthesize so n is the length.
                    if (n := len(query)) > 1:
                        queue.append(query[:n // 2])
                        queue.append(query[n // 2:])
                    else:
                        for acc in query:
                            print(f"Cannot fetch {acc}.")
                            yield SeqRecord.SeqRecord(seq="", name=acc)
예제 #13
0
    def get_aln(self, internal=False):
        """assemble a multiple sequence alignment from the evolved
        sequences. Optionally include internal sequences

        Parameters
        ----------
        internal : bool, optional
            include sequences of internal nodes in the alignment

        Returns
        -------
        Bio.Align.MultipleSeqAlignment
            multiple sequence alignment
        """
        from Bio import SeqRecord, Seq
        from Bio.Align import MultipleSeqAlignment

        tmp = []
        # BUG FIX: the original iterated tree.get_terminals(), so
        # ``internal=True`` could never add internal nodes; walk every
        # clade and filter instead.
        for n in self.tree.find_clades():
            if n.is_terminal() or internal:
                tmp.append(
                    SeqRecord.SeqRecord(
                        id=n.name,
                        name=n.name,
                        description='',
                        seq=Seq.Seq(''.join(
                            n.ancestral_sequence.astype('U')))))

        return MultipleSeqAlignment(tmp)
def main(input, output, target):  #get sequences from a file
    """Select the most 'central' sequences from a FASTA file.

    Sequences are ranked by their mean pairwise distance to all others
    (descending) and written to ``output`` once more than ``target``
    sequences have been collected.

    :param input: path of the input FASTA file.
    :param output: path of the output FASTA file.
    :param target: minimum number of sequences to select.
    :return: tuple ``(input, total_sequences, selected_sequences)``, or
        None (implicitly) when ``target`` is never exceeded.
    """
    records = {}
    seqs = []
    with open(input, 'r') as f:
        for record in SeqIO.parse(f, 'fasta'):
            records[record.seq] = record
            seqs.append(record.seq)
    # Duplicate sequences would collapse in the dict; refuse to continue.
    if len(records) != len(seqs):
        sys.exit('records different length from seqs! report')
    nseq = len(seqs)
    from collections import defaultdict
    transmissibility = defaultdict(list)
    DM = calc_distance_matrix(seqs)
    # Group sequences by their mean distance to all other sequences.
    for row_idx in range(len(DM)):
        seq = seqs[row_idx]
        row = DM[row_idx]
        central_temp = np.divide(sum(row), nseq - 1, dtype=float)
        transmissibility[central_temp].append(seq)
    outD = []
    for trans_val, subseqs in sorted(transmissibility.items(), reverse=True):
        for seq in subseqs:
            # BUG FIX: every record used id='>seq_1', which produced
            # duplicate ids and a double '>' in the FASTA headers
            # (SeqIO adds the '>' itself); number the records instead.
            outD.append(SeqRecord.SeqRecord(seq, id='seq_%d' % (len(outD) + 1)))
        print(len(outD), target)
        if len(outD) > target:
            with open(output, 'w') as f:
                SeqIO.write(outD, f, 'fasta')
            return (input, len(seqs), len(outD))
예제 #15
0
def bam_to_rec(in_file, make_unique_recs=False):
    """
    Generator to convert BAM files into Biopython SeqRecords.

    Reverse-strand reads are reverse-complemented.  With
    make_unique_recs=True every record gets a numeric suffix; otherwise
    each read ID is yielded at most once.
    """
    from Bio import SeqIO, Seq, SeqRecord
    seen_ids = {}   # read IDs already yielded (de-duplication mode)
    counter = 1     # suffix used to make record names unique
    for aln in pysam.Samfile(in_file, "rb"):
        sequence = Seq.Seq(aln.seq)
        if aln.is_reverse:
            sequence = sequence.reverse_complement()
        name = aln.qname
        if make_unique_recs:
            name = "%s_%d" % (name, counter)
        elif name in seen_ids:
            # Not asked for unique records: emit each read ID once only.
            continue
        else:
            seen_ids[name] = True
        counter += 1
        yield SeqRecord.SeqRecord(sequence, name, "", "")
예제 #16
0
def construct_supermatrix(coregenome, alifins, supermatrixfout):
    """Concatenate per-orthogroup alignments into one supermatrix record
    per genome and append them to ``supermatrixfout`` in FASTA format.

    Genomes absent from an orthogroup alignment (or present more than
    once) are padded with gaps of that alignment's length.
    """
    genomes = list(set(coregenome.genome))
    supermatrix = {
        genome: SeqRecord.SeqRecord(id = genome, seq = "", description = "")
        for genome in genomes
    }

    alifindict = {filename_from_path(alifin): alifin for alifin in alifins}

    n_fams_sc = 0

    for orthogroup, rows in coregenome.groupby("orthogroup"):
        alifin = alifindict[orthogroup]
        sequencedict = {}
        for record in SeqIO.parse(alifin, "fasta"):
            alilen = len(record.seq)
            sequencedict[record.id] = record.seq
        # Drop genomes with multiple genes in this family, then merge back
        # against the full genome list so absentees get gap padding below.
        rows = rows.drop_duplicates("genome", keep = False)
        rows = pd.merge(pd.DataFrame({"genome": genomes}), rows, how = "left")
        for ix, row in rows.iterrows():
            addition = sequencedict.get(row.gene, "-" * alilen)
            supermatrix[row.genome] = supermatrix[row.genome] + addition

    with open(supermatrixfout, "a") as supermatrixhout:
        for genome in supermatrix:
            SeqIO.write(supermatrix[genome], supermatrixhout, "fasta")
예제 #17
0
def assemble_contigs(contig_list, reference, join_threshold):
    """Join contigs that map close to each other on ``reference``.

    Starting from the contig with the left-most reference end, every
    following contig (by start coordinate, same reference) whose start lies
    within ``join_threshold`` of the running end is concatenated into one
    new Sequence.  A negative ``join_threshold`` joins unconditionally.

    :param contig_list: contigs (Sequence objects) to assemble.
    :param reference: id of the reference the contigs were blasted against.
    :param join_threshold: maximum allowed gap between contigs; negative
        means "always join".
    :return: list of assembled Sequence objects; contigs without reference
        coordinates are passed through unchanged.
    """
    sol = []
    raw_list = []
    for contig in contig_list:
        contig.set_borders(reference)
        if contig.ref_start is None:
            # No placement on the reference: keep the contig as-is.
            sol.append(contig)
        else:
            raw_list.append(contig)
    contigs_by_start = sorted(raw_list, key=lambda contig: contig.ref_start)
    contigs_by_end = sorted(raw_list, key=lambda contig: contig.ref_end)
    if join_threshold < 0:
        logger.info('\tnegative join_threshold. Must join all the contigs')
    while len(contigs_by_end) > 0:
        actual_contig = contigs_by_end[0]
        name = actual_contig.id
        logger.info('\tLooking for contigs near {}'.format(name))
        sequence = actual_contig.seq
        start = actual_contig.ref_start
        end = actual_contig.ref_end
        # Renamed from ``dict`` -- never shadow the builtin.
        blasts_dict = actual_contig.blasts_dict
        contigs_by_start.remove(actual_contig)
        used_contigs = []
        j = 1
        for contig in contigs_by_start:
            if actual_contig.ref_seq == contig.ref_seq and (
                    join_threshold < 0
                    or end + join_threshold > contig.ref_start):
                name += contig.id
                sequence += contig.seq
                start = min(start, contig.ref_start)
                end = max(end, contig.ref_end)
                blasts_dict[reference] += contig.blasts_dict[reference]
                used_contigs.append(contig)
                if join_threshold >= 0:
                    logger.info(
                        'contig {} is near enough: end + join_threshold > contig.ref_start --> {} + {} = {} > '
                        '{}'.format(contig.id, end, join_threshold,
                                    end + join_threshold, contig.ref_start))
                j += 1
            else:
                # contigs_by_start is sorted, so the first miss ends the run.
                break
        for elem in used_contigs:
            contigs_by_start.remove(elem)
            contigs_by_end.remove(elem)
        new_contig = Sequence(SeqRecord.SeqRecord(sequence, id=name),
                              ref_seq=actual_contig.ref_seq)
        new_contig.ref_start = start
        new_contig.ref_end = end
        new_contig.blasts_dict = blasts_dict
        sol.append(new_contig)
        contigs_by_end.remove(actual_contig)
        if j == 1:
            logger.info('No contigs near enough to join')
        else:
            logger.info(
                'Search finished. {} contigs have been joined\n'.format(j))
    logger.info(
        'Finish assembling contigs with reference {}\n'.format(reference))
    return sol
예제 #18
0
def swissrec2seqrec(record):
    """Convert a SwissProt record into a SeqRecord.

    :param record: a ``Bio.SwissProt``-style record exposing ``sequence``,
        ``description``, ``accessions`` and ``entry_name``.
    :return: the equivalent ``Bio.SeqRecord.SeqRecord`` (primary accession
        as id).
    """
    # BUG FIX: ``Seq.IUPAC.protein`` does not exist on Bio.Seq (IUPAC
    # lived in Bio.Alphabet, removed in Biopython 1.78); a plain Seq works
    # on all Biopython versions.
    seq = Seq.Seq(record.sequence)
    return SeqRecord.SeqRecord(seq,
                               description=record.description,
                               id=record.accessions[0],
                               name=record.entry_name)
def make_alignment_upper_case(orig_aln):
    '''
    Transform all characters to uppercase.

    :param orig_aln: Biopython alignment
    :type  orig_aln: :class:`Bio.Align.MultipleSeqAlignment`

    :returns: Uppercase Biopython alignment, for
       :class:`Bio.Align.MultipleSeqAlignment`.
    :rtype: list(:class:`Bio.SeqRecord.SeqRecord`)

    Example::

        >>> alignment = AlignIO.read(open('seq.txt'), 'fasta')
        >>> print(alignment)
        SingleLetterAlphabet() alignment with 3 rows and 178 columns
        iigp--gr-gfgkrrhpkkltplaykqfipnvaekt...sgg 3M1N:A|PDBID|CHAIN|SEQUENCE
        iigp--grpgfgkrrhpkkltplaykqfipnvaekt...sgg PRO1:A|NAME1|CHAIN|SEQUENCE
        iigpxxgrcgfgkrrhpkkltplaykqfipnvaekt...sgg PRO2:A|NAME2|CHAIN|SEQUENCE
        >>> new_alignment = Sequence.make_alignment_upper_case(alignment)
        >>> print(Align.MultipleSeqAlignment(new_alignment))
        Alphabet() alignment with 3 rows and 178 columns
        IIGP--GR-GFGKRRHPKKLTPLAYKQFIPNVAEKT...SGG 3M1N:A|PDBID|CHAIN|SEQUENCE
        IIGP--GRPGFGKRRHPKKLTPLAYKQFIPNVAEKT...SGG PRO1:A|NAME1|CHAIN|SEQUENCE
        IIGPXXGRCGFGKRRHPKKLTPLAYKQFIPNVAEKT...SGG PRO2:A|NAME2|CHAIN|SEQUENCE

    '''
    # BUG FIX: Seq.tostring() was removed from Biopython; str(seq) is the
    # supported spelling and works on every version.
    return [SeqRecord.SeqRecord(Seq.Seq(str(record.seq).upper()),
                                record.id, description=record.description)
            for record in orig_aln]
예제 #20
0
def translate_dealign(aln):
    """De-align the aligned sequences (remove gaps), and translate them to
    amino acid. Write the de-aligned amino acid sequences to a temporary file,
    to align later."""
    to_realign = []
    for record in SeqIO.parse(open(aln), 'fasta'):
        # Strip alignment gaps, translate, and truncate at the first stop.
        ungapped = ''.join(base for base in record.seq if base != '-')
        protein = str(Seq.Seq(ungapped).translate()).split('*')[0]
        to_realign.append(
            SeqRecord.SeqRecord(
                Seq.Seq(protein),
                id=record.id,
                name='',
                description=''
                )
            )
    # Hand the sequences back via a named temporary FASTA file.
    tmp_handle = tempfile.NamedTemporaryFile(
        mode='w+t',
        prefix='msaprobs_realign',
        suffix='.fasta'
        )
    SeqIO.write(to_realign, tmp_handle.name, 'fasta')
    return tmp_handle
예제 #21
0
    def test_internal(self):
        """Check ORF loading for an 'internal' ORF (phase 2, no start or
        stop codon) on a multi-exon negative-strand transcript."""

        # Transcript sequence (wrapped); newlines removed just below.
        sequence = """TCCTCACAGTTACTATAAGCTCGTCTATGGCCAGAGACGGTGGTGTTTCTTGTTTACGAA
GGTCGGAGATGATGAGCGTCGGTGGTATCGGAGGAATTGAATCTGCGCCGTTGGATTTAG
ATGAAGTTCATGTCTTAGCCGTTGATGACAGTCTCGTTGATCGTATTGTCATCGAGAGAT
TGCTTCGTATTACTTCCTGCAAAGTTACGGCGGTAGATAGTGGATGGCGTGCTCTGGAAT
TTCTAGGGTTAGATAATGAGAAAGCTTCTGCTGAATTCGATAGATTGAAAGTTGATTTGA
TCATCACTGATTACTGTATGCCTGGAATGACTGGTTATGAGCTTCTCAAGAAGATTAAGG
AATCGTCCAATTTCAGAGAAGTTCCGGTTGTAATCATGTCGTCGGAGAATGTATTGACCA
GAATCGACAGATGCCTTGAGGAAGGTGCTCAAGATTTCTTATTGAAACCGGTGAAACTCG
CCGACGTGAAACGTCTGAGAAGTCATTTAACTAAAGACGTTAAACTTTCCAACGGAAACA
AACGGAAGCTTCCGGAAGATTCTAGTTCCGTTAACTCTTCGCTTCCTCCACCGTCACCTC
CGTTGACTATCTCGCCTGA"""
        sequence = sub("\n", "", sequence)

        # Minimal FASTA index: record id -> record, as bed12.BED12 expects.
        record = SeqRecord.SeqRecord(Seq.Seq(sequence), id="class_Chr1.1006.0")
        index = {record.id: record}

        # BED12 line: thick region 2..617, phase 2 -> internal ORF.
        line = "\t".join(
            ['class_Chr1.1006.0',
             '0',
             '619',
             'ID=class_Chr1.1006.0|m.22308;class_Chr1.1006.0|g.22308;ORF_class_Chr1.1006.0|g.22308_class_Chr1.1006.0|m.22308_type:internal_len:206_(+)',
             '0',
             '+',
             '2',
             '617',
             '0',
             '1',
             '619',
             '0'])

        bed_line = bed12.BED12(line, transcriptomic=True, fasta_index=index)
        self.assertFalse(bed_line.invalid, bed_line.invalid_reason)
        # Translate the thick region (shifted by the phase) and trim any
        # trailing partial codon before checking.
        pep = sequence[bed_line.thick_start - 1 + 2:bed_line.thick_end]
        if len(pep) % 3 != 0:
            pep = pep[:-(len(pep) % 3)]
        pep = str(Seq.Seq(pep).translate())
        self.assertEqual(bed_line.phase, 2, (bed_line.thick_start, bed_line.thick_end, pep))
        self.assertFalse(bed_line.has_start_codon)
        self.assertFalse(bed_line.has_stop_codon)

        # GTF of the transcript (tab-separated) on the negative strand.
        lines = """Chr1	CLASS	transcript	3442811	3443785	1000	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0"; exon_number "1"; Abundance "22.601495"; canonical_proportion "1.0";
    Chr1	CLASS	exon	3442811	3442999	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
    Chr1	CLASS	exon	3443099	3443169	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
    Chr1	CLASS	exon	3443252	3443329	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
    Chr1	CLASS	exon	3443417	3443493	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";
    Chr1	CLASS	exon	3443582	3443785	.	-	.	gene_id "Chr1.1006.gene"; transcript_id "class_Chr1.1006.0";"""

        lines = [GTF.GtfLine(_) for _ in lines.split("\n") if _]

        transcript = Transcript(lines[0])
        transcript.add_exons(lines[1:])
        transcript.finalize()
        transcript.load_orfs([bed_line])
        self.assertTrue(transcript.is_coding)
        self.assertFalse(transcript.has_start_codon)
        self.assertFalse(transcript.has_stop_codon)
        # Negative strand: CDS end maps to transcript start and vice versa.
        self.assertEqual(transcript.selected_cds_end, transcript.start)
        self.assertEqual(transcript.selected_cds_start, transcript.end)
예제 #22
0
def replace_indels(sam_filename_in, templ_filename, sam_filename_out):
    '''Replace indels in aligned reads with the corresponding wildtype
    (template) bases, then re-map the cleaned reads against the template.

    :param sam_filename_in: path to the input SAM file
    :param templ_filename: path to the template (wildtype) sequence file
    :param sam_filename_out: desired output SAM filename (may be None)
    :return: the resolved output SAM filename
    '''
    sam_filename_out = io_utils.get_filename(sam_filename_out)
    templ_seq = get_seq(templ_filename)
    records = []

    all_reads = 0

    for read in Samfile(sam_filename_in, 'r'):
        # Perform mapping of nucl indices to remove spurious indels:
        all_reads += 1

        # aligned_pairs yields (read_pos, ref_pos). read_pos is None at a
        # deletion in the read: fill that position from the template.
        # NOTE: the test must be `is not None` -- read_pos 0 is a valid
        # index, and the previous truthiness check wrongly replaced the
        # first read base with the template base.
        seq = ''.join([
            read.seq[pair[0]] if pair[0] is not None else templ_seq[pair[1]]
            for pair in read.aligned_pairs if pair[1] is not None
        ])

        if seq:
            records.append(
                SeqRecord.SeqRecord(Seq.Seq(seq), read.qname, '', ''))

    reads_filename = io_utils.get_filename(None)

    with open(reads_filename, 'w') as fle:
        SeqIO.write(records, fle, 'fasta')

    # Re-align the cleaned reads against the template (presumably bwa mem;
    # see utils.mem -- TODO confirm).
    utils.mem(templ_filename,
              reads_filename,
              out_filename=sam_filename_out,
              gap_open=12)

    print('%s: %i/%i passed replace_indels filter' %
          (sam_filename_in, len(records), all_reads))

    return sam_filename_out
예제 #23
0
def backtranslate(amino_acid, nucleotide):
    """Use the nucleotide sequences to convert the aligned amino acids back to
    nucleotides.

    :param amino_acid: path to the aligned protein FASTA (named
        '*msaprobs.fasta'; the output name is derived from it)
    :param nucleotide: path to the corresponding (ungapped) nucleotide FASTA
    """
    # Store the non-gapped nucleotide sequences as a dictionary keyed by id
    nuc_seqs = []
    for s in SeqIO.parse(nucleotide, 'fasta'):
        snogap = ''.join([b for b in s.seq if b != '-'])
        nuc_seqs.append(
            SeqRecord.SeqRecord(
                seq=Seq.Seq(snogap),
                id=s.id,
                name='',
                description=''
                )
            )
    nuc_seqs = SeqIO.to_dict(nuc_seqs)
    # Then, back-translate the aligned sequences codon by codon
    bt_seq = []
    for prot in SeqIO.parse(amino_acid, 'fasta'):
        codon = 0
        bt = ''
        # Work on the plain nucleotide string: slicing the SeqRecord itself
        # yields SeqRecord objects, which cannot be concatenated onto a str
        # (the previous code raised TypeError here).
        nuc = str(nuc_seqs[prot.id].seq)
        for aa in prot:
            if aa == '-':
                bt += '---'  # a protein gap maps to a whole-codon gap
            else:
                bt += nuc[codon*3:(codon*3)+3]
                codon += 1
        # SeqIO.write expects SeqRecord objects, not bare strings.
        bt_seq.append(
            SeqRecord.SeqRecord(
                seq=Seq.Seq(bt),
                id=prot.id,
                name='',
                description=''
                )
            )
    # Write the backtranslated sequences to disk
    bt_name = amino_acid.replace('msaprobs.fasta', 'backtranslated.fasta')
    SeqIO.write(bt_seq, bt_name, 'fasta')
    return
예제 #24
0
 def _predict_best_protein_pyopa(self, record, og):
     """
     Given a list of sequences that are derived from mapped reads to multiple seq of a OG
     we find the best corresponding mapped seq by comparing it with a representative sequence of the original OG using
     pyopa local alignment and return the sequence with its highest score!

     :param record: SeqRecord whose DNA sequence is translated in three frames
     :param og: OG object; og.aa[0] serves as the reference protein
     :return: SeqRecord of the best-scoring translated frame
     :raises ValueError: if translation or alignment fails
     """
     ref_og_seq = og.aa[0]
     s1 = pyopa.Sequence(str(ref_og_seq.seq))
     best_score = 0
     try:
         # Translate in all three forward frames; stop codons become 'X'
         # so the alignment can continue across them.
         frames = [
             record.seq[i:].translate(table='Standard',
                                      stop_symbol='X',
                                      to_stop=False,
                                      cds=False) for i in range(3)
         ]
         best_seq_idx = 0
         for i, seq in enumerate(frames):
             s2 = pyopa.Sequence(str(seq))
             # Local alignment score of this frame against the reference.
             local_double = pyopa.align_double(s1, s2, self.env)
             if local_double[0] > best_score:
                 best_score = local_double[0]
                 best_seq_idx = i
         best_translation = SeqRecord.SeqRecord(
             frames[best_seq_idx],
             id=self._species_name,
             description=record.description,
             name=record.name)
     except Exception as exc:
         # Narrowed from a bare `except:` (which also swallowed
         # KeyboardInterrupt/SystemExit); chain the cause so the original
         # traceback is preserved for debugging.
         raise ValueError("Problem with sequence format!", ref_og_seq.seq) from exc
     return best_translation
예제 #25
0
 def start_record(self):
     """Reset parser state and begin a fresh, empty SeqRecord."""
     empty_seq = Seq.Seq("", self.alphabet)
     record = SeqRecord.SeqRecord(empty_seq)
     record.description = ""
     record.name = ""
     self.data = record
     # No reference is active and no sequence lines collected yet.
     self._current_ref = None
     self._sequence_lines = []
예제 #26
0
def create_reference_region_with_specific_repeats(reference_vntr,
                                                  desired_repeats_count,
                                                  output_name,
                                                  flanks=30000,
                                                  repeat_patterns=None):
    """Write a FASTA file containing the VNTR region rebuilt with a chosen
    number of repeat units.

    :param reference_vntr: VNTR object providing chromosome, start point,
        length and repeat segments
    :param desired_repeats_count: number of repeat units to place in the region
    :param flanks: flanking length on each side; None means use the whole
        chromosome sequence
    :param repeat_patterns: optional repeat units to cycle through instead of
        the reference VNTR's own segments
    """
    record = SeqRecord.SeqRecord('')
    chromosome_seq = get_chromosome_reference_sequence(reference_vntr.chromosome)
    vntr_end = reference_vntr.start_point + reference_vntr.get_length()

    # Determine the region boundaries around the VNTR.
    if flanks is None:
        region_start, region_end = 0, len(chromosome_seq)
    else:
        region_start = reference_vntr.start_point - flanks
        region_end = vntr_end + flanks

    if repeat_patterns is not None:
        repeats = repeat_patterns
    else:
        repeats = reference_vntr.get_repeat_segments()

    # Left flank, then the requested number of repeats (cycling through the
    # available units), then the right flank.
    new_sequence = chromosome_seq[region_start:reference_vntr.start_point]
    for idx in range(desired_repeats_count):
        new_sequence += repeats[idx % len(repeats)]
    new_sequence += chromosome_seq[vntr_end:region_end]

    record.seq = Seq.Seq(new_sequence)
    with open(output_name, 'w') as output_handle:
        SeqIO.write([record], output_handle, 'fasta')
예제 #27
0
def fasta_nts2prt(fsta_fh, host='coli', fsta_prt_fh=None):
    """
    Translates nucleotide fasta to amino acid fasta

    :param fsta_fh: path to fasta file (only the first record is translated)
    :param host: host organism e.g. E. coli
    :param fsta_prt_fh: path to output fasta protein sequence; defaults to
        '<input>_prt<ext>' next to the input file
    :returns fsta_seq_prt: fasta protein sequence (str)
    :raises ValueError: if the input fasta contains no records
    """
    from dms2dfe.lib.convert_seq import cds2aas
    from Bio import SeqIO, Seq, SeqRecord
    from Bio.Alphabet import IUPAC

    # Read only the first record of the input file.
    fsta_seq = None
    with open(fsta_fh, 'r') as fsta_data:
        for fsta_record in SeqIO.parse(fsta_data, "fasta"):
            fsta_seq = str(fsta_record.seq)
            break
    if fsta_seq is None:
        # Previously an empty input crashed later with an opaque NameError.
        raise ValueError("No fasta records found in %s" % fsta_fh)
    if fsta_prt_fh is None:  # `is None`, not `== None`
        fsta_prt_fh = "%s_prt%s" % (splitext(fsta_fh)[0], splitext(fsta_fh)[1])
    fsta_seq_prt = cds2aas(fsta_seq, host, stop_codon='*')
    fsta_seq_prt_id = splitext(basename(fsta_fh))[0] + '_prt'
    fsta_data_prt = SeqRecord.SeqRecord(Seq.Seq(fsta_seq_prt, IUPAC.protein),
                                        id=fsta_seq_prt_id,
                                        description='')
    # `with` guarantees the handle is closed even if SeqIO.write raises
    # (the previous explicit open/close leaked the handle on error).
    with open(fsta_prt_fh, "w") as fsta_prt_f:
        SeqIO.write(fsta_data_prt, fsta_prt_f, "fasta")
    return fsta_seq_prt
예제 #28
0
def identify_similar_regions_for_vntrs_using_blat():
    """For every reference VNTR, search for similar regions with blat in
    parallel worker processes; ids of VNTRs with a similar hit are appended
    to 'similar_vntrs.txt'."""
    from multiprocessing import Process, Semaphore, Manager
    reference_vntrs = load_unique_vntrs_data()

    # One FASTA record per VNTR: 30 bp left flank + pattern + 30 bp right flank.
    records = []
    for ref_vntr in reference_vntrs:
        rec = SeqRecord.SeqRecord('')
        rec.seq = Seq.Seq(ref_vntr.left_flanking_region[
            -30:] + ref_vntr.pattern + ref_vntr.right_flanking_region[:30])
        rec.id = str(ref_vntr.id)
        records.append(rec)
    vntr_structures_file = 'reference_vntr_structures.fa'
    with open(vntr_structures_file, 'w') as output_handle:
        SeqIO.write(records, output_handle, 'fasta')

    # At most 7 blat searches run concurrently; each worker releases the
    # semaphore when it finishes.
    sema = Semaphore(7)
    manager = Manager()
    result_list = manager.list()
    workers = []
    for ref_vntr in reference_vntrs:
        sema.acquire()
        worker = Process(target=find_similar_region_for_vntr,
                         args=(sema, ref_vntr, vntr_structures_file, result_list))
        workers.append(worker)
        worker.start()

    for worker in workers:
        worker.join()
    collected = list(result_list)
    with open('similar_vntrs.txt', 'a') as out:
        for vntr_id in collected:
            out.write('%s\n' % vntr_id)
예제 #29
0
    def split_hypermuts(self, hm_columns):
        """Produce the hypermut positive and hypermut negative alignments.

        :param hm_columns: 1-based alignment column numbers flagged as
            hypermutation sites.

        Side effects: sets ``self.hm_pos_aln`` (only the flagged columns)
        and ``self.hm_neg_aln`` (all remaining columns); returns ``self``.

        NOTE(review): this block is Python 2 code (``xrange`` and the
        builtin ``reduce``) -- it will not run unmodified on Python 3.
        """
        # Convert to unique, sorted, 0-based column indices.
        hm_indices = list(set(map(lambda n: n - 1, hm_columns)))
        hm_indices.sort()

        # soi is either a seq or index - handle appropriately
        def hyp_reducer(soi, i):
            # Append the single column at index i to the accumulated
            # alignment (or to the single column at index soi).
            seq1 = self[:, soi:soi + 1] if type(soi) == int else soi
            seq2 = self[:, i:i + 1]
            return seq1 + seq2

        # Seed the reduction with an empty alignment carrying the same ids,
        # then concatenate every flagged column onto it.
        init = type(self)([
            SeqRecord.SeqRecord(Seq.Seq(''), id=self[i].id)
            for i in xrange(len(self))
        ])
        self.hm_pos_aln = reduce(hyp_reducer, hm_indices, init)

        if hm_indices:
            # Negative alignment: stitch together every column range lying
            # between consecutive hypermut columns.
            self.hm_neg_aln = self[:, :hm_indices[0]]
            n_hypermut = len(hm_indices)
            for i in range(0, n_hypermut - 1):
                start_i = hm_indices[i] + 1
                stop_i = hm_indices[i + 1]
                self.hm_neg_aln += self[:, start_i:stop_i]

            self.hm_neg_aln += self[:, hm_indices[-1] + 1:]

        else:
            # No hypermut columns: the negative alignment is the whole thing.
            self.hm_neg_aln = self

        return self
예제 #30
0
def find_similar_region_for_vntr(sema, reference_vntr, ref_file, result_list):
    """Worker: blat-search the VNTR query (30 bp flanks + pattern) against
    ref_file and append the VNTR id to result_list if a similar region is
    found. Always releases `sema` on exit.

    :param sema: multiprocessing.Semaphore acquired by the parent before start
    :param reference_vntr: VNTR object providing id, pattern and flanks
    :param ref_file: FASTA database to search against
    :param result_list: shared (Manager) list collecting matching VNTR ids
    """
    from Bio import SearchIO
    vntr_id = reference_vntr.id
    try:
        q = reference_vntr.left_flanking_region[
            -30:] + reference_vntr.pattern + reference_vntr.right_flanking_region[:
                                                                                  30]
        search_index = vntr_id
        qfile = settings.BLAST_TMP_DIR + str(vntr_id) + '_' + str(
            search_index) + '_query.fasta'
        with open(qfile, "w") as output_handle:
            my_rec = SeqRecord.SeqRecord(seq=Seq.Seq(q),
                                         id='query',
                                         description='')
            SeqIO.write([my_rec], output_handle, 'fasta')
        output = 'blat_out/output_%s_%s.psl' % (vntr_id, search_index)
        command = 'blat -q=dna -oneOff=1 -tileSize=8 -stepSize=3 -minIdentity=75 %s %s %s' % (
            ref_file, qfile, output)
        os.system(command)
        os.system('rm %s' % qfile)
        try:
            qresult = SearchIO.read(output, 'blat-psl')
            if is_false_vntr_hit(qresult, reference_vntr):
                print('there is similar sequence for %s' % vntr_id)
                result_list.append(vntr_id)
        except ValueError:
            # SearchIO.read raises ValueError when blat produced no result.
            pass
    finally:
        # Release the slot unconditionally: if an unexpected exception
        # escaped before the release, the parent's acquire loop would
        # deadlock waiting for a slot that never frees.
        sema.release()
예제 #31
0
파일: krseq.py 프로젝트: karolisr/PhyloMill
def translate_cds(record, table):
    """Translate every CDS feature of `record` (honouring codon_start) and
    return a new SeqRecord holding the concatenated protein sequence with
    selected annotations copied over.

    Translation tables:
    http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
    """

    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.Alphabet import generic_protein

    # Collect all CDS features.
    cds_features = [f for f in record.features if f.type.lower() == 'cds']

    # Extract each CDS, trimming by its codon_start offset when present.
    extractions = []
    for feature in cds_features:
        extracted = feature.extract(record)
        codon_start = int(feature.qualifiers['codon_start'][0])
        if codon_start > 1:
            extracted = extracted[codon_start - 1:len(extracted)]
        extractions.append(extracted)

    # Translate every extraction and join into a single protein string.
    protein_str = ''.join(
        str(e.seq.translate(table=table)) for e in extractions)

    rec = SeqRecord(Seq(protein_str, generic_protein))

    # Carry over identity and annotation metadata from the source record.
    rec.name = record.id.split('.')[0]
    rec.description = record.description
    rec.annotations['gi'] = record.annotations['gi']
    rec.annotations['organism'] = record.annotations['organism']
    rec.annotations['taxonomy'] = record.annotations['taxonomy']
    rec.id = record.id

    return rec