Example #1
def _as_seq_object(dna, alphabet=IUPAC.ambiguous_dna):
    """Return dna as a Seq, wrapping plain strings with the given alphabet."""
    if not isinstance(dna, Seq):
        dna = Seq(dna, alphabet)
    return dna
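A quick usage sketch for this helper (assumes a pre-1.78 Biopython, where Bio.Alphabet still exists):

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

assert isinstance(_as_seq_object("GATC"), Seq)   # plain strings get wrapped
s = Seq("GATC", IUPAC.ambiguous_dna)
assert _as_seq_object(s) is s                    # Seq objects pass through untouched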
Example #2
 def test_non_iupac_letters(self):
     """Test if non-IUPAC letters raise a TypeError."""
     with self.assertRaises(TypeError):
         seq = FormattedSeq(Seq("GATCZ"))
Example #3
#!/usr/bin/env python
# encoding: utf-8
"""
Created by John DiBaggio on 2016-11-30
"""
__author__ = 'johndibaggio'

import sys
import os
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

argv = list(sys.argv)
input_file = open(argv[1])
output_file = open(argv[2], 'w+')

dna = input_file.read().strip()

dna_sequence = Seq(dna, IUPAC.unambiguous_dna)

a_count = dna_sequence.count("A")
c_count = dna_sequence.count("C")
g_count = dna_sequence.count("G")
t_count = dna_sequence.count("T")

output_file.write("DNA: " + dna + "\nA: " + str(a_count) + "\nC: " +
                  str(c_count) + "\nG: " + str(g_count) + "\nT: " +
                  str(t_count))
output_file.close()
input_file.close()
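The counting logic of the script can be checked in isolation; a minimal sketch (again requiring a pre-1.78 Biopython for the alphabet argument):

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC

dna_sequence = Seq("AACCGT", IUPAC.unambiguous_dna)
assert (dna_sequence.count("A"), dna_sequence.count("C"),
        dna_sequence.count("G"), dna_sequence.count("T")) == (2, 2, 1, 1)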
Example #4
        feature_type = seq_record.features[feat].type
        if feature_type == 'gene':
            try:
                feature_start_zero_based_numbering = seq_record.features[
                    feat].location.nofuzzy_start
                feature_end_zero_based_numbering = seq_record.features[
                    feat].location.nofuzzy_end
                feature_strand = seq_record.features[feat].strand
                gene_id = str(seq_record.features[feat].qualifiers['gene'][0])
                sequence_slice = gb_entire_sequence_joined[
                    feature_start_zero_based_numbering:
                    feature_end_zero_based_numbering]
                if feature_strand == 1:
                    gene_array.at[accession, gene_id] = sequence_slice
                if feature_strand == -1:
                    sequence_slice_BP = Seq(sequence_slice)
                    sequence_slice_rvscomp = str(
                        sequence_slice_BP.reverse_complement())
                    gene_array.at[accession, gene_id] = sequence_slice_rvscomp
            except KeyError:
                pass

# Count and record instances of each gene in the dataframe

gene_instances_list = []
#print('Counts of each gene extracted from GenBank file:')
for gene in gene_list_rem_dups:
    gene_to_append = gene, len(gene_array) - gene_array[gene].isnull().sum()
    gene_instances_list.append(gene_to_append)
    #print(gene, len(gene_array)-gene_array[gene].isnull().sum())
gene_instances_list_DF = pd.DataFrame(gene_instances_list,
Example #5
 def test_overlapping_cut_sites(self):
     """Check if overlapping recognition sites are properly handled."""
     seq = Seq("CATGCACGCATGCATGCACGC")
     self.assertEqual(SphI.search(seq), [13, 17])
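For context, Bio.Restriction enzymes expose their recognition site and a search() method that reports 1-based cut positions, which is what the [13, 17] above are; a minimal sketch:

from Bio.Seq import Seq
from Bio.Restriction import SphI

print(SphI.site)                       # GCATGC
print(SphI.search(Seq("AAGCATGCAA")))  # a list with one 1-based cut position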
Example #6
def revcomp(seq):
    
    #create a sequence object
    my_seq = Seq(seq)
    
    return str(my_seq.reverse_complement())
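A one-line check of the helper above:

assert revcomp("ATGC") == "GCAT"  # complement TACG, reversed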
Example #7
def cluster(file_names, candidates, min_copy_number, FSL, workers):
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    from Bio import SeqIO
    from Bio import pairwise2
    from subprocess import Popen, PIPE
    from collections import OrderedDict
    import os, shutil
    import math

    makelog("Clustering")
    cmd_list = [
    './vsearch-2.7.1/bin/vsearch',
    '--cluster_fast',file_names['file_candidates_fasta'],
    #'--consout',file_names['file_representative'],
    '--threads',str(workers),
    '--strand','both',
    '--clusters',file_names['file_temp_cluster'],
    '--iddef','1',
    '--id', '0.8']
    makelog(' '.join(cmd_list))
    p = Popen(cmd_list, stdout=PIPE, stderr=PIPE)
    out,err = p.communicate()
    #for stdout_line in iter(popen.stdout.readline, ""):
    #    yield stdout_line
    #popen.stdout.close()
    #return_code = popen.wait()
    ##if return_code:
    #    raise subprocess.CalledProcessError(return_code, cmd)

    #for c in iter(lambda: p.stdout.read(), ''):
        #makelog(c)
    makelog("Clustering done")
    makelog("Filtering clusters")
    #count for minimum file length
    clusters_dic = {}
    list_dir = os.listdir(file_names['file_temp_cluster_dir'])
    makelog("Initial clusters: %i" % (len(list_dir),))
    for fn in list_dir:
        if os.path.isfile(file_names['file_temp_cluster_dir'] + fn):
            fh = open(file_names['file_temp_cluster_dir'] + fn)
            n = 0
            for line in fh:
                if line.startswith(">"):
                    n += 1
                    id_seq = line[1:line.find(" ")]
                    if fn in clusters_dic:
                        clusters_dic[fn].append(id_seq)
                    else:
                        clusters_dic[fn] = [id_seq]
            fh.close()
    #shutil.rmtree(file_names['file_temp_cluster_dir'])
    
    #        os.unlink(file_names['file_temp_cluster_dir'] + fn)
    #        if n < args.min_copy_number:
    #            df.loc[df['candidate_id'] == 'id_seq', 'status'] =  'low_cn'
    #            os.remove(fn)
    #            continue
    #clusters_dic = loadcluster(cluster_candidates_file + ".clstr")
    filtered_clusters = filtercluster(clusters_dic, min_copy_number, candidates)
    unique_clusters = set(filtered_clusters.keys())
    num_clusters = len(unique_clusters)
    #loop through clusters
    for current_cluster in unique_clusters:
        #search candidates for that cluster
        #all possible 2-combinations of candidates
        candidates_in_cluster = filtered_clusters[current_cluster]
        #porc_of_clusters = int(math.ceil(len(candidates_in_cluster) * 0.4))
        #new_min_copy_number = max(min_copy_number,porc_of_clusters)
        new_min_copy_number = min_copy_number
        sum_diff_fs_cluster = 0
        for x in candidates_in_cluster:
            totally_different_fs = True
            cand_x = candidates[x]

            fs_right_1 = cand_x['fs_right']
            fs_left_1 = cand_x['fs_left']

            if fs_left_1 == '' or fs_right_1 == '' or not isinstance(fs_left_1,str) or not isinstance(fs_right_1,str):
                totally_different_fs = False
                continue
            if not complex_enough(fs_right_1) or not complex_enough(fs_left_1):
                totally_different_fs = False
                continue

            fs_right_1 = fs_right_1.upper()
            fs_left_1 = fs_left_1.upper()
            fs_right_1_plus_mite =  cand_x['seq'][-FSL:].upper() + fs_right_1
            fs_left_1_plus_mite = fs_left_1 +  cand_x['seq'][0:FSL].upper()

            at_least_one = False
            for y in candidates_in_cluster:
                cand_y = candidates[y]
                if cand_x['candidate_id'] == cand_y['candidate_id']:
                    continue
                # R1 x R2
                # L1 x L2
                # L1RC x R2
                # R1RC x L2
                # some MITEs could be at the end or beginning of the sequence, thus lacking flanking seqs
                fs_right_2 = cand_y['fs_right']
                fs_left_2 = cand_y['fs_left']
                
                # empty strings (or pandas NaN) mean missing flanking seqs, so require non-empty strings
                if fs_right_2 == '' or fs_left_2 == '' or not isinstance(fs_right_2,str) or not isinstance(fs_left_2,str):
                    continue
                if not complex_enough(fs_right_2) or not complex_enough(fs_left_2):
                    continue

                fs_right_2 = fs_right_2.upper()
                fs_left_2 = fs_left_2.upper()
                fs_right_2_plus_mite =  cand_y['seq'][-FSL:].upper() + fs_right_2
                fs_left_2_plus_mite = fs_left_2 + cand_y['seq'][0:FSL].upper()

                fs_left_1_rc = Seq(fs_left_1).reverse_complement()
                fs_right_1_rc = Seq(fs_right_1).reverse_complement()
                
                fs_left_1_plus_mite_rc = Seq(fs_left_1_plus_mite).reverse_complement()
                fs_right_1_plus_mite_rc = Seq(fs_right_1_plus_mite).reverse_complement()
                
                #calculate scores
                score_r1_r2 = pairwise2.align.localms(fs_right_1, fs_right_2, 1, -1, -1, -1,score_only=True)
                score_l1_l2 = pairwise2.align.localms(fs_left_1, fs_left_2, 1, -1, -1, -1,score_only=True)
                score_l1rc_r2 = pairwise2.align.localms(fs_left_1_rc, fs_right_2, 1, -1, -1, -1,score_only=True)
                score_r1rc_l2 = pairwise2.align.localms(fs_right_1_rc, fs_left_2, 1, -1, -1, -1,score_only=True)

                #since a MITE might be longer, we also look a few nt inside 
                score_r1_r2_plus_mite = pairwise2.align.localms(fs_right_1, fs_right_2_plus_mite, 1, -1, -1, -1,score_only=True)
                score_l1_l2_plus_mite = pairwise2.align.localms(fs_left_1, fs_left_2_plus_mite, 1, -1, -1, -1,score_only=True)
                score_l1rc_r2_plus_mite = pairwise2.align.localms(fs_left_1_rc, fs_right_2_plus_mite, 1, -1, -1, -1,score_only=True)
                score_r1rc_l2_plus_mite = pairwise2.align.localms(fs_right_1_rc, fs_left_2_plus_mite, 1, -1, -1, -1,score_only=True)


                #TODO remove 
                #since a MITEs might be longer, we also look for the FS inside
                #score_r1_m2 = pairwise2.align.localms(fs_right_1, seq_2, 1, -1, -1, -1,score_only=True)
                #score_l1_m2 = pairwise2.align.localms(fs_left_1, seq_2, 1, -1, -1, -1,score_only=True)
                #score_r1rc_m2 = pairwise2.align.localms(fs_right_1_rc, seq_2, 1, -1, -1, -1,score_only=True)
                #score_l1rc_m2 = pairwise2.align.localms(fs_left_1_rc, seq_2, 1, -1, -1, -1,score_only=True)
                #max_score = max(score_r1_r2,score_l1_l2,score_l1rc_r2,score_r1rc_l2,score_r1_m2,score_r1rc_m2,score_l1rc_m2)

                #get max score
                max_score = max(score_r1_r2,score_l1_l2,score_l1rc_r2,score_r1rc_l2,score_r1_r2_plus_mite,score_l1_l2_plus_mite,score_l1rc_r2_plus_mite,score_r1rc_l2_plus_mite)
                if max_score == []:
                    max_score = 0
                max_score /= FSL
                at_least_one = True
                if max_score > 0.5:
                    totally_different_fs = False
                    break
               
            if totally_different_fs and at_least_one:
                sum_diff_fs_cluster += 1
            if sum_diff_fs_cluster >= new_min_copy_number:
                break

        if sum_diff_fs_cluster < new_min_copy_number:
            #makelog(' '.join(filtered_clusters[current_cluster]) + " filtered by flanking sequence")
            del filtered_clusters[current_cluster]
        #else:
        #    makelog(' '.join(filtered_clusters[current_cluster]) + " not filtered by flanking sequence")

    #again to remove < MIN_COPY_NUMBER elements
    #filtered_clusters = filtercluster(filtered_clusters, args.min_copy_number, positions, df, 'low_copy_number_2')
    ordered_cluster = OrderedDict(sorted(filtered_clusters.items(), key=lambda t: t[0]))

    makelog("Clusters: " + str(len(filtered_clusters)))
    buffer_rec = []
    #import ipdb; ipdb.set_trace()
    #for candidate in candidates.values():
    count = 1
    family_number = 1  # ordered_cluster.keys().index(clus)
    buffer_nr = []
    output_gff = open(file_names['file_gff'],"w") 
    output_gff.write("##gff-version 3\n")

    for clus, seqs in ordered_cluster.items():
        one_per_family = False
        tsd_family = []
        for seq in seqs:
            candidate = candidates[seq]
            candidate['id'] = "MITE_T_%s|%s|%s|%s|%s|%s|F%s" % (str(count),candidate['record'],candidate['start'],candidate['end'],candidate['tsd'],candidate['tir_len'],family_number)
            candidate['description'] = "%s CANDIDATE_ID:%s" % (candidate['description'], candidate['candidate_id'].split('|')[0])
            record = SeqRecord(Seq(candidate['seq']), id=candidate['id'], description=candidate['description'])
            buffer_rec.append(record)
            
            write_row =  '\t'.join([candidate['record'], 'MITE_Tracker','MITE',str(candidate['start']), str(candidate['end']),'.','+','.','ID='+candidate['id'] ]) 
            output_gff.write(write_row + '\n')

            tsd_family.append(candidate['tsd'])
            if not one_per_family:
                one_per_family = True
                record_family = record
                buffer_nr.append(record_family)
            count += 1
        from statistics import mode,StatisticsError
        try:
            tsd_consensus = mode(tsd_family)
            record_family.description = '%s COMMON_TSD:%s' % (record_family.description, tsd_consensus)
        except StatisticsError:
            pass
        family_number += 1


    SeqIO.write(buffer_nr, file_names['file_representative'] , "fasta")
    SeqIO.write(buffer_rec, file_names['all_file'] , "fasta")
    cluster2seq(ordered_cluster, candidates, file_names['families_file'])
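The flanking-sequence filter above hinges on pairwise2.align.localms with score_only=True, which returns a bare alignment score that is then normalized by FSL and compared against 0.5; a minimal sketch of that call (the FSL value is illustrative):

from Bio import pairwise2

fs1 = fs2 = "ACGTACGTAC"
# local alignment with match=1, mismatch=-1, gap open=-1, gap extend=-1
score = pairwise2.align.localms(fs1, fs2, 1, -1, -1, -1, score_only=True)
FSL = 10
print(score / FSL > 0.5)  # identical flanks score far above the cutoff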
Example #8
 def setUp(self):
     self.sequences = [
         SeqRecord(Seq("AAA"), id="s1"),
         SeqRecord(Seq("A-G"), id="s2"),
         SeqRecord(Seq("-A-"), id="s3"),
     ]
Example #9
def extract_pairwise(align_json=None, outfile=None,
        outfmt=None, refreg=None,
        debug=False,
    ):
    outh = sys.stdout if outfile is None else open(outfile, 'w')
    
    if outfmt == 'nuc_fa' or outfmt == 'prot_fa':
        jaln = load_slot_json(align_json, 'padded_alignments')
        if refreg is None:
            for newname, alignment in list(jaln.items()):
                nucstr = ''.join(t[2] for t in alignment if t[3] != -1)
                nucstr = nucstr.replace('*', 'N')
                print('>%s' % newname, file=outh)                
                if outfmt == 'nuc_fa':
                    print(sequtils.wrap(nucstr), file=outh)
                else:
                    s = Seq(nucstr[:(old_div(len(nucstr),3))*3])
                    print(sequtils.wrap(str(s.translate())), file=outh)
        else:
            refmap = {sequtils.parse_seq_id(k)['ref']:k for k in list(jaln.keys())}
            chrom, ref_s, ref_e = sequtils.region_to_tuple(refreg)
            ref_s = ref_s - 1
            alignment = jaln[refmap[chrom]]
            
            # Get alignment start
            for aln_s in range(len(alignment)):
                if alignment[aln_s][0] == ref_s:
                    break
            # advance past columns with no query coordinate, mirroring the end-of-alignment handling below
            while alignment[aln_s][3] == -1:
                aln_s += 1
            
            # Get alignment end
            for aln_e in range(len(alignment)-1, -1, -1):
                if alignment[aln_e][0] == ref_e:
                    break
            while alignment[aln_e][3] == -1:
                aln_e -= 1

            nucstr = ''.join(t[2] for t in alignment[aln_s:aln_e] if t[3] != -1)
            nucstr = nucstr.replace('*', 'N')
            print('>%s (%s)' % (refmap[chrom], refreg), file=outh)
            if outfmt == 'nuc_fa':            
                print(sequtils.wrap(nucstr), file=outh)
            else:
                s = Seq(nucstr[:(old_div(len(nucstr),3))*3])
                print(sequtils.wrap(str(s.translate())), file=outh)

    elif outfmt == 'aln_fa':
        jaln = load_slot_json(align_json, 'padded_alignments')
        for newname, alignment in list(jaln.items()):
            aid = sequtils.parse_seq_id(newname)
            rstr = ''.join(t[1] for t in alignment).replace('*', 'N')
            qstr = ''.join(t[2] for t in alignment).replace('*', 'N')
            print('>ref|%s|' % aid['ref'], file=outh)
            print(sequtils.wrap(rstr), file=outh)
            print('>sid|%s|' % aid['sid'], file=outh)
            print(sequtils.wrap(qstr), file=outh)

    elif outfmt == 'amp_gtf':
        jgtf = load_slot_json(align_json, 'padded_gtf')
        print('\n'.join(_ for _ in jgtf), file=outh)

    elif outfmt == 'tsv':
        jaln = load_slot_json(align_json, 'padded_alignments')
        for newname, alignment in list(jaln.items()):
            print('# %s' % newname, file=outh)
            for l in alignment:
                print('\t'.join(str(_) for _ in l), file=outh)
Example #10
def _alignment_record(sequence):
    return SeqRecord(
        Seq(sequence, alphabet=Alphabet.Gapped(Alphabet.generic_dna)))
Example #11
def seqrecord(sequence_id, sequence_text, alphabet=Alphabet.generic_dna):
    """
    Quick shortcut to make a SeqRecord
    """
    return SeqRecord(Seq(sequence_text, alphabet), id=sequence_id)
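Usage of the shortcut, assuming the module-level imports it relies on (Seq, SeqRecord, Alphabet):

rec = seqrecord("s1", "ACGT")
print(rec.id, str(rec.seq))  # s1 ACGT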
Example #12
 def proRC(self):
     return str(Seq(self.seq, IUPAC.unambiguous_dna).reverse_complement())
Example #13

from Bio.Seq import Seq

Dna = input("enter DNA sequence ")
Dna = Seq(Dna)
Mrna = Dna.transcribe()
protein = Mrna.translate()
print(protein)
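With transcribe() in place the pipeline behaves as expected; a worked example:

from Bio.Seq import Seq

dna = Seq("ATGGCCATT")
mrna = dna.transcribe()     # AUGGCCAUU
protein = mrna.translate()  # MAI
print(mrna, protein)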



Example #14
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.SeqUtils import molecular_weight, MeltingTemp
from Bio.Restriction import AllEnzymes
from pathlib import Path
from exmemo import Workspace
from functools import lru_cache
from more_itertools import one

work = Workspace.from_path(__file__)
work.sequence_dir = work.root_dir / 'sequences'
work.plasmid_dir = work.sequence_dir / 'plasmids'
work.plasmid_db = work.sequence_dir / 'plasmids.xlsx'
work.fragment_db = work.sequence_dir / 'fragments.xlsx'
work.oligo_db = work.sequence_dir / 'oligos.xlsx'

DnaSeq = lambda x: Seq(x.upper(), generic_dna)
param_pattern = r'(?P<key>\w+)=((?P<value>[^"]\S*)|(?P<value_quoted>".*?"))(\s|$)'


def get_seq(tag):
    return dispatch_to_tag(
        tag,
        p=get_plasmid_seq,
        f=get_fragment_seq,
        o=get_oligo_seq,
    )


def get_mw(tag):
    return molecular_weight(
        seq=get_seq(tag),
Example #15
    def simulate_read_with_error_model(cls,
                                       genome,
                                       ErrorModel,
                                       i,
                                       always_forward=True):
        """Form a read from one genome (or sequence) according to an
        ErrorModel
        returns a string
        Args:
            genome (string): sequence or genome of reference
            ErrorModel (ErrorModel): an ErrorModel class
            i (int): a number identifying the read
        Returns:
            string: a string representing a single read
        """
        # ErrorModel.read_length = ErrorModel.read_length - 1
        np_random = np.random.RandomState(seed=i)

        read_length = ErrorModel.read_length

        if len(genome) <= read_length:
            genome = "".join([genome, "N" * (read_length - len(genome) + 1)])

        record = SeqRecord(Seq(genome, IUPAC.unambiguous_dna),
                           id=f'genome_{i}',
                           description='')

        sequence = record.seq
        header = record.id

        # generate the forward read
        forward_start = np_random.randint(
            low=0, high=max(len(record.seq) - read_length + 1, 1))

        forward_end = forward_start + read_length

        generate_forward = np_random.randint(low=0, high=2)

        if generate_forward or always_forward:

            bounds = (forward_start, forward_end)
            # create a perfect read
            forward = SeqRecord(Seq(str(sequence[forward_start:forward_end]),
                                    IUPAC.unambiguous_dna),
                                id='%s_%s' % (header, i),
                                description='')
            # add the indels, the qual scores and modify the record accordingly
            forward.seq = ErrorModel.introduce_indels(forward, 'forward',
                                                      sequence, bounds)
            forward = ErrorModel.introduce_error_scores(forward, 'forward')
            forward.seq = ErrorModel.mut_sequence(forward, 'forward')

            return str(forward.seq)

        else:
            insert_size = ErrorModel.random_insert_size()
            try:
                reverse_start = forward_end + insert_size
                reverse_end = reverse_start + read_length
                assert reverse_end < len(record.seq)
            except AssertionError:
                reverse_end = np_random.randint(low=read_length,
                                                high=len(record.seq))
                reverse_start = reverse_end - read_length

            bounds = (reverse_start, reverse_end)
            reverse = SeqRecord(Seq(
                rev_comp(str(sequence[reverse_start:reverse_end])),
                IUPAC.unambiguous_dna),
                                id='%s_%s' % (header, i),
                                description='')
            reverse.seq = ErrorModel.introduce_indels(reverse, 'reverse',
                                                      sequence, bounds)
            reverse = ErrorModel.introduce_error_scores(reverse, 'reverse')
            reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse')

            return str(reverse.seq)
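The per-read seeding above (RandomState(seed=i)) is what makes each read's placement reproducible; a minimal sketch of that property:

import numpy as np

i, read_length, genome_len = 7, 20, 100
a = np.random.RandomState(seed=i).randint(0, genome_len - read_length + 1)
b = np.random.RandomState(seed=i).randint(0, genome_len - read_length + 1)
assert a == b  # the same read id always yields the same start position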
Example #16
                style = ''
                end = i - 1
                features.append({
                    'style': '-->',
                    'sel': [begin, end],
                    'position': position
                })
    return features

    #prof=cons_prof(alignment)
    #pylab.plot(prof)


if __name__ == '__main__':
    human_h2a_z_core = Seq(
        'SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLI-KATIAGGGVIPHIHKSLIG'
    )
    xenopus_h2a_core = Seq(
        'TRSSRAGLQFPVGRVHRLLRKGNYAE-RVGAGAPVYLAAVLEYLTAEILELAGNAARDNKKTRIIPRHLQLAVRNDEELNKLLGRVTIAQGGVLPNIQSVLLP'
    )

    # human_h2a_z_core=Seq('SRSQRAGLQFPVGRIHRHLKSRTTSHGRVGATAAVYSAAILEYLTAEVLELAGNASKDLKVKRITPRHLQLAIRGDEELDSLIKATIAGGGVIPHIHKSLIG')
    msa = MultipleSeqAlignment(
        [SeqRecord(xenopus_h2a_core, id='H2A', name='H2A')])
    features = get_hist_ss_in_aln_for_shade(msa, below=True)
    # features=[{'style':'fill:$\uparrow$','sel':[5,10],'text':'test'}]
    print(features)
    shade_aln2png(msa,
                  filename='default',
                  shading_modes=['charge_functional'],
                  legend=False,
Example #17
def write_novel_report(novelmiRNALListFile, featureFile, clusterFile,
                       rnafoldCmdTmp):
    dir_tmp = os.path.split(os.path.abspath(clusterFile))[0]
    samppleName_tmp = '_'.join(novelmiRNALListFile.split('_')[:-3])
    novelmiRNALOriginalList = []
    #clusterNameContentDic = {}
    clusterNameProbabilityDic = {}
    with open(novelmiRNALListFile, 'r') as inf:
        line = inf.readline()
        probabilityIndex = line.strip().split(',').index('probability')
        clusterNameIndex = line.strip().split(',').index('clusterName')
        line = inf.readline()
        while line != '':
            content = line.strip().split(',')
            probability = content[probabilityIndex]
            clusterName = content[clusterNameIndex]
            if clusterName not in novelmiRNALOriginalList:
                novelmiRNALOriginalList.append(clusterName)
                clusterNameProbabilityDic.update({clusterName: probability})
            line = inf.readline()
    # The majority of the paired miRNAs' pruned precursor RNA sequences should be identical. However, the pruned precursor RNA sequences might differ slightly
    # at the head and tail. This happens with quite low probability, due to pruning error.
    # Therefore, the paired miRNA's precursor of each identified novel miRNA should be corrected in order to plot.
    with open(featureFile, 'r') as inf:
        totalContent = inf.readlines()

    with open(featureFile, 'r') as inf:
        line = inf.readline()
        content = line.strip().split('\t')
        clusterNameIndex = content.index('clusterName')
        pruned_precusor_seqIndex = content.index('pruned_precusor_seq')
        pruned_precusor_strIndex = content.index('pruned_precusor_str')
        upstreamDistanceIndex = content.index('upstreamDistance')
        downstreamDistanceIndex = content.index('downstreamDistance')
        armTypeIndex = content.index('armType')
        pair_stateIndex = content.index('pair_state')
        i = 1
        line = inf.readline()
        while line != '':
            content = line.strip().split('\t')
            if content[clusterNameIndex] in novelmiRNALOriginalList:
                armTypeTmp = content[armTypeIndex]
                pair_state = content[pair_stateIndex]
                pruned_precusor_seq = content[pruned_precusor_seqIndex]
                pruned_precusor_str = content[pruned_precusor_strIndex]
                upstreamDistance = content[upstreamDistanceIndex]
                downstreamDistance = content[downstreamDistanceIndex]
                if pair_state == 'Yes':
                    if upstreamDistance != 'None':
                        if int(upstreamDistance) <= 44:
                            correctLineContent = totalContent[
                                i - 1].strip().split('\t')
                            if (correctLineContent[pair_stateIndex]
                                ) == 'Yes' and (
                                    correctLineContent[armTypeIndex] in [
                                        'arm5', 'arm3'
                                    ]) and (correctLineContent[armTypeIndex] !=
                                            armTypeTmp):
                                correctLineContent[
                                    pruned_precusor_seqIndex] = pruned_precusor_seq
                                correctLineContent[
                                    pruned_precusor_strIndex] = pruned_precusor_str
                            totalContent[
                                i - 1] = '\t'.join(correctLineContent) + '\n'
                            #print content[clusterNameIndex]
                        elif int(downstreamDistance) <= 44:
                            correctLineContent = totalContent[
                                i + 1].strip().split('\t')
                            if (correctLineContent[pair_stateIndex]
                                ) == 'Yes' and (
                                    correctLineContent[armTypeIndex] in [
                                        'arm5', 'arm3'
                                    ]) and (correctLineContent[armTypeIndex] !=
                                            armTypeTmp):
                                correctLineContent[
                                    pruned_precusor_seqIndex] = pruned_precusor_seq
                                correctLineContent[
                                    pruned_precusor_strIndex] = pruned_precusor_str
                            totalContent[
                                i + 1] = '\t'.join(correctLineContent) + '\n'
                            #print content[clusterNameIndex]
                        else:
                            pass
            line = inf.readline()
            i = i + 1
    with open(featureFile[:-4] + '_corrected.tsv', 'w') as outf:
        for t, totalContentTmp in enumerate(totalContent):
            # Correct the armType if it is wrongly allocated, because the armType is based on the stableClusterSeq, which may be too short.
            # So use the clusterSeq to re-correct the armType.
            if t == 0:
                content = totalContentTmp.strip().split('\t')
                clusterNameIndex = content.index('clusterName')
                clusterSeqIndex = content.index('clusterSeq')
                stableClusterSeqIndex = content.index('stableClusterSeq')
                pruned_precusor_seqIndex = content.index('pruned_precusor_seq')
                pruned_precusor_strIndex = content.index('pruned_precusor_str')
                armTypeIndex = content.index('armType')
                outf.write(totalContentTmp)
            else:
                content = totalContentTmp.strip().split('\t')
                clusterSeq = content[clusterSeqIndex]
                clusterSeqNew = ''
                for nucl in clusterSeq:
                    if nucl == 'T':
                        clusterSeqNew = clusterSeqNew + 'U'
                    else:
                        clusterSeqNew = clusterSeqNew + nucl
                stableClusterSeq = content[stableClusterSeqIndex]
                pruned_precusor_seq = content[pruned_precusor_seqIndex]
                pruned_precusor_str = content[pruned_precusor_strIndex]
                clusterSeqStr = getMiRNAStructure(clusterSeqNew,
                                                  pruned_precusor_seq,
                                                  pruned_precusor_str)
                pattern = re.compile(r'\(.*\)')
                if pattern.search(clusterSeqStr) is not None:
                    if content[armTypeIndex] == 'loop':
                        outf.write(totalContentTmp)
                    else:
                        content[armTypeIndex] = 'loop'
                        outf.write('\t'.join(content) + '\n')
                        #print '%s is modified'%(content[clusterNameIndex])
                else:
                    outf.write(totalContentTmp)

    clusterNameFeatureDic = {}
    precursorSeqclusterNameDic = {}
    with open(featureFile[:-4] + '_corrected.tsv', 'r') as inf:
        line = inf.readline()
        content = line.strip().split('\t')
        clusterNameIndex = content.index('clusterName')
        seqCountIndex = content.index('seqCount')
        readCountSumIndex = content.index('readCountSum')
        stableClusterSeqIndex = content.index('stableClusterSeq')
        alignedClusterSeqLabel = content.index('alignedClusterSeq')
        headUnstableLengthLabel = content.index('headUnstableLength')
        tailUnstableLengthLabel = content.index('tailUnstableLength')
        precusorSeqIndex = content.index('pruned_precusor_seq')
        precusorStrIndex = content.index('pruned_precusor_str')
        armTypeIndex = content.index('armType')
        line = inf.readline()
        while line != '':
            content = line.strip().split('\t')
            clusterName = content[clusterNameIndex]
            seqCount = content[seqCountIndex]
            readCountSum = content[readCountSumIndex]
            stableClusterSeq = str(
                Seq(content[stableClusterSeqIndex], generic_dna).transcribe())
            alignedClusterSeq = content[alignedClusterSeqLabel]
            headUnstableLength = int(content[headUnstableLengthLabel])
            tailUnstableLength = int(content[tailUnstableLengthLabel])
            precusorSeq = content[precusorSeqIndex]
            precusorStr = content[precusorStrIndex]
            armType = content[armTypeIndex]
            strand = clusterName[-1]
            chr = clusterName.split(':')[2]
            startPos = int(
                clusterName.split(':')[3][:-1].split('_')[0].strip())
            endPos = int(clusterName.split(':')[3][:-1].split('_')[1].strip())
            headDashCountTmp = headDashCount2(alignedClusterSeq)
            tailDashCountTmp = tailDashCount2(alignedClusterSeq)
            if strand == '+':
                startPos = startPos - headDashCountTmp + headUnstableLength
                endPos = endPos + tailDashCountTmp - tailUnstableLength
            else:
                startPos = startPos - tailDashCountTmp + tailUnstableLength
                endPos = endPos + headDashCountTmp - headUnstableLength
                #startPos = startPos - headDashCountTmp + tailUnstableLength
                #endPos = endPos + tailDashCountTmp - headUnstableLength
            if precusorSeq != 'None':
                if clusterName not in clusterNameFeatureDic.keys():
                    clusterNameFeatureDic.update({
                        clusterName: [
                            chr, startPos, endPos, strand, stableClusterSeq,
                            seqCount, readCountSum, armType, precusorSeq,
                            precusorStr
                        ]
                    })
                if (precusorSeq, chr,
                        strand) not in precursorSeqclusterNameDic.keys():
                    precursorSeqclusterNameDic.update({
                        (precusorSeq, chr, strand): [clusterName]
                    })
                else:
                    precursorSeqclusterNameDic[(precusorSeq, chr,
                                                strand)].append(clusterName)
            line = inf.readline()

    clusterNameClusterSeqDic = {}
    # Parse the clusterFile to get the detailed reads information for each cluster.
    with open(clusterFile, 'r') as inf:
        contentTmp = inf.readlines()
    labelList = []
    for index, item in enumerate(contentTmp):
        if 'Cluster Name:' in item:
            labelList.append(index)

    for k in range(len(labelList)):
        if k != len(labelList) - 1:
            subContentTmp = contentTmp[labelList[k]:labelList[k + 1]]
        else:
            subContentTmp = contentTmp[labelList[k]:]
        clusterNameTmp = subContentTmp[0].strip().split(' ')[2]
        if clusterNameTmp in clusterNameFeatureDic.keys():
            #print clusterNameTmp
            startTmp = int(clusterNameTmp[:-1].split(':')[-1].split('_')[0])
            endTmp = int(clusterNameTmp[:-1].split(':')[-1].split('_')[1])
            if clusterNameTmp[-1] == '+':
                dashedClusterSeqStart = startTmp - headDashCount2(
                    subContentTmp[4].strip())
                dashedClusterSeqEnd = endTmp + tailDashCount2(
                    subContentTmp[4].strip())
            else:
                dashedClusterSeqStart = startTmp - tailDashCount2(
                    subContentTmp[4].strip())
                dashedClusterSeqEnd = endTmp + headDashCount2(
                    subContentTmp[4].strip())
            readContentlist = []
            for subitem in subContentTmp[5:]:
                if clusterNameTmp[-1] == '+':
                    ReadSeqStart = dashedClusterSeqStart + headDashCount2(
                        subitem.split('\t')[0])
                    ReadSeqEnd = dashedClusterSeqEnd - tailDashCount2(
                        subitem.split('\t')[0])
                else:
                    #ReadSeqStart = dashedClusterSeqStart + headDashCount2(subitem.split('\t')[0])-2
                    #ReadSeqEnd = dashedClusterSeqEnd - tailDashCount2(subitem.split('\t')[0])-2

                    ReadSeqStart = dashedClusterSeqStart + tailDashCount2(
                        subitem.split('\t')[0])
                    ReadSeqEnd = dashedClusterSeqEnd - headDashCount2(
                        subitem.split('\t')[0])
                readCount = int(subitem.strip().split('\t')[1])
                #readSeq = removeDash(subitem.split('\t')[0])
                readSeq = str(
                    Seq(removeDash(subitem.split('\t')[0]),
                        generic_dna).transcribe())
                readContentlist.append(
                    (readSeq, ReadSeqStart, ReadSeqEnd, readCount))
            clusterNameClusterSeqDic.update({clusterNameTmp: readContentlist})
    #print len(clusterNameClusterSeqDic)
    #print clusterNameClusterSeqDic['unmapped_mirna_HUVEC_JH-04:miRCluster_154_18:chr1:38212279_38212296-']

    # Output the novel miRNA report csv file
    #print precursorSeqclusterNameDic[('CUGACUGCCGAGGGGGCCCUGGCCUGGAUCCAUGCUGGGCAGAAGCAGCUGGACACUGACCAGGACCCCCCAGGGCCGGAGGAACC', 'chr9', '+')]
    outf1 = open(
        os.path.join(
            os.path.join(dir_tmp,
                         samppleName_tmp + '_novel_miRNAs_report.csv')), 'w')
    outf1.write(
        'Novel miRNA name,Probability,Chr,Start Pos,End Pos,Strand,Mature miRNA sequence,Arm type,Passenger miRNA sequence,Mature miRNA read Count,Passenger miRNA read Count,Precursor miRNA sequence,Precursor miRNA structure\n'
    )
    i = 1
    while len(novelmiRNALOriginalList) >= 1:
        novelmiRNA = novelmiRNALOriginalList[0]
        sampleName = '_'.join(novelmiRNA.split(':')[0].split('_')[2:])
        precursorSeq = clusterNameFeatureDic[novelmiRNA][-2]
        clusterNameList = precursorSeqclusterNameDic[(
            precursorSeq, clusterNameFeatureDic[novelmiRNA][0],
            clusterNameFeatureDic[novelmiRNA][3])]
        #print clusterNameList
        for clusterNameListTmp in chunkInto2(clusterNameList):
            #print clusterNameListTmp
            if len(clusterNameListTmp) == 1:
                matureMiRNAName = clusterNameListTmp[0]
                passengerMiRNAName = 'None'
            elif len(clusterNameListTmp) == 2:
                if clusterNameListTmp[0] in clusterNameProbabilityDic.keys(
                ) and clusterNameListTmp[1] in clusterNameProbabilityDic.keys(
                ):
                    if int(
                            clusterNameFeatureDic[clusterNameListTmp[0]][6]
                    ) >= int(clusterNameFeatureDic[clusterNameListTmp[1]][6]):
                        matureMiRNAName = clusterNameListTmp[0]
                        passengerMiRNAName = clusterNameListTmp[1]
                    else:
                        matureMiRNAName = clusterNameListTmp[1]
                        passengerMiRNAName = clusterNameListTmp[0]
                elif clusterNameListTmp[0] in clusterNameProbabilityDic.keys(
                ) and clusterNameListTmp[
                        1] not in clusterNameProbabilityDic.keys():
                    matureMiRNAName = clusterNameListTmp[0]
                    passengerMiRNAName = clusterNameListTmp[1]
                else:
                    matureMiRNAName = clusterNameListTmp[1]
                    passengerMiRNAName = clusterNameListTmp[0]
                if clusterNameFeatureDic[passengerMiRNAName][7] == 'loop':
                    passengerMiRNAName = 'None'
                    #print 'Error happens at: %s'%(passengerMiRNAName)
            chr = clusterNameFeatureDic[matureMiRNAName][0]
            startPos = clusterNameFeatureDic[matureMiRNAName][1]
            endPos = clusterNameFeatureDic[matureMiRNAName][2]
            strand = clusterNameFeatureDic[matureMiRNAName][3]
            matureMiRNASeq = clusterNameFeatureDic[matureMiRNAName][4]
            readCountSumMatureMiRNA = clusterNameFeatureDic[matureMiRNAName][6]
            armType = clusterNameFeatureDic[matureMiRNAName][7]
            matureMiRNAPrecusorSeq = clusterNameFeatureDic[matureMiRNAName][8]
            matureMiRNAPrecusorStr = clusterNameFeatureDic[matureMiRNAName][9]
            matureReadContentlistRaw = clusterNameClusterSeqDic[
                matureMiRNAName]
            if passengerMiRNAName == 'None':
                passengerMiRNASeq = 'None'
                passengerReadContentlistRaw = 'None'
                passengerReadContentlist = 'None'
                readCountSumPassengerMiRNA = 'None'
                passengerStrand = 'None'
            else:
                passengerMiRNASeq = clusterNameFeatureDic[passengerMiRNAName][
                    4]
                readCountSumPassengerMiRNA = clusterNameFeatureDic[
                    passengerMiRNAName][6]
                passengerReadContentlistRaw = clusterNameClusterSeqDic[
                    passengerMiRNAName]
                passengerStartPos = clusterNameFeatureDic[passengerMiRNAName][
                    1]
                passengerEndPos = clusterNameFeatureDic[passengerMiRNAName][2]
                passengerStrand = clusterNameFeatureDic[passengerMiRNAName][3]
            # Pad the head and tail part with '.' according to the alignment to the precursor miRNA.
            matureReadContentlist = padClusteredList(matureReadContentlistRaw,
                                                     matureMiRNASeq,
                                                     matureMiRNAPrecusorSeq,
                                                     startPos, endPos, strand)
            if passengerReadContentlistRaw != 'None':
                passengerReadContentlist = padClusteredList(
                    passengerReadContentlistRaw, passengerMiRNASeq,
                    matureMiRNAPrecusorSeq, passengerStartPos, passengerEndPos,
                    passengerStrand)
                totalReadCountSum = int(readCountSumMatureMiRNA) + int(
                    readCountSumPassengerMiRNA)
            else:
                passengerReadContentlist = 'None'
                totalReadCountSum = int(readCountSumMatureMiRNA)
            #if readCountSumPassengerMiRNA != 'None':
            #	totalReadCountSum = int(readCountSumMatureMiRNA) + int(readCountSumPassengerMiRNA)
            #else:
            #	totalReadCountSum = int(readCountSumMatureMiRNA)
            if armType != 'loop':
                novelmiRNANameNew = 'novel_miRNA_' + str(i)
                outf1.write(','.join([
                    novelmiRNANameNew,
                    clusterNameProbabilityDic[matureMiRNAName], chr,
                    str(startPos),
                    str(endPos), strand, matureMiRNASeq, armType,
                    passengerMiRNASeq,
                    str(readCountSumMatureMiRNA),
                    str(readCountSumPassengerMiRNA), matureMiRNAPrecusorSeq,
                    matureMiRNAPrecusorStr
                ]))
                outf1.write('\n')
                # Prepare to plot the precursor structure and cluster sequences into a pdf file.
                with open(os.path.join(dir_tmp, 'precusorTmp.fa'),
                          'w') as outf:
                    fa_tmp = '\n'.join([
                        '>' + novelmiRNANameNew, matureMiRNAPrecusorSeq,
                        matureMiRNAPrecusorStr
                    ])
                    outf.write(fa_tmp + '\n')
                f1 = os.path.join(dir_tmp, 'precusorTmp.fa')
                f2 = os.path.join(dir_tmp, 'precusorTmp.str')
                #print '%s -d 0 < %s > %s'%(rnafoldCmdTmp, f1, f2)
                #os.system('cd %s'%(dir_tmp))
                #os.system('%s -d 0 < %s > %s'%(rnafoldCmdTmp, f1, f2))
                os.system('cd %s && %s -d 0 < %s > %s' %
                          (dir_tmp, rnafoldCmdTmp, f1, f2))
                #f3 = os.path.join(dir_tmp, 'precusorTmp_ss.ps')
                f3 = os.path.join(dir_tmp, 'novel_miRNA_' + str(i) + '_ss.ps')
                #print f3
                #print os.path.isfile(f3)
                f4 = os.path.join(dir_tmp, 'novel_miRNA_' + str(i) + '.pdf')
                # Extract the coordinate information of the precusor sequence
                #with open(f3, 'r') as infTmp:
                #clusterNameProbabilityDic[matureMiRNAName]
                #print '*******************'
                #if novelmiRNANameNew == 'novel_miRNA_1':
                #	print len(matureMiRNAPrecusorSeq)
                #	print matureMiRNAPrecusorSeq
                #	print matureReadContentlist
                #	print len(matureReadContentlist[0][0])
                #	print len(matureReadContentlist[1][0])
                #	print passengerReadContentlist
                #	if passengerReadContentlist != 'None':
                #		print len(passengerReadContentlist[0][0])
                #		print len(passengerReadContentlist[1][0])
                #print '*******************'
                creatPDF(sampleName, novelmiRNANameNew,
                         clusterNameProbabilityDic[matureMiRNAName], chr,
                         startPos, endPos, strand, armType,
                         readCountSumMatureMiRNA, totalReadCountSum,
                         matureMiRNAPrecusorSeq, matureMiRNAPrecusorStr,
                         matureMiRNASeq, passengerMiRNASeq, f3, f4,
                         matureReadContentlist, passengerReadContentlist)

                i = i + 1
            # Delete the clusterNames in clusterNameListTmp
            for clusterName in clusterNameListTmp:
                #print '%s is removed'%(clusterName)
                if clusterName in novelmiRNALOriginalList:
                    novelmiRNALOriginalList.remove(clusterName)
        #if novelmiRNA == 'unmapped_mirna_HUVEC_JH-04:miRCluster_921_28:chr7:94176805_94176832-':
        #	print len(matureMiRNAPrecusorSeq)
        #	print matureMiRNAPrecusorSeq
        #	print matureReadContentlist
        #	print clusterNameClusterSeqDic[novelmiRNA][0]
        #	print clusterNameFeatureDic[matureMiRNAName][1]
        #	print clusterNameFeatureDic[matureMiRNAName][2]
        #	print len(matureReadContentlist[0][0])
        #	print len(matureReadContentlist[1][0])
        #	print passengerReadContentlist
        #	if passengerReadContentlist != 'None':
        #		print len(passengerReadContentlist[0][0])
        #		print len(passengerReadContentlist[1][0])

    outf1.close()
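The manual T-to-U loop and the Seq(..., generic_dna).transcribe() calls used throughout this function are equivalent on plain DNA strings; a quick check (pre-1.78 Biopython for generic_dna):

from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

s = "GATTACA"
assert str(Seq(s, generic_dna).transcribe()) == s.replace("T", "U")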
Example #18
def main(alignment_file, edlevel_step, confint_p, spec_ids_str, permut_n, orf_crd_table, nucl_list):

	global aminoacids

	nucl_list = list(nucl_list)
	aminoacids = list(aminoacids)
	spec_ids_list = spec_ids_str.split(',')
	edlevel_dict = edlevel_dict_init(edlevel_step)
	edlevels = sorted(edlevel_dict.keys())
	l = len(edlevels)
	orf_crd_dict = make_orf_crd_dict(orf_crd_table)

	for align_obj in align_parse(alignment_file):
		if (spec_ids_list[0] not in align_obj.species_list) or (spec_ids_list[1] not in align_obj.species_list):
			continue
		align_length = len(align_obj.align_dict[spec_ids_list[0]])
		seq_id = list(align_obj.seqinfo_dict[spec_ids_list[0]].keys())[0]
		orf_crds = orf_crd_dict[seq_id]

		for i in range(orf_crds[0], orf_crds[1]):
			if align_obj.align_dict[spec_ids_list[0]][i] != 'A':
				if not align_obj.edinfo_dict[spec_ids_list[0]].get(i+1):
					continue
			if align_obj.align_dict[spec_ids_list[1]][i] == "-":
				continue

			if align_obj.edinfo_dict[spec_ids_list[1]].get(i+1):
				let2 = 'A'
			else:
				let2 = align_obj.align_dict[spec_ids_list[1]][i]

			let1 = 'A'

			codon, aacid, shift = get_codon(align_obj, spec_ids_list[0], orf_crds[0], i)
			
			syn = True

			for nucl in nucl_list:
				codon_new = codon[:]
				codon_new[shift] = nucl
				if aacid != str(Seq(''.join(codon_new)).translate()):
					syn = False

			if syn and aacid not in aminoacids:
				continue

			if align_obj.edinfo_dict[spec_ids_list[0]].get(i+1):
				edlevel = align_obj.edinfo_dict[spec_ids_list[0]][i+1].edlevel
				if syn:
					for j in range(1, l):
						if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
							edlevel_dict[edlevels[j - 1]].total_S_E += 1
				else:
					for j in range(1, l):
						if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
							edlevel_dict[edlevels[j - 1]].total_N_E += 1
			else:
				if syn:
					for j in range(1, l):
						edlevel_dict[edlevels[j - 1]].total_S_A += 1
				else:
					for j in range(1, l):
						edlevel_dict[edlevels[j - 1]].total_N_A += 1

			if let1 == let2:
				continue

			if let2 in nucl_list:
				codon_new = codon[:]
				codon_new[shift] = let2
				aacid_new = str(Seq(''.join(codon_new)).translate())
				if aacid == aacid_new:
					syn = True
				else:
					syn = False

				if align_obj.edinfo_dict[spec_ids_list[0]].get(i+1):
					edlevel = align_obj.edinfo_dict[spec_ids_list[0]][i+1].edlevel
					if syn:
						for j in range(1, l):
							if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
								edlevel_dict[edlevels[j - 1]].dS_e += 1
					else:
						for j in range(1, l):
							if (edlevel < edlevels[j]) and (edlevel >= edlevels[j - 1]):
								edlevel_dict[edlevels[j - 1]].dN_e += 1
				else:
					if syn:
						for j in range(1, l):
							edlevel_dict[edlevels[j - 1]].dS += 1
					else:
						for j in range(1, l):
							edlevel_dict[edlevels[j - 1]].dN += 1

	s = edlevel_dict[sorted(edlevel_dict.keys())[0]]
	dnds_a = (float(s.dN)/s.total_N_A)/(float(s.dS)/s.total_S_A)
	for i in sorted(edlevel_dict.keys()):
		edlevel_dict[i].p_and_confint_print(confint_p, spec_ids_str, permut_n, dnds_a)
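The synonymy test in main() boils down to translating a codon before and after a substitution; a minimal illustration:

from Bio.Seq import Seq

codon = list("CTG")   # leucine
codon_new = codon[:]
codon_new[2] = "A"    # CTA is also leucine: a synonymous third-position change
assert str(Seq("".join(codon)).translate()) == str(Seq("".join(codon_new)).translate())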
Example #19
        print refseq
        sys.exit('Read %s is empty' % f_align.get_all_seqs()[1].id)

    if options.threshold < idencount / basecount and basecount > min_length:
        reads[ID] = tmp
        ref[ID] = refseq
        if len(refseq) > maxlen[0]:
            maxlen[0] = len(refseq)
            maxlen[1] = ID
        ID += 1
        countreads_afterreverse += 1
    else:
        ts = r_align.get_seq_by_num(1).tostring()
        tid = f_align.get_all_seqs()[1].id
        disc_seq.append(
            SeqRecord(Seq(ts, generic_nucleotide), id=tid, description=''))
        tseq = f_align.get_all_seqs()[1]
        disc_h.write('>%s\n' % tseq.id)
        disc_h.write(tseq.seq.tostring().replace('-', '') + '\n')

disc_h.close()

print >> sys.stderr, '%d reads were above the threshold (discarded), %d reads left' % (
    countreads - countreads_afterreverse, countreads_afterreverse)
try:
    print >> sys.stderr, 'dropped %d reads with wrong length' % sff_droppedreads_length
except:
    pass
print >> sys.stderr, 'forward: %d, reverse: %d' % (count_forward,
                                                   count_reverse)
Example #20
            pHash[pid] = 1
        #print "############################ pid = " + str(pid)
        if pid != int(patternId) and int(patternId) != 0:
            #i += 1  # result query number may not equal to num !!!
            continue
    box = random.randrange(r, r + klength)
    for p in range(r, r + klength):
        if boxsize >= 2 and p == box:
            query += "("
            for t in range(0, boxsize):
                query += alphabet[t]
            query += ")"
        else:
            query += seqStr[p]
    # id is very important for multiple K but name and desc is optional
    # and only for user to check query info
    query_seq = SeqIO.SeqRecord(Seq(query, generic_dna),
                                id=str(i),
                                description="dim=" + str(klength))
    query_list.append(query_seq)
    i += 1

SeqIO.write(query_list, query_file, "fasta")
query_file.close()

for key in pHash:
    patternDistributionFile.write(str(key) + " " + str(pHash[key]) + "\n")

patternDistributionFile.close()
#print aln_ref+'\n'+aln_sample
Example #21
rnak562 = pd.DataFrame()
exon2 = 'AGAACTCCACAAACCCATC'
exon1 = 'CTTGGAAGGCCGTCTCGTGG'
cryptic = pd.Series()
downstream1 = 'CTCTCTAAAAAAAATCCTTC'

for var in rnareads.index:
    rr = pd.Series(rnareads.loc[var, 'K562'].split(' '))
    rr.drop(0, inplace=True)
    for i in rr.index:
        if (exon2 not in rr.loc[i]):
            rr.drop(i, inplace=True)

    rrup = rr.apply(lambda x: x[114:134])
    casexon = str(Seq(
        lib300.varseq[var][142:162]).reverse_complement()).upper()
    if (casexon in rrup.values):
        rnak562.loc[var, 'incl'] = rrup.value_counts()[casexon]
    else:
        rnak562.loc[var, 'incl'] = 0
    if (exon1 in rrup.values):
        rnak562.loc[var, 'excl'] = rrup.value_counts()[exon1]
    else:
        rnak562.loc[var, 'excl'] = 0
    if (downstream1 in rrup.values):
        rnak562.loc[var, 'cryptic'] = rrup.value_counts()[downstream1]
    else:
        rnak562.loc[var, 'cryptic'] = 0
    rnak562.loc[var, 'rnareads'] = len(rr)
    rnak562.loc[var, 'rawreads'] = ' '.join(rr)
Example #22
trained_mm = trainer.train([known_training_seq])

if VERBOSE:
    print(trained_mm.transition_prob)
    print(trained_mm.emission_prob)

test_rolls, test_states = generate_rolls(300)

predicted_states, prob = trained_mm.viterbi(test_rolls, DiceTypeAlphabet())
if VERBOSE:
    print "Prediction probability:", prob
    Utilities.pretty_print_prediction(test_rolls, test_states, predicted_states)

# -- Baum-Welch training without known state sequences
print "Training with Baum-Welch..."
training_seq = Trainer.TrainingSequence(rolls, Seq("", DiceTypeAlphabet()))

trainer = Trainer.BaumWelchTrainer(baum_welch_mm)
trained_mm = trainer.train([training_seq], stop_training)

if VERBOSE:
    print(trained_mm.transition_prob)
    print(trained_mm.emission_prob)

test_rolls, test_states = generate_rolls(300)

predicted_states, prob = trained_mm.viterbi(test_rolls, DiceTypeAlphabet())
if VERBOSE:
    print "Prediction probability:", prob
    Utilities.pretty_print_prediction(test_rolls, test_states, predicted_states)
Example #23
"""

dirFrameCheck = src.check_create_dir(dirTmp + os.sep + "framecheck")
strTmpMarkers = dirFrameCheck + os.sep + "FirstMarkers.faa"
fOut = open(strTmpMarkers, 'w')
iMLength = args.iMLength
iTotLength = args.iTotLength

for gene in SeqIO.parse(open(dirTmp + os.sep + 'premarkers.txt'), "fasta"):
    iCount = 1
    iRemSeq = iTotLength

    mtch = re.search(r'\+', str(gene.seq))
    if not mtch:
        strMarker = str(gene.seq)
        geneMarker = SeqRecord(Seq(strMarker[0:min(iRemSeq, len(strMarker))]),
                               id=gene.id + "_#" + str(iCount).zfill(2),
                               description="")
        SeqIO.write(geneMarker, fOut, "fasta")
        iRemSeq = iRemSeq - len(geneMarker.seq)

    else:
        for strMarker in (re.split(r'\++', str(gene.seq))):
            if (iRemSeq >= iMLength and len(strMarker) >= iMLength):
                geneMarker = SeqRecord(
                    Seq(strMarker[0:min(iRemSeq, len(strMarker))]),
                    id=gene.id + "_#" + str(iCount).zfill(2),
                    description="")
                SeqIO.write(geneMarker, fOut, "fasta")
                iCount += 1
                iRemSeq = iRemSeq - len(geneMarker)
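The re.split call above breaks each sequence on runs of '+' markers; for reference:

import re

print(re.split(r'\++', "ATGC++GGTT+AAC"))  # ['ATGC', 'GGTT', 'AAC']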
Example #24
def mapgenome(genome_fn, reference_fn, delta_prefix):

    NO_ALIGN_STR = "ERROR: Could not find any alignments for"

    # run nucmer
    cmd = [nucmer_path, '--prefix', delta_prefix, reference_fn, genome_fn]
    devnull = open(os.devnull, 'w')
    proc = subprocess.Popen(cmd, stdout=devnull, stderr=subprocess.PIPE)
    _, out_stderr = proc.communicate()
    devnull.close()
    if proc.returncode:
        raise Exception(out_stderr)

    delta_fn = delta_prefix + '.delta'

    # get the first reference sequence only
    ref_record = next(SeqIO.parse(reference_fn, 'fasta'))
    ref_seqid = ref_record.id
    ref_len = len(ref_record)

    genome_seq_mapped = np.asarray(['.'] * ref_len)
    genome_count_mapped = np.zeros(ref_len, dtype=int)
    for record in SeqIO.parse(genome_fn, 'fasta'):
        genome_seqid = record.id

        # run show-aligns
        align_fn = delta_prefix + '.align'
        with open(align_fn, 'wb') as align_handler:
            cmd = [show_aligns_path, delta_fn, ref_seqid, genome_seqid]
            proc = subprocess.Popen(cmd,
                                    stdout=align_handler,
                                    stderr=subprocess.PIPE)
            _, out_stderr = proc.communicate()

        if proc.returncode:
            if out_stderr.startswith(NO_ALIGN_STR):
                os.remove(align_fn)
                continue
            else:
                raise Exception(out_stderr)

        # load alignments
        with open(align_fn, 'rb') as align_handler:
            mar = strainest.mummer.MummerAlignmentReader(align_handler)
            for al in mar:
                ref_seq = np.asarray(list(al.seq1))
                genome_seq = np.asarray(list(al.seq2))
                genome_seq = genome_seq[np.where(ref_seq != '.')]
                genome_seq_mapped[al.start1 - 1:al.end1] = genome_seq
                genome_count_mapped[al.start1 - 1:al.end1] += 1

        os.remove(align_fn)

    os.remove(delta_fn)

    # remove repeats
    genome_seq_mapped[genome_count_mapped > 1] = '.'

    # sub '.' with '-'
    genome_seq_mapped[genome_seq_mapped == '.'] = '-'

    # write mapped genome
    out_seq = Seq(''.join(genome_seq_mapped)).upper()
    out_id = re.sub(r'\s+', '_', os.path.basename(genome_fn))
    out_record = SeqRecord(out_seq, id=out_id, description='')

    return out_record
Example #25
 def createAnalysis(self, seq_str, batch_ary):
     """Restriction.Analysis creation helper method."""
     rb = Restriction.RestrictionBatch(batch_ary)
     seq = Seq(seq_str)
     return Restriction.Analysis(rb, seq)
Example #26
import Bio
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_protein, generic_rna

my_gene = Seq("ACTAGCAGCGGA", generic_dna)
print(type(my_gene))
attributes = [a for a in dir(my_gene) if not a.startswith("_")]
print(attributes)

my_transcript = my_gene.transcribe()
print(my_transcript)
print(my_transcript.alphabet)

my_protein = my_gene.translate()
print(my_protein)
print(my_protein.alphabet)

coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG", generic_dna)
myprot = coding_dna.translate(to_stop=True)
print(myprot)

seq1 = Seq("AAACGGA", generic_dna)
seq2 = Seq("GGAGAT", generic_dna)
mut_seq = seq1.tomutable()
mut_seq[0] = "G"
print(mut_seq)

myseq = Seq("CCAGAAACCCGGAA", generic_dna)
#find the first occurrence of the pattern
print(myseq.find("GAA"))
Example #27
 def setUp(self):
     """Set up some sequences for later use."""
     base_seq = Seq("AAAA")
     self.ecosite_seq = base_seq + Seq(EcoRI.site) + base_seq
     self.smasite_seq = base_seq + Seq(SmaI.site) + base_seq
     self.kpnsite_seq = base_seq + Seq(KpnI.site) + base_seq
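For reference, the .site strings concatenated above are the enzymes' recognition sequences:

from Bio.Restriction import EcoRI, SmaI, KpnI
print(EcoRI.site, SmaI.site, KpnI.site)  # GAATTC CCCGGG GGTACC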
Example #28
def get_custom_fasta(ref_fasta,subsectionlist,args,model_kmer_means,kmer_len):
    if (args.verbose is True):
        print ("Generating a custom fasta")
    sequencedict=dict()
    for sequence in subsectionlist:
        if (args.verbose is True):
            print (sequence)
        for record in SeqIO.parse(ref_fasta, 'fasta'):
            if (record.id == sequence):
                if (sequence not in sequencedict):
                    sequencedict[sequence]=list()
                for sections in subsectionlist[sequence]:
                    start = sections[0]
                    end = sections[1]
                    if (len(sequencedict[sequence])>0):
                        sequencedict[sequence]=str(sequencedict[sequence])+str(record.seq[sections[0]-1:sections[1]-1])
                    else:
                        sequencedict[sequence]=str(record.seq[sections[0]-1:sections[1]-1])
    if (args.verbose is True):
        print ("processing the custom fasta")
    kmer_means=dict()
    for sequence in sequencedict:
        kmer_means[record.id]=dict()
        tmp=dict()
        tmp2=dict()
        tmp["F"]=list()
        tmp["R"]=list()
        tmp["Fprime"]=list()
        tmp["Rprime"]=list()
        print ("ID", record.id)
        print ("length", len(record.seq))
        print ("FORWARD STRAND")
        # seq = Seq(sequencedict[sequence], generic_dna)
        seq = Seq(sequencedict[sequence])

        for x in range(len(seq)+1-kmer_len):
            kmer = str(seq[x:x+kmer_len])
            tmp["F"].append(float(model_kmer_means[kmer]))
        print ("REVERSE STRAND")
        seq = seq.reverse_complement()
        for x in range(len(seq)+1-kmer_len):
            kmer = str(seq[x:x+kmer_len])
            tmp["R"].append(float(model_kmer_means[kmer]))
        tmp2["Fprime"]=sklearn.preprocessing.scale(tmp["F"], axis=0, with_mean=True, with_std=True, copy=True)
        tmp2["Rprime"]=sklearn.preprocessing.scale(tmp["R"], axis=0, with_mean=True, with_std=True, copy=True)
        kmer_means[record.id]=tmp2
    '''From this dictionary we will return a pair consisting of a list of keys(lookup for sequence name) and a
    3D array each slice of which relates to the seqid,forward and reverse and then the values. This will then
    be used as a numpy shared memory multiprocessing array. We hope.
    Caution - the dictionary returns in the wrong order.
    '''

    items=kmer_means.items()
    '''for k,v in kmer_means.items():
        for x,y in kmer_means[k].items():
            print "idiot check",k,x
            '''
    items_=map(processItems,items)
    seqids,arrays=zip(*items_)
    z=len(seqids)
    print (arrays)
    r,c=list(arrays)[0].shape
    threedarray=multiprocessing.Array(ctypes.c_double,z*r*c)
    threedarrayshared_array = np.ctypeslib.as_array(threedarray.get_obj())
    a = np.array(arrays,dtype=np.float32)
    threedarrayshared_array = a
    return seqids,threedarrayshared_array
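The k-mer loops above slide a window of kmer_len across the sequence; isolated, the indexing looks like this:

seq = "ATGCGT"
kmer_len = 3
kmers = [seq[x:x + kmer_len] for x in range(len(seq) + 1 - kmer_len)]
print(kmers)  # ['ATG', 'TGC', 'GCG', 'CGT']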
Example #29
 def test_recognition_site_on_both_strands(self):
     """Check if recognition sites on both strands are properly handled."""
     seq = Seq("CTCTTCGAAGAG")
     self.assertEqual(EarI.search(seq), [3, 8])
Example #30
def extract_kmers(name,
                  fasta,
                  length,
                  pams,
                  pampos,
                  filename,
                  chroms=[],
                  minchrlen=10000,
                  processes=1):
    """Extract candidate k-mer guideRNAs with their coordinates from FASTA.

    Convention: coordinate reported is for start position of the whole probe
                in the genome in 0-based coordinates; probe includes guideRNA
                and PAM (PAM can be before or after the guide); then for plus
                strand probe continues to the right, for minus strand probe
                continues to the left

    Args:
    name: project name; a folder with this name holds intermediate and
            final files
    fasta: iterator over Bio.SeqRecord.SeqRecord objects containing chromosomes
           as returned by load_fasta()
    length: length of guideRNAs (not including PAM sequence)
    pams: list of primary and alternative PAM sequences
    pampos: position of PAM ('start' or 'end')
    filename: all k-mers will be written in this file in the format
              '<k-mer followed by PAM> <coordinates>''
    chroms: if not empty, include in analysis only chromosomes with names
            from this list
    minchrlen: include in analysis only chromosomes not shorter than this
    processes: how many processes to use; do not specify more than you have 
                on your system

    Return:
    genome info in the format [(<chromosome name>, <chromosome length>)]
        for all processed chromosomes in the order of processing
    """
    if pampos not in ['start', 'end']:
        raise util.iGuideError("'pampos' argument should be 'start' or 'end'")
    pams_extend = [(pam, pam_seq) for pam in pams
                   for pam_seq in util.expand_dna_n(pam)]
    pams_extend_rev = [(pam, str(Seq(pam_seq).reverse_complement()))
                       for pam in pams for pam_seq in util.expand_dna_n(pam)]
    genome = []
    fasta_temp = []
    for chrom in fasta:
        if len(chrom) < minchrlen:
            continue
        if chroms and chrom.id not in chroms:
            continue
        genome.append((chrom.id, len(chrom)))
        fasta_temp.append(chrom)

    #Parallelize extracting kmers from the reference genome sequences by user defined
    #processes or the number of reference sequences
    parts = len(fasta_temp)
    if processes > parts:
        processes = parts

    kmersfiles_temp = [
        tempfile.NamedTemporaryFile(dir=name, suffix='.temp%s' % i)
        for i in range(parts)
    ]

    pool = Pool(processes)
    util.print_log('poolSize %s...' % processes)

    for i in range(parts):
        pool.apply_async(extract_kmers_pool,
                         (fasta_temp[i], length, pampos, pams_extend,
                          pams_extend_rev, kmersfiles_temp[i].name))
    util.print_log('Waiting for all subprocesses done...')
    pool.close()
    pool.join()
    util.print_log('all chromosomes processed')

    util.print_log('done, merge all kmers...')
    total_count = 0
    util.warn_file_exists(filename)
    f = gzip.open(filename, 'w')
    for i in range(parts):
        for line in kmersfiles_temp[i]:
            f.write(line)
            total_count += 1

    for file in kmersfiles_temp:
        file.close()
    f.close()
    util.print_log('total k-mers written: %s' % total_count)

    return genome
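util.expand_dna_n is project code; a hypothetical stand-in that expands IUPAC 'N' positions, just to illustrate what the pams_extend lists above hold:

from itertools import product
from Bio.Seq import Seq

def expand_dna_n(pam):
    # hypothetical equivalent: enumerate every concrete base for each 'N'
    choices = [("ACGT" if c == "N" else c) for c in pam]
    return ["".join(p) for p in product(*choices)]

print(expand_dna_n("NGG"))                    # ['AGG', 'CGG', 'GGG', 'TGG']
print(str(Seq("AGG").reverse_complement()))   # CCT, as in pams_extend_rev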