def directStringSeq():
    """Print a fixed DNA string next to several sequences derived from it.

    The hard-coded string is passed, in order, to ``complement``,
    ``reverse_complement``, ``transcribe``, ``back_transcribe`` and
    ``translate``. All five results are computed first; afterwards the
    input and each result are printed on their own line behind a label.
    """
    dna = "GCTGTTATGGGTCGTTGGAAGGGTGGTCGTGCTGCTGGTTAG"

    # The list literal is evaluated top to bottom, so every helper has been
    # called before the first line is printed.
    labelled = [
        ('my_string = ', dna),
        ('Compl = ', complement(dna)),
        ('reCompl = ', reverse_complement(dna)),
        ('transc = ', transcribe(dna)),
        ('bTransc = ', back_transcribe(dna)),
        ('transl = ', translate(dna)),
    ]
    for label, value in labelled:
        print(label, value)
def get_gc_coverage(fasta, binwidth, output):
    """Write a per-base, sliding-window GC track for every record of a FASTA file.

    Each record's sequence is padded at both ends with a mirrored copy of
    its first/last ``(binwidth - 1) // 2`` letters. For every position ``i``
    of the original sequence the value ``GC(padded[i:i + binwidth])`` is
    stored, and the resulting vector is written to ``output`` with one
    value per base (``span=1, step=1``) starting at coordinate 0.

    :param fasta: FASTA input handed to ``SeqIO.parse``
    :param binwidth: number of letters in the window passed to ``GC``
    :param output: target opened for writing with ``pybw.open``
    """
    half_width = (binwidth - 1) // 2

    fasta_dict = {record.id: record.seq for record in SeqIO.parse(fasta, "fasta")}
    header = [(chrom, len(seq)) for chrom, seq in fasta_dict.items()]

    with pybw.open(output, 'w') as outbw:
        outbw.addHeader(header)
        for chrom, sequence in fasta_dict.items():
            left_flank = sequence[0:half_width]
            # A slice that starts at -0 is the WHOLE sequence, so with
            # half_width == 0 the right pad used to be a reversed copy of
            # the entire chromosome. Use an explicit empty slice instead.
            right_flank = sequence[-half_width:] if half_width else sequence[:0]

            # pad the ends of the chromosome with mirrored sequence
            # (complement of the reverse complement of each flank)
            padded = (
                complement(reverse_complement(left_flank))
                + sequence
                + complement(reverse_complement(right_flank))
            )

            # should really vectorize this
            gc_vector = np.zeros(len(sequence))
            for i in range(len(sequence)):
                gc_vector[i] = GC(padded[i:i + binwidth])
            outbw.addEntries(chrom, 0, values=gc_vector, span=1, step=1)
 def drawnucleotides(self):
     """Draw one text label per nucleotide of the current window on ``self.ax``.

     ``self.calculatefontsize()`` is always called first. Nothing else
     happens unless ``self.display_sequence`` is truthy. Otherwise the
     genome sequence is sliced from ``window_left`` through ``window_right``
     (inclusive), replaced by its reverse complement when the window is not
     ``top_positive``, and every letter is placed at its entry of
     ``x_array`` with y = 0.
     """
     self.calculatefontsize()
     if not self.display_sequence:
         return

     genome_seq = self.genome_window.genome.seq
     first = self.genome_window.window_left
     stop = self.genome_window.window_right + 1  # right edge is inclusive
     sequence = genome_seq[first:stop]

     if not self.genome_window.top_positive:
         sequence = sequence.reverse_complement()

     columns = zip(self.genome_window.x_array, sequence, complement(sequence))
     # nt_comp and text are bound here but not used by the code in this block.
     for x, nt, nt_comp in columns:
         text = self.ax.text(
             x, 0, nt,
             ha='center', va='bottom',
             fontsize=self.fontsize - self.font_spacer,
             color=self.color_dictionary[nt],
             **self.font_kwargs)
Esempio n. 4
0
def add_nucl(S_1, S_2, p_number_seq, count_nucl):
    """
    Adds random nucleotides to the two strands until they can be linked.

    While no link is possible, one random nucleotide (drawn from A/C/T/G
    with the module-level weights) is added per step: with probability
    ``p_number_seq`` it is appended to the right end of S_1, otherwise it
    is prepended to the left end of S_2. A link is possible as soon as the
    complement of the last ``count_nucl`` letters of S_1 occurs in S_2, or
    the complement of the first ``count_nucl`` letters of S_2 occurs in S_1.

    :param S_1: the first sequence
    :param S_2: the second sequence
    :param p_number_seq: the probability that a nucleotide attaches to the first strand
    :param count_nucl: the number of nucleotides that is enough for linkage
    :return: linked sequence
    """
    # k-mer of S_1 -> list of its start positions, counted from the left.
    version_for_1 = {}
    for i in range(len(S_1) - count_nucl + 1):
        version_for_1.setdefault(S_1[i:(i + count_nucl)], []).append(i)

    # k-mer of S_2 -> list of "letters to the right of the k-mer". S_2 only
    # grows on the left, so these offsets stay valid while it is extended.
    # The range and the offset are derived from len(S_2) (the original used
    # len(S_1) for both), so strands of different lengths are indexed
    # consistently with the update inside the loop below.
    version_for_2 = {}
    for i in range(len(S_2) - count_nucl + 1):
        version_for_2.setdefault(S_2[i:(i + count_nucl)],
                                 []).append(len(S_2) - i - count_nucl)

    while (complement(S_1[-count_nucl:]) not in version_for_2) and (complement(
            S_2[:count_nucl]) not in version_for_1):
        if random() < p_number_seq:
            S_1 = S_1 + choices(['A', 'C', 'T', 'G'],
                                weights=[A_weig, C_weig, T_weig, G_weig])[0]
            version_for_1.setdefault(S_1[-count_nucl:],
                                     []).append(len(S_1) - count_nucl)
        else:
            S_2 = choices(['A', 'C', 'T', 'G'],
                          weights=[A_weig, C_weig, T_weig, G_weig])[0] + S_2
            version_for_2.setdefault(S_2[:count_nucl],
                                     []).append(len(S_2) - count_nucl)

    tail_complement = complement(S_1[-count_nucl:])
    if tail_complement in version_for_2:
        # The end of S_1 pairs with a k-mer inside S_2: keep S_1 and append
        # the complement of the last ``index + 1`` letters of S_2.
        index = choice(version_for_2[tail_complement])
        return S_1 + complement(S_2[(-index - 1):])

    # Otherwise the loop ended because the start of S_2 pairs with a k-mer
    # inside S_1: cut S_1 at that k-mer and append the complement of S_2.
    index = choice(version_for_1[complement(S_2[:count_nucl])])
    return S_1[:index] + complement(S_2)
Esempio n. 5
0
def molecular_weight(
    seq, seq_type="DNA", double_stranded=False, circular=False, monoisotopic=False
):
    """Return the molecular mass of a DNA, RNA or protein sequence as a float.

    Every letter must be unambiguous. Nucleotide sequences are taken to
    carry a 5' phosphate.

    Arguments:
     - seq: String or Biopython sequence object.
     - seq_type: "DNA" (the default), "RNA", or "protein".
     - double_stranded: Also add the mass of the complementary strand?
     - circular: Is the molecule circular (has no ends)?
     - monoisotopic: Use the monoisotopic mass tables?

    >>> print("%0.2f" % molecular_weight("AGC"))
    949.61
    >>> print("%0.2f" % molecular_weight(Seq("AGC")))
    949.61

    Passing the sequence type explicitly is the better habit, e.g. with
    plain strings:

    >>> print("%0.2f" % molecular_weight("AGC", "DNA"))
    949.61
    >>> print("%0.2f" % molecular_weight("AGC", "RNA"))
    997.61
    >>> print("%0.2f" % molecular_weight("AGC", "protein"))
    249.29

    Raises ValueError for an unknown ``seq_type``, for a letter missing from
    the chosen weight table, and for a double-stranded protein.
    """
    # Rewritten by Markus Piotrowski, 2014

    # Minimum formatting: drop all whitespace and upper-case the letters.
    seq = "".join(str(seq).split()).upper()

    if seq_type == "DNA":
        weight_table = (
            IUPACData.monoisotopic_unambiguous_dna_weights
            if monoisotopic
            else IUPACData.unambiguous_dna_weights
        )
    elif seq_type == "RNA":
        weight_table = (
            IUPACData.monoisotopic_unambiguous_rna_weights
            if monoisotopic
            else IUPACData.unambiguous_rna_weights
        )
    elif seq_type == "protein":
        weight_table = (
            IUPACData.monoisotopic_protein_weights
            if monoisotopic
            else IUPACData.protein_weights
        )
    else:
        raise ValueError("Allowed seq_types are DNA, RNA or protein, not %r" % seq_type)

    water = 18.010565 if monoisotopic else 18.0153

    def _linear_mass(strand):
        # Sum of the letter weights minus one water per bond between
        # neighbouring letters of a linear strand.
        return sum(weight_table[letter] for letter in strand) - (len(strand) - 1) * water

    try:
        weight = _linear_mass(seq)
    except KeyError as e:
        raise ValueError(
            "%s is not a valid unambiguous letter for %s" % (e, seq_type)
        ) from None
    if circular:
        # Closing the ring removes one more water.
        weight -= water

    if not double_stranded:
        return weight

    if seq_type == "protein":
        raise ValueError("protein sequences cannot be double-stranded")

    partner = seq
    if seq_type == "DNA":
        partner = complement(seq, inplace=False)  # TODO: remove inplace=False
    elif seq_type == "RNA":
        partner = complement_rna(seq)

    # The complementary strand is weighed with the same table and rules.
    weight += _linear_mass(partner)
    if circular:
        weight -= water
    return weight