Beispiel #1
0
 def get_sgrna(self):
     """Return a DataFrame listing every candidate sgRNA site in ``self.genome``.

     Each chromosome string is scanned for a 20-nt stretch followed by an
     ``NGG`` triplet, and for a ``CCN`` triplet followed by a 20-nt stretch
     (only the letters A/T/G/C in either case are accepted). The patterns
     are wrapped in a zero-width lookahead so that overlapping sites are
     all reported; a plain ``finditer`` over the 23-nt pattern resumes
     after each hit and silently drops any site overlapping it.

     The result is cached on ``self.sgrna`` and reused by later calls.

     :return: DataFrame with columns ``seqname``, ``start``, ``cut``,
         ``end``, ``sgrna`` and ``pam``. Coordinates are 0-based offsets
         into ``str(chromosome.seq)``. For ``CCN`` hits, ``sgrna`` and
         ``pam`` hold the reverse complement of the matched text.
     """
     if not hasattr(self, 'sgrna'):
         site_len = 23  # 20-nt protospacer plus 3-nt PAM
         ngg = re.compile(r'(?=([atgcATGC]{20})([atgcATGC][gG][gG]))')
         ccn = re.compile(r'(?=([cC][cC][atgcATGC])([atgcATGC]{20}))')
         columns = ['seqname', 'start', 'cut', 'end', 'sgrna', 'pam']
         frames = []
         for chromosome in self.genome:
             sequence = str(chromosome.seq)
             # The lookahead match is empty, so offsets are derived from
             # start() instead of end().
             sglist = [{
                 'seqname': chromosome.id,
                 'start': x.start(),
                 'cut': x.start() + site_len - 6,
                 'end': x.start() + site_len - 3,
                 'sgrna': x.group(1),
                 'pam': x.group(2)
             } for x in ngg.finditer(sequence)]
             sglist.extend({
                 'seqname': chromosome.id,
                 'start': x.start() + 3,
                 'cut': x.start() + 6,
                 'end': x.start() + site_len,
                 'sgrna': Seq.reverse_complement(x.group(2)),
                 'pam': Seq.reverse_complement(x.group(1))
             } for x in ccn.finditer(sequence))
             frames.append(pd.DataFrame(sglist, columns=columns))
         if frames:
             self.sgrna = pd.concat(frames, axis=0, ignore_index=True)
         else:
             # pd.concat() raises on an empty list; return an empty table.
             self.sgrna = pd.DataFrame(columns=columns)
     return self.sgrna
Beispiel #2
0
def itercodon(seq, frame, offset, table, reverse=False):
    """Yield ``(index, amino_acid)`` pairs, translating ``seq`` one triplet at a time.

    :param seq: object supporting ``len()`` and exposing ``.seq``; the text
        that gets sliced is ``str(seq.seq)``.  Presumably a Biopython
        ``SeqRecord`` -- TODO confirm against callers.
    :param frame: index of the first triplet in the forward direction; not
        used when ``reverse`` is true.
    :param offset: number of bases left out of the main loop (at the end in
        the forward direction, at the start in the reverse direction); also
        determines how many ``N`` characters pad the final partial triplet.
    :param table: translation table handed to ``Seq.translate``.
    :param reverse: when true, walk backwards from the end of the sequence
        and translate the reverse complement of each triplet.
    """
    # NOTE(review): ``stop`` is assigned but never read in this function.
    stop = 0
    if not reverse:
        # Forward direction: step through the sequence three bases at a time.
        for i in xrange(frame, len(seq) - offset, 3):
            subseq = str(seq.seq)[i:i + 3]
            assert (len(subseq) % 3 == 0), (str(seq))
            aa = Seq.translate(subseq, table)
            yield i, aa
        # NOTE(review): ``i`` is the loop variable leaking out of the ``for``
        # above; it is unbound here if the loop body never ran.
        if i + 3 != len(seq):
            # Pad the leftover bases with N so one more triplet can be
            # translated.
            # NOTE(review): this slices ``seq`` itself, whereas every other
            # branch slices ``str(seq.seq)`` -- confirm this is intended.
            subseq = seq[i + 3:] + "N" * (3 - offset)
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            # The index yielded is still that of the last loop iteration.
            yield i, aa
    else:
        # Reverse direction: ``i`` is the exclusive end of each triplet.
        for i in xrange(len(seq), offset, -3):
            # the reverse complement
            subseq = Seq.reverse_complement(str(seq.seq)[i - 3:i])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            yield i, aa
        if offset:
            # Leading ``offset`` bases, left-padded with N to a full triplet
            # before being reverse-complemented.
            subseq = Seq.reverse_complement("N" * (3 - offset) +
                                            str(seq.seq)[:offset])
            assert (len(subseq) % 3 == 0)
            aa = Seq.translate(subseq, table)
            # NOTE(review): as above, ``i`` comes from the preceding loop.
            yield i, aa
Beispiel #3
0
    def test_reverse_complement_on_proteins(self):
        """Reverse-complementing a protein sequence must raise ValueError.

        Both the module-level function and the sequence's own method are
        exercised for every entry of ``protein_seqs``.
        """
        for protein in protein_seqs:
            self.assertRaises(ValueError, Seq.reverse_complement, protein)
            self.assertRaises(ValueError, protein.reverse_complement)
Beispiel #4
0
    def test_reverse_complement_on_proteins(self):
        """Reverse-complementing a protein sequence must raise ValueError."""
        for protein in protein_seqs:
            # Function form first, then the method form, as two separate
            # expectations.
            attempts = (
                lambda: Seq.reverse_complement(protein),
                lambda: protein.reverse_complement(),
            )
            for attempt in attempts:
                with self.assertRaises(ValueError):
                    attempt()
Beispiel #5
0
 def rc_kmers(self, kmers):
     """Pair each k-mer with a canonical representative.

     ``kmers`` is walked in order. A k-mer whose reverse complement has
     already been kept as a canonical key is mapped to that reverse
     complement; any other k-mer is appended to the canonical keys and
     mapped to itself.

     :param kmers: iterable of k-mers (used as dict keys, so they must be
         hashable; presumably plain strings).
     :return: ``(keys, res)`` -- the canonical k-mers in first-seen order,
         and a dict mapping every input k-mer to its canonical form.
     """
     res = {}
     keys = []
     # Mirrors ``keys`` so membership is O(1) instead of a list scan.
     seen = set()
     for s in kmers:
         rc = Seq.reverse_complement(s)
         if rc in seen:
             res[s] = rc
         else:
             keys.append(s)
             seen.add(s)
             res[s] = s
     return keys, res
Beispiel #6
0
    def test_reverse_complement(self):
        """Seq.reverse_complement() must agree with the Seq object's methods.

        Works on a shallow copy of ``test_seqs`` with the entry at index 21
        removed; only non-protein ``Seq.Seq`` instances are checked.
        """
        candidates = copy.copy(test_seqs)
        candidates.pop(21)

        for candidate in candidates:
            if isinstance(candidate.alphabet, Alphabet.ProteinAlphabet):
                continue
            if not isinstance(candidate, Seq.Seq):
                continue

            via_function = Seq.reverse_complement(candidate)
            self.assertEqual(repr(via_function),
                             repr(candidate.reverse_complement()))
            # Reversing the reverse complement leaves the plain complement.
            self.assertEqual(repr(via_function[::-1]),
                             repr(candidate.complement()))
            self.assertEqual(str(candidate.complement()),
                             str(Seq.reverse_complement(candidate))[::-1])
            self.assertEqual(str(candidate.reverse_complement()),
                             str(Seq.reverse_complement(candidate)))
def barcode_stats(outpfile, SAMPLE_BARCODE_DICT, delim, filetype, run_cutadapt, logfile, this_dir='./', trimmedbarcodestatus = False):
    """Write a per-file table of barcode/adapter/primer pattern counts.

    For every regular file in ``this_dir`` whose name ends with ``filetype``
    and starts with a key of ``SAMPLE_BARCODE_DICT`` (prefixed with
    ``trim_`` when ``trimmedbarcodestatus`` is true), one row is written to
    ``outpfile``: the file name followed by the results of ``grep_string``
    and ``grep_string_contains`` for each of 14 patterns.

    :param outpfile: path of the table to write (overwritten).
    :param SAMPLE_BARCODE_DICT: maps sample-name prefix to a
        ``"<forward>_<reverse>"`` barcode string.
    :param delim: column delimiter for the output table.
    :param filetype: file-name suffix selecting the files to examine.
    :param run_cutadapt: when true, ``run_cutadapt_method`` is invoked after
        the table is written; otherwise the process exits via ``sys.exit``.
    :param logfile: passed through to ``run_cutadapt_method``.
    :param this_dir: directory to scan. File paths are built by plain
        concatenation, so it must end with a path separator.
    :param trimmedbarcodestatus: match ``trim_<key>`` prefixes instead of
        ``<key>``.
    """
    pattern_labels = [
        'fbarcode', 'FADAPTER', 'FWDPRIMER', 'FAD_FP', 'REV_COMPLEM_FAD_FP',
        'REVPRIMER', 'RADAPTER', 'RAD_RP', 'REV_COMPLEM_RAD_RP', 'rbarcode',
        'REV_COMPLEM_fbarcode', 'REV_COMPLEM_rbarcode',
        'REV_COMPLEM_FWDPRIMER', 'REV_COMPLEM_REVPRIMER',
    ]
    headers_list = ['file']
    for label in pattern_labels:
        headers_list.extend(
            ['starts_' + label, 'ends_' + label, 'contains_' + label])

    # The context manager closes the table even if a grep helper raises.
    with open(outpfile, 'w') as outfile:
        outfile.write("%s\n" % delim.join(headers_list))
        for f in sorted(os.listdir(this_dir)):
            new_f = this_dir + f
            if not (os.path.isfile(new_f) and new_f.endswith(filetype)):
                continue
            for k in SAMPLE_BARCODE_DICT:
                trimk = 'trim_' + k if trimmedbarcodestatus else k
                # Only files whose name starts with the (possibly trimmed)
                # sample key belong to this sample.
                if not new_f.startswith(this_dir + trimk):
                    continue
                barcodes = SAMPLE_BARCODE_DICT[k].split("_")
                fbarcode = barcodes[0]
                rbarcode = barcodes[1]
                rev_complem_fbarcode = BioSeq.reverse_complement(fbarcode)  # search at end in read 2
                rev_complem_rbarcode = BioSeq.reverse_complement(rbarcode)  # search at end in read 1
                # Same order as ``pattern_labels`` so columns line up.
                patterns = [
                    fbarcode, FADAPTER, FWDPRIMER, FAD_FP, REV_COMPLEM_FAD_FP,
                    REVPRIMER, RADAPTER, RAD_RP, REV_COMPLEM_RAD_RP, rbarcode,
                    rev_complem_fbarcode, rev_complem_rbarcode,
                    REV_COMPLEM_FWDPRIMER, REV_COMPLEM_REVPRIMER,
                ]
                counts_list = [f]
                for pattern in patterns:
                    # grep_string presumably yields the starts-with and
                    # ends-with counts (two header columns) -- TODO confirm.
                    counts_list.extend(grep_string(pattern, new_f))
                    counts_list.append(grep_string_contains(pattern, new_f))

                str_counts_list = [str(i.strip()) for i in counts_list]
                outfile.write("%s\n" % delim.join(str_counts_list))

    if run_cutadapt:
        # Single-argument call form prints identically on Python 2 and 3.
        print("Done with initial barcode stats\nNow running cutadapt\n")
        run_cutadapt_method(outpfile, logfile, this_dir, filetype, SAMPLE_BARCODE_DICT, delim)
    else:
        sys.exit('\nRunning cutadapt was not requested.\n')
Beispiel #8
0
def extract_input_counts(args):
    """Count, per lag and group, which letter follows each k-mer in the inputs.

    Every sequence is left-padded with ``lag`` copies of ``'['`` and
    terminated with ``']'``; for each position the preceding ``lag``
    characters form the key and the current character is counted. With
    ``args.r`` set, the reverse complement of each sequence is counted too.

    Each input file is opened once, read in a single pass for all lags, and
    closed afterwards.

    :param args: namespace providing ``l`` (number of lags, counted for
        lags ``1..l``), ``r`` (also count reverse complements) and whatever
        ``get_input_info`` needs.
    :return: list of length ``args.l``; element ``li`` is a ``defaultdict``
        mapping a k-mer of length ``li + 1`` to a ``[group][letter_index]``
        table of counts, with letter indices taken from
        ``summarize.alphabet``.
    """
    input_info = get_input_info(args)
    groups = [elem[1] for elem in input_info]
    n_groups = max(groups) + 1
    alphabet = summarize.alphabet
    n_letters = len(alphabet)
    kmer_counts = [
        defaultdict(lambda: [[0] * n_letters for _ in range(n_groups)])
        for _ in range(args.l)
    ]
    print(input_info)

    def tally(seq, group):
        # Add the transitions of one sequence to every lag's table.
        for li in range(args.l):
            lag = li + 1
            full_seq = '[' * lag + seq + ']'
            table = kmer_counts[li]
            for j in range(lag, len(full_seq)):
                lag_kmer = full_seq[(j - lag):j]
                next_letter = full_seq[j]
                table[lag_kmer][group][alphabet[next_letter]] += 1

    for indiv_file, group, file_type in input_info:
        with open(indiv_file, 'r') as handle:
            for elem in summarize.load_input(handle, file_type):
                seq = elem[1]
                tally(seq, group)
                if args.r:
                    tally(Seq.reverse_complement(seq), group)
    return kmer_counts
Beispiel #9
0
def parse_library(library_handle, threshold, mismatches=0):
    """Parse the library file into a nested dictionary keyed by marker name.

    Each non-blank line is tab-separated: marker name, first flank, second
    flank and, optionally, a fourth column of whitespace-separated
    ``unit min max`` triples from which an anchored regular expression is
    built. Lines without that fourth column get a pattern that never
    matches. Blank lines are skipped.

    :arg stream library_handle: Open readable handle to a library file.
    :arg float threshold: Number of allowed mismatches per nucleotide.
    :arg int mismatches: If set, overrides the dynamic threshold calculation.

    :returns dict: Per marker: ``flanks`` (first flank and the reverse
        complement of the second), ``counts``, ``pair_match``,
        ``thresholds`` (one per flank), ``reg_exp`` (compiled pattern) and
        the ``new`` / ``known`` counters.
    """
    library = {}

    for line in library_handle:
        stripped = line.strip()
        if not stripped:
            # A blank line has no flank columns to index.
            continue
        fields = stripped.split('\t')

        pattern = '(?!x)x'  # This will never match anything.
        if len(fields) == 4:
            pat = fields[3].split()
            pattern = '^{}$'.format(''.join(
                '({}){{{},{}}}'.format(pat[x], pat[x + 1], pat[x + 2])
                for x in range(0, len(pat), 3)))

        library[fields[0]] = {
            'flanks': [fields[1], Seq.reverse_complement(fields[2])],
            'counts': [0, 0, 0, 0],
            'pair_match': [0, 0],
            'thresholds': [
                mismatches or int(ceil(len(fields[1]) * threshold)),
                mismatches or int(ceil(len(fields[2]) * threshold))],
            'reg_exp': re_compile(pattern),
            'new': defaultdict(lambda: [0, 0]),
            'known': defaultdict(lambda: [0, 0])}

    return library
Beispiel #10
0
    def __getGenomeRefSequence(self, chrom, start, length, strand):
        """Return ``length`` reference bases from ``chrom`` via the 2bit tool.

        The region is requested from ``TWOBITTOFA_BIN`` as
        ``<profile db>:<chrom>:<start-1>-<start-1+length>``, written to a
        temporary FASTA file and read back with the header line dropped.

        :param chrom: chromosome name or number; converted to ``str``,
            stripped, and prefixed with ``chr`` when missing.
        :param start: first position of the region. It is decremented by
            one before the query, so it is presumably 1-based -- TODO
            confirm.
        :param length: number of bases to fetch.
        :param strand: ``'+'`` returns the sequence as read; any other value
            returns its reverse complement.
        :return: the sequence as a single string without line breaks.
        """
        if not isinstance(chrom, str):
            chrom = str(chrom)
        chrom = chrom.strip()
        if not chrom.startswith('chr'):
            chrom = 'chr' + chrom
        start = start - 1
        end = start + length
        db_file = "{0}:{1}:{2}-{3}".format(self.__profiles[self.__profile],
                                           chrom, start, end)

        # Text mode so the lines read back are ``str`` on Python 3 as well;
        # the default binary mode yields bytes there, which cannot be
        # concatenated to a str.
        with tempfile.NamedTemporaryFile(mode='w+') as tmp:
            # NOTE(review): the tool's exit status is not checked; a failed
            # run yields an empty sequence.
            call([TWOBITTOFA_BIN, db_file, tmp.name])
            tmp.readline()  # read fasta header line
            seq = ''.join(line.strip() for line in tmp)

        if strand == '+':
            return seq
        from Bio import Seq
        return Seq.reverse_complement(seq)
Beispiel #11
0
def convert_from_subtype_to_hxb2(working_dir, position, orientation, subtype):
    """
    Convert a position number in a subtype reference to the equivalent in HXB2.

    The subtype reference and HXB2 are aligned with MAFFT and the alignment
    is walked column by column, counting the non-gap characters seen so far
    in each row.

    Args:
        working_dir: working folder in which to place temporary files
        position: subtype coordinate position to convert
        orientation: "reverse" aligns the reverse complements of both
            sequences; any other value aligns them as given
        subtype: subtype whose reference sequence `position` refers to

    Returns:
        The number of HXB2 residues preceding the first alignment column at
        which `position` subtype residues have been counted, or None when
        no column satisfies that.
    """

    sequences = [subtype_sequence(subtype), HXB2()]
    if orientation == "reverse":
        sequences = [SeqRecord.SeqRecord(Seq.reverse_complement(s.seq),
                                         id = s.id, name = s.name) for s in sequences]

    alignment = wrappers.mafft(working_dir, sequences)

    hxb2_pos = 0
    subtype_pos = 0
    # Row 0 is the subtype, row 1 is HXB2 (the order of `sequences`).
    for subtype_char, hxb2_char in zip(alignment[0], alignment[1]):
        if subtype_pos == position:
            return hxb2_pos
        if subtype_char != "-":
            subtype_pos += 1
        if hxb2_char != "-":
            hxb2_pos += 1
    return None
Beispiel #12
0
def main():
    """
    Print the position and length of every reverse palindrome in a DNA string.

    Prompts for a file name (``.txt`` is appended), reads it as FASTA and
    uses the sequence of the last record. A stretch is a reverse palindrome
    when it equals its own reverse complement. Lengths 12, 10, 8, 6 and 4
    are tried, in that order; positions are 1-based.

    :return: nothing. One "position length" line is printed per hit.
    :raises ValueError: if the file contains no FASTA record.
    """
    # SeqIO parses the FASTA file; Seq provides the reverse complement.
    from Bio import SeqIO as SeqIO
    from Bio import Seq

    sequence = None
    # The context manager closes the input file once parsing is done.
    with open(f"{input('file name here: ')}.txt") as handle:
        for fasta in SeqIO.parse(handle, 'fasta'):
            sequence = fasta.seq
    if sequence is None:
        raise ValueError("no FASTA record found in the input file")

    for length in range(12, 3, -2):
        # The upper bound keeps every window fully inside the sequence.
        for index in range(len(sequence) - length + 1):
            possible_palindrome = sequence[index:index + length]
            reverse_complementary = Seq.reverse_complement(
                possible_palindrome)

            # definition of reverse palindrome: sequence == reverse complement
            if possible_palindrome == reverse_complementary:
                print(f"{index + 1} {length}")
Beispiel #13
0
def create_primer_output(config: SSMConfig, mutagenic_primer, mutation,
                         degenerate_codon: str, new_sequence_start,
                         parameters_in_range: bool):
    """Build the ``PrimerOutput`` row shown in the results table.

    The actual degenerate outputs for the export are created on the
    frontend. This is only for the resulting table, which can show one
    primer at a time, so only the first codon of the comma-separated
    ``degenerate_codon`` is applied.

    :param config: not used by this function.
    :param mutagenic_primer: object exposing ``primer`` and
        ``three_end_temperature``.
    :param mutation: object whose ``position`` selects where the codon is
        substituted.
    :param degenerate_codon: comma-separated codons; the first is used.
    :param new_sequence_start: subtracted from the primer start positions.
    :param parameters_in_range: passed through to the output unchanged.
    :return: a ``PrimerOutput``. ``sequence`` is the mutated sequence for a
        forward primer and its reverse complement for a reverse primer.
    :raises NotImplementedError: for any other primer direction.
    """
    first_degenerate_codon = degenerate_codon.split(",")[0]

    primer = mutagenic_primer.primer
    direction = primer.direction
    mutated = primer.get_mutated_sequence(mutation.position,
                                          first_degenerate_codon)

    if direction == Primer.FORWARD:
        sequence = mutated
    elif direction == Primer.REVERSE:
        sequence = Seq.reverse_complement(mutated)
    else:
        # ``NotImplemented`` is a comparison sentinel, not an exception;
        # calling it raises TypeError instead of the intended error.
        raise NotImplementedError(
            "unsupported primer direction: %r" % (direction,))

    return PrimerOutput(
        direction=direction,
        sequence=sequence,
        normal_order_sequence=mutated,
        normal_order_start=primer.get_normal_start() - new_sequence_start,
        start=primer.start - new_sequence_start,
        length=primer.length,
        three_end_temperature=mutagenic_primer.three_end_temperature,
        gc_content=primer.get_gc_content(),
        parameters_in_range=parameters_in_range)
Beispiel #14
0
    def get_complement_elems(self):
        """Iterate over the complementary strand of ``self.seq``.

        The strand is produced by ``Seq.reverse_complement``, so elements
        come out in reverse-complement order.

        :return: generator yielding one element of that strand at a time.
        """
        opposite_strand = Seq.reverse_complement(self.seq)
        for nucleotide in opposite_strand:
            yield nucleotide
Beispiel #15
0
    def test_reverse_complement(self):
        """Seq.reverse_complement() must agree with the Seq object's methods.

        Uses a shallow copy of ``test_seqs`` without the entry at index 21
        and checks only non-protein ``Seq.Seq`` instances.
        """
        remaining = copy.copy(test_seqs)
        remaining.pop(21)

        for seq_obj in remaining:
            is_protein = isinstance(seq_obj.alphabet, Alphabet.ProteinAlphabet)
            if is_protein or not isinstance(seq_obj, Seq.Seq):
                continue

            expected = Seq.reverse_complement(seq_obj)
            # Each pair holds two renderings that have to be equal.
            comparisons = [
                (repr(expected), repr(seq_obj.reverse_complement())),
                (repr(expected[::-1]), repr(seq_obj.complement())),
                (str(seq_obj.complement()),
                 str(Seq.reverse_complement(seq_obj))[::-1]),
                (str(seq_obj.reverse_complement()),
                 str(Seq.reverse_complement(seq_obj))),
            ]
            for left, right in comparisons:
                self.assertEqual(left, right)
Beispiel #16
0
def reverse_complement(sequence):
    """
    Return the reverse complement of a unicode sequence as unicode.

    BioPython's reverse_complement doesn't work on unicode strings, which
    is what this code base uses almost everywhere. The input is therefore
    converted to ``str`` for the call and the result converted back.
    """
    byte_string = str(sequence)
    flipped = Seq.reverse_complement(byte_string)
    return unicode(flipped)
Beispiel #17
0
def reverse_complement(sequence):
    """
    Unicode-friendly wrapper around BioPython's reverse_complement.

    BioPython's function doesn't accept unicode strings, while this code
    base works almost exclusively with them; the argument is passed through
    ``str`` and the result through ``unicode``.
    """
    plain = str(sequence)
    result = Seq.reverse_complement(plain)
    return unicode(result)
Beispiel #18
0
def reverse_complement(seq):
    r"""
        Given: A DNA string s of length at most 1000 bp.
        Return: The reverse complement sc of s.

        The input should be a pure sequence: per the original author's
        note, symbols such as \n are not legal for the complement map.
        Nothing is validated here; the work is delegated to
        Seq.reverse_complement.
    """
    complemented = Seq.reverse_complement(seq)
    return complemented
Beispiel #19
0
    def rev_complement(cls, string):
        """
        Return the reverse complement of the given sequence string.

        The work is delegated to ``Seq.reverse_complement``; ``cls`` is not
        consulted.

        :param string: the sequence to be reverse-complemented
        :type string: str
        """
        reversed_strand = Seq.reverse_complement(string)
        return reversed_strand
Beispiel #20
0
def translate(seq):
    """Translate ``seq`` in all six reading frames.

    :param seq: nucleotide sequence accepted by ``Seq.translate`` and
        ``Seq.reverse_complement``.
    :return: dict with keys ``'First Frame'``, ``'Second Frame'`` and
        ``'Third Frame'`` for the sequence as given, and the same names
        prefixed with ``'Complement '`` for its reverse complement.
    """
    frame_names = ('First Frame', 'Second Frame', 'Third Frame')
    r = {}
    for shift, label in enumerate(frame_names):
        r[label] = Seq.translate(seq[shift:])
    opposite = Seq.reverse_complement(seq)
    for shift, label in enumerate(frame_names):
        r['Complement ' + label] = Seq.translate(opposite[shift:])
    return r
Beispiel #21
0
def oligos_1():
    """Print the forward and reverse oligos of the lox/barcode insert.

    The insert is ``loxp + barcode + kozak + start codon + lox71``. The
    forward oligo prepends the first overhang; the reverse oligo is the
    reverse complement of the insert followed by the second overhang. Both
    are printed upper-cased.
    """
    overhangs = ['CATG', 'ACAA']
    bc = 'GATGATTGA'
    kozak = 'gccacc'
    start = 'atg'
    # Shared core of both oligos, built once.
    insert = loxp + bc + kozak + start + lox71
    fwd = overhangs[0] + insert
    rev = seq.reverse_complement(insert + overhangs[1])

    # Single-argument call form prints identically on Python 2 and 3.
    print(fwd.upper())
    print(rev.upper())
Beispiel #22
0
def translate(seq):
    """Return the six-frame translation of ``seq`` as a dict.

    The three forward frames are stored under ``'First Frame'``,
    ``'Second Frame'`` and ``'Third Frame'``; the three frames of the
    reverse complement under the same names prefixed with
    ``'Complement '``.
    """
    ordinals = ['First', 'Second', 'Third']
    r = {}
    # Forward strand: drop 0, 1 or 2 leading bases.
    for skip in range(3):
        r[ordinals[skip] + ' Frame'] = Seq.translate(seq[skip:])
    # Same again on the reverse complement.
    rc = Seq.reverse_complement(seq)
    for skip in range(3):
        r['Complement ' + ordinals[skip] + ' Frame'] = Seq.translate(rc[skip:])
    return r
Beispiel #23
0
def make_fasta(gene_dict, args, gene_fa_out):
    """Write each gene plus flanking sequence to ``gene_fa_out`` as FASTA.

    :param gene_dict: maps a FASTA record id to an iterable of
        ``(gene, start, end, strand)`` tuples. Records with no entry are
        skipped.
    :param args: namespace providing ``fasta`` (input FASTA path) and
        ``flank`` (number of flanking bases on each side).
    :param gene_fa_out: path of the FASTA file to write.

    Genes on strand ``"-"`` are written reverse-complemented. The header
    reports the gene's own ``start``-``end``, not the flanked range.
    """
    with open(gene_fa_out, "w") as out_handle:
        for rec in SeqIO.parse(args.fasta, "fasta"):
            contig = str(rec.seq)
            for gene, start, end, strand in gene_dict.get(rec.id, ()):
                # Clamp at 0: a negative slice start would count from the
                # end of the contig and return the wrong stretch.
                left = max(0, start - (args.flank + 1))
                seq = contig[left:end + args.flank]
                if strand == "-":
                    seq = Seq.reverse_complement(seq)

                out_handle.write(f">{gene} | location={rec.id}:{start}-{end} | strand={strand}\n{seq}\n")
Beispiel #24
0
    def test_reverse_complement(self):
        """Seq.reverse_complement() must agree with the Seq object's methods.

        Uses a shallow copy of ``test_seqs`` without the entry at index 13;
        entries that are not ``Seq.Seq`` instances are ignored.
        """
        pool = copy.copy(test_seqs)
        pool.pop(13)

        for seq_obj in (item for item in pool if isinstance(item, Seq.Seq)):
            expected = Seq.reverse_complement(seq_obj)

            own_revcomp = repr(seq_obj.reverse_complement())
            self.assertEqual(repr(expected), own_revcomp)

            own_complement = repr(seq_obj.complement())
            self.assertEqual(repr(expected[::-1]), own_complement)

            # String forms: the complement is the reverse complement read
            # backwards.
            backwards = str(Seq.reverse_complement(seq_obj))[::-1]
            self.assertEqual(str(seq_obj.complement()), backwards)

            forwards = str(Seq.reverse_complement(seq_obj))
            self.assertEqual(str(seq_obj.reverse_complement()), forwards)
Beispiel #25
0
def delGene(geneName, cutsite):
    """Design the oligos needed to delete a chromosomal region with CRISPR.

    Looks up ``geneName`` in ``genomicData`` to get the chromosomal locus
    and the region to delete, then prints six oligos: the gRNA cloning
    oligo (``cut``), a sequencing primer (``seq``) and the ``Lup``,
    ``Rup``, ``Ldown`` and ``Rdown`` primers.

    Per the original author's notes, the cut oligo is for cloning a pL308
    Cas9-gRNA vector, the others generate the donor DNA, and primers
    Lup+Rdown produce a 1 kb band if the deletion was successful. Part of
    yCRISPRv3.

    :param geneName: key into ``genomicData``; also used to label the
        printed oligos.
    :param cutsite: guide sequence (a 20-mer per the warning text). A
        warning is printed if it occurs on neither strand of the deletion
        region.
    :return: ``(Ldown, Rup)``.
    :raises ValueError: if the deletion region is not found in the locus.
    """
    locus = Seq(genomicData[geneName][0])
    deletion = Seq(genomicData[geneName][1])

    if (deletion.find(cutsite) == -1
            and deletion.reverse_complement().find(cutsite) == -1):
        print ("WARNING: Guide 20-mer sequence not found in deletion region.")

    # index is the 0-based start of the deletion within the locus.
    index = locus.find(deletion)
    if index == -1:
        # Slicing with -1 would quietly design primers on a garbage locus.
        raise ValueError(
            "deletion region for %s not found in its locus" % geneName)

    # Remove the deletion; in newlocus, index points at the first nt after
    # the deleted stretch.
    newlocus = locus[0:index] + locus[index + len(deletion):]

    # NOTE(review): the windows below assume at least 500 nt on each side
    # of the deletion; a smaller index makes the start of the slice negative.
    Lup = newlocus[index - 500:index - 470]
    Rdown = newlocus[index + 469:index + 499].reverse_complement()

    Rtemp1 = newlocus[:index].reverse_complement()
    Rtemp2 = newlocus[index:].reverse_complement()

    rPrimer, rLength = getPrimer(Rtemp1)
    lPrimer, lLength = getPrimer(newlocus[index:])

    Rup = getOverhang(Rtemp2, rLength) + rPrimer
    Ldown = getOverhang(newlocus[:index], lLength) + lPrimer

    cutSequence = Seq("cgggtggcgaatgggacttt") + cutsite + Seq("gttttagagctagaaatagc")
    seqprimer = Seq("gacttt") + cutsite

    # Labels use the ``geneName`` parameter; the former ``GeneName`` is not
    # defined in this function.
    print("cut" + geneName + "  " + cutSequence)
    print("seq" + geneName + "  " + seqprimer)
    print("Lup" + geneName + "del" + " " + Lup)
    print("Rup" + geneName + "del" + " " + Rup)
    print("Ldown" + geneName + "del" + " " + Ldown)
    print("Rdown" + geneName + "del" + " " + Rdown)

    return Ldown, Rup
Beispiel #26
0
 def get_sgrna(self):
     """Return a DataFrame listing every candidate sgRNA site in ``self.genome``.

     Each chromosome string is scanned for a 20-nt stretch followed by an
     ``NGG`` triplet, and for a ``CCN`` triplet followed by a 20-nt stretch
     (only the letters A/T/G/C in either case are accepted). The patterns
     are wrapped in a zero-width lookahead so that overlapping sites are
     all reported; a plain ``finditer`` over the 23-nt pattern resumes
     after each hit and silently drops any site overlapping it.

     The result is cached on ``self.sgrna`` and reused by later calls.

     :return: DataFrame with columns ``seqname``, ``start``, ``cut``,
         ``end``, ``sgrna`` and ``pam``. Coordinates are 0-based offsets
         into ``str(chromosome.seq)``. For ``CCN`` hits, ``sgrna`` and
         ``pam`` hold the reverse complement of the matched text.
     """
     if not hasattr(self, 'sgrna'):
         site_len = 23  # 20-nt protospacer plus 3-nt PAM
         ngg = re.compile(
             r'(?=([atgcATGC]{20})([atgcATGC][gG][gG]))'
         )
         ccn = re.compile(
             r'(?=([cC][cC][atgcATGC])([atgcATGC]{20}))'
         )
         columns = ['seqname', 'start', 'cut', 'end', 'sgrna', 'pam']
         frames = []
         for chromosome in self.genome:
             sequence = str(chromosome.seq)
             # The lookahead match is empty, so offsets are derived from
             # start() instead of end().
             sglist = [
                 {
                     'seqname': chromosome.id,
                     'start': x.start(),
                     'cut': x.start() + site_len - 6,
                     'end': x.start() + site_len - 3,
                     'sgrna': x.group(1),
                     'pam': x.group(2)
                 }
                 for x in ngg.finditer(sequence)
             ]
             sglist.extend(
                 {
                     'seqname': chromosome.id,
                     'start': x.start() + 3,
                     'cut': x.start() + 6,
                     'end': x.start() + site_len,
                     'sgrna': Seq.reverse_complement(x.group(2)),
                     'pam': Seq.reverse_complement(x.group(1))
                 }
                 for x in ccn.finditer(sequence)
             )
             frames.append(pd.DataFrame(sglist, columns=columns))
         if frames:
             self.sgrna = pd.concat(frames, axis=0, ignore_index=True)
         else:
             # pd.concat() raises on an empty list; return an empty table.
             self.sgrna = pd.DataFrame(columns=columns)
     return self.sgrna
Beispiel #27
0
def oligos_1():
    """Print the forward and reverse oligos of the lox/barcode insert.

    The insert is ``loxp + barcode + kozak + start codon + lox71``. The
    forward oligo prepends the first overhang; the reverse oligo is the
    reverse complement of the insert followed by the second overhang. Both
    are printed upper-cased.
    """
    overhangs = ['CATG', 'ACAA']
    bc = 'GATGATTGA'
    kozak = 'gccacc'
    start = 'atg'
    # Shared core of both oligos, built once.
    insert = loxp + bc + kozak + start + lox71
    fwd = overhangs[0] + insert
    rev = seq.reverse_complement(insert + overhangs[1])

    # Single-argument call form prints identically on Python 2 and 3.
    print(fwd.upper())
    print(rev.upper())
Beispiel #28
0
def find_orf_all(seq):
    """Collect the ORFs found in all six reading frames of ``seq``.

    ``find_orf`` is run on the sequence with 0, 1 and 2 leading characters
    removed, then likewise on its reverse complement.

    :param seq: string representing a coding sequence.
    :return: list of the distinct ORFs found, in no particular order.
    """
    collected = []
    for shift in range(3):
        collected.extend(find_orf(seq[shift:]))
    minus_strand = Seq.reverse_complement(seq)
    for shift in range(3):
        collected.extend(find_orf(minus_strand[shift:]))
    unique_orfs = set(collected)
    return list(unique_orfs)
Beispiel #29
0
def make_fasta(gene_dict, args):
    """Write each gene plus flanking sequence to ``args.out_file`` as FASTA.

    :param gene_dict: maps a FASTA record id to an iterable of
        ``(gene, start, end, strand)`` tuples. Records with no entry are
        skipped.
    :param args: namespace providing ``fasta`` (input FASTA path),
        ``out_file`` (output path) and ``f`` (number of flanking bases on
        each side).

    Genes on strand ``"-"`` are written reverse-complemented. The location
    in the header is the range actually extracted, which is clipped to the
    contig when the flank would run past either end.
    """
    with open(args.out_file, "w") as out_handle:
        for rec in SeqIO.parse(args.fasta, "fasta"):
            contig = str(rec.seq)
            for gene, start, end, strand in gene_dict.get(rec.id, ()):
                # Clamp at 0: a negative slice start would count from the
                # end of the contig and return the wrong stretch.
                left = max(0, start - args.f - 1)
                right = min(len(contig), end + args.f)
                seq = contig[left:right]
                if strand == "-":
                    seq = Seq.reverse_complement(seq)

                out_handle.write(
                    f">{gene} | location={rec.id}:{left}-{right} including {args.f}n flank | strand={strand}\n{seq}\n"
                )
Beispiel #30
0
 def counter(kmers):
     """Look up, for every k-mer, the count of each one-letter extension.

     :param kmers: array of k-mer strings (any shape).
     :return: array shaped like ``kmers`` with one extra trailing axis of
         length ``alphabet_size + 1``. Entry ``j`` holds the count of the
         k-mer extended by ``alphabet[j]``, plus the count of that
         extension's reverse complement when ``reverse`` is set. The final
         entry is left at zero.
     """
     n_columns = alphabet_size + 1
     final_shape = np.r_[np.shape(kmers), [n_columns]]
     counts = np.zeros([np.size(kmers), n_columns])
     for row, kmer in enumerate(kmers.flatten()):
         for col, letter in enumerate(alphabet):
             extended = kmer + letter
             # Count of the (k+1)-mer itself.
             counts[row, col] = get_kmc_count(extended, file, kmer_token, c)
             if not reverse:
                 continue
             # Assemblies only look at one strand, so add the other one.
             counts[row, col] += get_kmc_count(
                 Seq.reverse_complement(extended), file, kmer_token, c)
     return counts.reshape(final_shape)
def orf_reader(infile):
    """Read a tab-separated ORF table into a dict keyed by the first column.

    Lines starting with ``#`` and blank lines are skipped. Column 1 is read
    as an integer and column 4 as the sequence.

    :param infile: path of the table.
    :return: dict mapping column 0 to ``[oriented, original]``, where
        ``original`` is column 4 as written and ``oriented`` is its reverse
        complement when column 1 is negative, otherwise the same string.
    """
    orfs = {}
    # The context manager closes the file; iterating the handle avoids
    # loading every line up front.
    with open(infile, "r") as handle:
        for line in handle:
            if line.startswith("#") or not line.strip():
                continue
            # Drop the line terminator so it cannot end up in the sequence
            # when column 4 is the last column.
            line_array = line.rstrip("\r\n").split("\t")
            sequence = line_array[4]
            if int(line_array[1]) < 0:
                orfs[line_array[0]] = [Seq.reverse_complement(sequence), sequence]
            else:
                orfs[line_array[0]] = [sequence, sequence]
    return orfs
Beispiel #32
0
 def counter(kmers):
     """Look up extension counts for every k-mer, including padded ones.

     ``'['`` characters are removed from each k-mer before lookup, so a
     k-mer can be shorter than ``lag``; the count file is chosen by the
     remaining length.

     :param kmers: array of k-mer strings (any shape).
     :return: array shaped like ``kmers`` with one extra trailing axis of
         length ``alphabet_size + 1``. Entry ``j`` holds the count of the
         k-mer extended by ``alphabet[j]``; the last entry holds the count
         of the k-mer itself and is only filled for full-length k-mers.
         With ``reverse`` set, reverse-complement counts are added.
     """
     n_columns = alphabet_size + 1
     final_shape = np.r_[np.shape(kmers), [n_columns]]
     counts = np.zeros([np.size(kmers), n_columns])
     for i, padded in enumerate(kmers.flatten()):
         k = padded.replace('[', '')
         klen = len(k)
         for j, b in enumerate(alphabet):
             kp1mer = k + b
             # Get kp1mer count
             counts[i, j] = get_kmc_count(kp1mer, files[klen], kmer_token, c)
             # Get reverse count (assemblies only look at one strand).
             if reverse and klen <= lag:
                 # Full-length k-mers use ``files``, shorter ones
                 # ``files_suf``.
                 source = files if klen == lag else files_suf
                 counts[i, j] += get_kmc_count(Seq.reverse_complement(kp1mer),
                                               source[klen], kmer_token, c)
         if klen != lag:
             continue
         counts[i, -1] = get_kmc_count(k, files_suf[klen - 1], kmer_token, c)
         if reverse:
             counts[i, -1] += get_kmc_count(Seq.reverse_complement(k),
                                            files[klen - 1], kmer_token, c)
     return counts.reshape(final_shape)
def main() -> None:
    """Collect the ORFs of the first FASTA record over all six frames.

    The first sequence in ``args.file`` has its ``T`` characters replaced
    by ``U``. It and its reverse complement are each translated in three
    frames, after ``truncate(..., 3)``, and whatever ``find_orfs`` returns
    for a non-empty translation is gathered into a set. Nothing happens if
    the file has no records.

    NOTE(review): the visible code neither prints nor returns the collected
    set.
    """

    args = get_args()
    seqs = [str(rec.seq) for rec in SeqIO.parse(args.file, 'fasta')]
    if not seqs:
        return

    rna = seqs[0].replace('T', 'U')
    orfs = set()

    strands = [rna, Seq.reverse_complement(rna)]
    for strand in strands:
        for frame in range(3):
            prot = Seq.translate(truncate(strand[frame:], 3), to_stop=False)
            if not prot:
                continue
            orfs.update(find_orfs(prot))
Beispiel #34
0
def translateSeq(cds):
    """Translate ``cds`` as a complete CDS, trying the reverse complement on failure.

    The sequence is first translated as given with ``cds=True``. If that
    raises ``TranslationError``, its reverse complement is tried; if that
    fails too, a message is printed.

    NOTE(review): no ``return`` is visible in this function, so the
    translation, the CDS finally used and the sense/anti flag are not
    handed back to the caller -- confirm against the full source.

    :param cds: nucleotide sequence to translate.
    """
    senseOrAnti = 'sense'
    finalCDS = cds
    try:
        translated = Seq.translate(cds, cds=True)
    except TranslationError:
        try:
            reverseCDS = Seq.reverse_complement(cds)
            translated = Seq.translate(reverseCDS, cds=True)
            finalCDS = reverseCDS
            senseOrAnti = 'anti'
        except TranslationError:
            # Call form and bare ``except`` clause work on Python 2 and 3.
            print('Translation failed in %s' % cds)
Beispiel #35
0
def calculate(sequence, cut_length, conditions, cg_clamp,
              self_comp) -> pd.DataFrame:
    """Build, filter and rank a table of fragments cut from ``sequence``.

    :param sequence: input sequence; passed through ``remove_unnecessary``
        and upper-cased before use.
    :param cut_length: forwarded to ``get_fragments``.
    :param conditions: forwarded to ``narrow_down`` for the first filter.
    :param cg_clamp: when truthy, fragments for which
        ``check_gc_clamp(frag, last=5)`` is truthy are dropped.
    :param self_comp: when truthy, fragments for which
        ``check_selfcomp(frag, threshold=4)`` is truthy are dropped.
    :return: DataFrame with columns ``fragment``, ``rev_comp``,
        ``breslauer``, ``santalucia``, ``cg_content``, ``position`` and
        ``homology``, sorted by ``breslauer`` in descending order.
    """
    sequence = remove_unnecessary(sequence).upper()
    frag_list, position_list = get_fragments(seq=sequence,
                                             cut_length=cut_length)

    # One parallel list per output column.
    kept = []
    revcomps = []
    tm_breslauer = []
    tm_santalucia = []
    gc_values = []
    marks = []
    for frag, pos in zip(frag_list, position_list):
        if cg_clamp and check_gc_clamp(frag, last=5):
            continue
        if self_comp and check_selfcomp(frag, threshold=4):
            continue
        kept.append(frag)
        revcomps.append(str(Seq.reverse_complement(frag)))
        nn = NearestNeighbor(frag)
        tm_breslauer.append(nn.breslauer())
        tm_santalucia.append(nn.santalucia())
        gc_values.append(round(GC(frag), 1))
        marks.append(view_position(full_seq=sequence, pos_list=pos))

    df = pd.DataFrame({
        'fragment': kept,
        'rev_comp': revcomps,
        'breslauer': tm_breslauer,
        'santalucia': tm_santalucia,
        'cg_content': gc_values,
        'position': marks
    })

    # First pass: every condition except homology, which needs the
    # surviving fragments to be known first.
    filtered_df, homology_condition = narrow_down(df, conditions=conditions)
    filtered_df['homology'] = get_homology_count(filtered_df['fragment'])

    # Second pass: the homology condition, if one was returned.
    if homology_condition:
        filtered_df, _ = narrow_down(filtered_df,
                                     homology_condition,
                                     pending=False)

    return filtered_df.sort_values('breslauer', ascending=False)
Beispiel #36
0
    def __call__(self):
        """Stream the input sequences into the output files.

        Opens the ``suf`` outputs (one per lag), the ``pre`` output(s) (one
        per lag when ``self.pr`` is set, otherwise a single file) and, when
        the file type is not ``'fq'`` or ``self.reverse`` is set, the
        ``full`` output. Every sequence from ``self.file`` is handed to
        ``__write_out``; with ``self.reverse`` set, its reverse complement
        is written as well under the name suffixed with ``_rev``.

        Every file opened here, the input included, is closed on exit, even
        when writing fails part-way.
        """
        opened = []

        def open_tracked(path, mode):
            # Remember each handle so the ``finally`` below can close it.
            handle = open(path, mode)
            opened.append(handle)
            return handle

        try:
            # Initialize output files.
            file_out = {
                'suf': [
                    open_tracked(self.file_out_names['suf'][li], 'w')
                    for li in range(self.lag)
                ]
            }
            if self.pr:
                file_out['pre'] = [
                    open_tracked(self.file_out_names['pre'][li], 'w')
                    for li in range(self.lag)
                ]
            else:
                file_out['pre'] = open_tracked(self.file_out_names['pre'], 'w')
            if self.file_type != 'fq' or self.reverse:
                file_out['full'] = open_tracked(self.file_out_names['full'],
                                                'w')

            # Iterate through sequences in the input file.
            in_file = open_tracked(self.file, 'r')
            for j, elem in enumerate(load_input(in_file, self.file_type)):
                # False only for the very first record written.
                not_init = j > 0
                name, seq = elem[:2]

                self.__write_out(file_out, not_init, name, seq)

                if self.reverse:
                    # The forward record was just written, so this one is
                    # never the first.
                    self.__write_out(file_out, True, name + '_rev',
                                     Seq.reverse_complement(seq))
        finally:
            for handle in opened:
                handle.close()
Beispiel #37
0
def dna_aa():
    """Show the sequence-conversion form and store the requested result.

    A visitor whose ``session.username`` is None is redirected to the
    account log-in page.  When the form is accepted, the upper-cased
    sequence is passed through ``seqClean`` and stored under
    ``session['sequence']``, the selected action is applied to it and its
    result stored in the session, and the request is redirected to
    ``dna_aa_output``.  Otherwise the form is returned for rendering.
    """
    # None is a singleton, so identity is the right comparison.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    form = FORM(
        TABLE(
            TR(
                'Sequence (raw format):  ',
                TEXTAREA(_type='text',
                         _name='sequence',
                         requires=IS_NOT_EMPTY())),
            #TR("Sequence Type: ",
            #  SELECT("Raw Format", "FASTA",
            #         _name="seq_type")),
            TR(
                'Action: ',
                SELECT('Complementation',
                       'Transcribe',
                       'Translate',
                       'Back Transcribe',
                       'Back Translate',
                       _name='action'), INPUT(_type='submit',
                                              _value='SUBMIT'))))
    if form.accepts(request.vars, session):
        #if form.vars.seq_type == "FASTA":
        #    session['sequence'] = \
        #        seqClean(fasta_to_raw(form.vars.sequence.upper()))
        #else:
        session['sequence'] = seqClean(form.vars.sequence.upper())
        # Exactly one action is submitted, so the branches are exclusive.
        action = form.vars.action
        if action == "Complementation":
            session['action'] = "Complementation"
            # NOTE(review): the value stored under 'Complement' is the
            # *reverse* complement -- confirm that is what the view expects.
            session['Complement'] = Seq.reverse_complement(session['sequence'])
        elif action == "Transcribe":
            session['action'] = 'Transcribe'
            session['Transcribed RNA'] = Seq.transcribe(session['sequence'])
        elif action == "Back Transcribe":
            session['action'] = 'Back Transcribe'
            session['DNA'] = Seq.back_transcribe(session['sequence'])
        elif action == "Translate":
            session['action'] = 'Translate'
            session.update(translate(session['sequence']))
        elif action == "Back Translate":
            session['action'] = 'Back Translate'
            session.update(back_translate(session['sequence']))
        redirect(URL(r=request, f='dna_aa_output'))
    return dict(form=form)
Beispiel #38
0
    def get_three_end(self, gene_of_interest, reverse_primer):
        """Return the plasmid stretch from the gene end to the primer-site end.

        The reverse primer is located through its reverse complement.  The
        gene end is ``self.gene_end_in_plasmid`` when that is set; otherwise
        it is the position just past the first match of ``gene_of_interest``
        in ``self.plasmid_sequence``.  Both looked-up sequences are first
        passed to ``self.check_sequence_occurrences``.
        """
        primer_site = Seq.reverse_complement(reverse_primer)
        self.check_sequence_occurrences(primer_site, "Reverse primer")

        gene_end = self.gene_end_in_plasmid
        if gene_end is None:
            self.check_sequence_occurrences(gene_of_interest,
                                            "Gene of interest")
            gene_start = self.plasmid_sequence.find(gene_of_interest)
            gene_end = gene_start + len(gene_of_interest)

        site_start = self.plasmid_sequence.find(primer_site)
        site_end = site_start + len(reverse_primer)

        if site_end <= gene_end:
            # The primer site does not lie past the gene end, so the result
            # is the tail of the sequence followed by its head (presumably
            # because the plasmid is circular -- confirm).
            tail = self.plasmid_sequence[gene_end:]
            head = self.plasmid_sequence[:site_end]
            return tail + head
        return self.plasmid_sequence[gene_end:site_end]
Beispiel #39
0
def bam_to_fastq(bam_file, is_paired):
    """Convert a BAM file to fastq files.

    Each read is written as a four-line fastq record to the handle chosen
    by its mate number (1 for unpaired reads and first mates, 2 otherwise).
    Reads flagged as reverse are written with the sequence
    reverse-complemented and the quality string reversed.

    Returns the output file names supplied by ``_get_fastq_handles``.  The
    BAM file and every output handle are closed before returning, also
    when reading or writing raises.
    """
    out_files, out_handles = _get_fastq_handles(bam_file, is_paired)
    try:
        if len(out_handles) > 0:
            in_bam = pysam.Samfile(bam_file, mode='rb')
            try:
                for read in in_bam:
                    num = 1 if (not read.is_paired or read.is_read1) else 2
                    # reverse the sequence and quality if mapped to
                    # opposite strand
                    if read.is_reverse:
                        seq = str(Seq.reverse_complement(Seq.Seq(read.seq)))
                        qual = "".join(reversed(read.qual))
                    else:
                        seq = read.seq
                        qual = read.qual
                    out_handles[num].write("@%s\n%s\n+\n%s\n" %
                                           (read.qname, seq, qual))
            finally:
                # The BAM handle used to be left open.
                in_bam.close()
    finally:
        # A plain loop: closing is a side effect, not a list to build.
        for handle in out_handles.values():
            handle.close()
    return out_files
Beispiel #40
0
def bam_to_fastq(bam_file, is_paired):
    """Convert a BAM file to fastq files.

    Each read is written as a four-line fastq record to the handle chosen
    by its mate number (1 for unpaired reads and first mates, 2 otherwise).
    Reads flagged as reverse are written with the sequence
    reverse-complemented and the quality string reversed.

    Returns the output file names supplied by ``_get_fastq_handles``.  The
    BAM file and every output handle are closed before returning, also
    when reading or writing raises.
    """
    out_files, out_handles = _get_fastq_handles(bam_file,
            is_paired)
    try:
        if len(out_handles) > 0:
            in_bam = pysam.Samfile(bam_file, mode='rb')
            try:
                for read in in_bam:
                    num = 1 if (not read.is_paired or read.is_read1) else 2
                    # reverse the sequence and quality if mapped to
                    # opposite strand
                    if read.is_reverse:
                        seq = str(Seq.reverse_complement(Seq.Seq(read.seq)))
                        qual = "".join(reversed(read.qual))
                    else:
                        seq = read.seq
                        qual = read.qual
                    out_handles[num].write("@%s\n%s\n+\n%s\n" % (read.qname,
                        seq, qual))
            finally:
                # The BAM handle used to be left open.
                in_bam.close()
    finally:
        # A plain loop: closing is a side effect, not a list to build.
        for handle in out_handles.values():
            handle.close()
    return out_files
Beispiel #41
0
def dna_aa():
    """Show the sequence-conversion form and store the requested result.

    A visitor whose ``session.username`` is None is redirected to the
    account log-in page.  When the form is accepted, the upper-cased
    sequence is passed through ``seqClean`` and stored under
    ``session['sequence']``, the selected action is applied to it and its
    result stored in the session, and the request is redirected to
    ``dna_aa_output``.  Otherwise the form is returned for rendering.
    """
    # None is a singleton, so identity is the right comparison.
    if session.username is None:
        redirect(URL(r=request, c='account', f='log_in'))
    form = FORM(TABLE(TR('Sequence (raw format):  ',
                        TEXTAREA(_type='text', _name='sequence',
                                 requires=IS_NOT_EMPTY())),
                      #TR("Sequence Type: ",
                      #  SELECT("Raw Format", "FASTA",
                      #         _name="seq_type")),
                      TR('Action: ',
                        SELECT('Complementation', 'Transcribe', 'Translate',
                               'Back Transcribe', 'Back Translate',
                               _name='action'),
                      INPUT(_type='submit', _value='SUBMIT'))))
    if form.accepts(request.vars, session):
        #if form.vars.seq_type == "FASTA":
        #    session['sequence'] = \
        #        seqClean(fasta_to_raw(form.vars.sequence.upper()))
        #else:
        session['sequence'] = seqClean(form.vars.sequence.upper())
        # Exactly one action is submitted, so the branches are exclusive.
        action = form.vars.action
        if action == "Complementation":
            session['action'] = "Complementation"
            # NOTE(review): the value stored under 'Complement' is the
            # *reverse* complement -- confirm that is what the view expects.
            session['Complement'] = Seq.reverse_complement(session['sequence'])
        elif action == "Transcribe":
            session['action'] = 'Transcribe'
            session['Transcribed RNA'] = Seq.transcribe(session['sequence'])
        elif action == "Back Transcribe":
            session['action'] = 'Back Transcribe'
            session['DNA'] = Seq.back_transcribe(session['sequence'])
        elif action == "Translate":
            session['action'] = 'Translate'
            session.update(translate(session['sequence']))
        elif action == "Back Translate":
            session['action'] = 'Back Translate'
            session.update(back_translate(session['sequence']))
        redirect(URL(r=request, f='dna_aa_output'))
    return dict(form=form)
Beispiel #42
0
    # NOTE(review): this is the body of a loop whose header is not part of
    # this excerpt; `values` and `ambig_char` are presumably its variables.
    # The plain string has no alphabet, so T is swapped for U by hand.
    compl_values = complement(values).replace("T", "U")  # need to help as no alphabet
    print "%s={%s} --> {%s}=%s" % (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char])
    # The complemented letters must be exactly the set that the table lists
    # for the complementary ambiguity code.
    assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]])

print
print "Reverse complements:"
# Run Seq.reverse_complement over the sorted ambiguity codes with no alphabet,
# with the generic alphabets, and with the IUPAC alphabets (with "X" removed),
# plus one short mixed example.
for sequence in [
    Seq.Seq("".join(sorted(ambiguous_rna_values))),
    Seq.Seq("".join(sorted(ambiguous_dna_values))),
    Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna),
    Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna),
    Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X", ""), IUPAC.IUPACAmbiguousRNA()),
    Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X", ""), IUPAC.IUPACAmbiguousDNA()),
    Seq.Seq("AWGAARCKG"),
]:  # Note no U or T
    print "%s -> %s" % (repr(sequence), repr(Seq.reverse_complement(sequence)))
    # Reverse-complementing twice must give back the original letters.
    assert str(sequence) == str(
        Seq.reverse_complement(Seq.reverse_complement(sequence))
    ), "Dobule reverse complement didn't preserve the sequence!"
print

###########################################################################

# Inputs for later checks.  NOTE(review): `s`, `t` and `u` are defined outside
# this excerpt, and the list itself is cut off here.
test_seqs = [
    s,
    t,
    u,
    Seq.Seq("ATGAAACTG"),
    "ATGAAACtg",
    # TODO - Fix ambiguous translation
    # Seq.Seq("ATGAARCTG"),
Beispiel #43
0
 def test_reverse_complement_of_rna(self):
     """A plain RNA string is reverse-complemented to the expected RNA string."""
     rna = "AUGAAACUG"
     expected = "CAGUUUCAU"
     actual = Seq.reverse_complement(rna)
     self.assertEqual(expected, actual)
Beispiel #44
0
def complement(sequence):
    """Return the complement of ``sequence`` in its original orientation.

    Obtained by taking the reverse complement and reversing the result.
    """
    # TODO - Add a complement function to Bio/Seq.py?
    # There is already a complement method on the Seq and MutableSeq objects.
    reverse_complemented = Seq.reverse_complement(sequence)
    return reverse_complemented[::-1]
Beispiel #45
0
    # NOTE(review): this is the body of a loop whose header is not part of
    # this excerpt; `values` and `ambig_char` are presumably its variables.
    # The plain string has no alphabet, so T is swapped for U by hand.
    compl_values = complement(values).replace("T","U") #need to help as no alphabet
    print "%s={%s} --> {%s}=%s" % \
        (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char])
    # The complemented letters must be exactly the set that the table lists
    # for the complementary ambiguity code.
    assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]])

print
print "Reverse complements:"
# Run Seq.reverse_complement over the sorted ambiguity codes with no alphabet,
# with the generic alphabets, and with the IUPAC alphabets (with "X" removed),
# plus one short mixed example.
for sequence in [Seq.Seq("".join(sorted(ambiguous_rna_values))),
            Seq.Seq("".join(sorted(ambiguous_dna_values))),
            Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna),
            Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna),
            Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X",""), IUPAC.IUPACAmbiguousRNA()),
            Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X",""), IUPAC.IUPACAmbiguousDNA()),
            Seq.Seq("AWGAARCKG")]:  # Note no U or T
        print "%s -> %s" \
              % (repr(sequence), repr(Seq.reverse_complement(sequence)))
        # Reverse-complementing twice must give back the original letters.
        assert sequence.tostring() \
           == Seq.reverse_complement(Seq.reverse_complement(sequence)).tostring(), \
           "Dobule reverse complement didn't preserve the sequence!"
print

###########################################################################

# Inputs for later checks.  NOTE(review): `s`, `t` and `u` are defined outside
# this excerpt, and the list itself is cut off here.
test_seqs = [s,t,u,
             Seq.Seq("ATGAAACTG"),
             "ATGAAACtg",
             #TODO - Fix ambiguous translation
             #Seq.Seq("ATGAARCTG"),
             #Seq.Seq("AWGAARCKG"),  # Note no U or T
             #Seq.Seq("".join(ambiguous_rna_values)),
             #Seq.Seq("".join(ambiguous_dna_values)),
Beispiel #46
0
def twin(km):
    """Return the reverse complement of ``km``."""
    reverse_complemented = Seq.reverse_complement(km)
    return reverse_complemented
Beispiel #47
0
    # NOTE(review): this is the body of a loop whose header is not part of
    # this excerpt; `values` and `ambig_char` are presumably its variables.
    # The plain string has no alphabet, so T is swapped for U by hand.
    compl_values = complement(values).replace("T", "U")  # need to help as no alphabet
    print("%s={%s} --> {%s}=%s" %
        (ambig_char, values, compl_values, ambiguous_rna_complement[ambig_char]))
    # The complemented letters must be exactly the set that the table lists
    # for the complementary ambiguity code.
    assert set(compl_values) == set(ambiguous_rna_values[ambiguous_rna_complement[ambig_char]])

print("")
print("Reverse complements:")
# Run Seq.reverse_complement over the sorted ambiguity codes with no alphabet,
# with the generic alphabets, and with the IUPAC alphabets (with "X" removed),
# plus one short mixed example.
for sequence in [Seq.Seq("".join(sorted(ambiguous_rna_values))),
            Seq.Seq("".join(sorted(ambiguous_dna_values))),
            Seq.Seq("".join(sorted(ambiguous_rna_values)), Alphabet.generic_rna),
            Seq.Seq("".join(sorted(ambiguous_dna_values)), Alphabet.generic_dna),
            Seq.Seq("".join(sorted(ambiguous_rna_values)).replace("X", ""), IUPAC.IUPACAmbiguousRNA()),
            Seq.Seq("".join(sorted(ambiguous_dna_values)).replace("X", ""), IUPAC.IUPACAmbiguousDNA()),
            Seq.Seq("AWGAARCKG")]:  # Note no U or T
        print("%s -> %s"
              % (repr(sequence), repr(Seq.reverse_complement(sequence))))
        # Reverse-complementing twice must give back the original letters.
        assert str(sequence) \
           == str(Seq.reverse_complement(Seq.reverse_complement(sequence))), \
           "Dobule reverse complement didn't preserve the sequence!"
print("")

###########################################################################

# Inputs for later checks.  NOTE(review): `s`, `t` and `u` are defined outside
# this excerpt, and the list itself is cut off here.
test_seqs = [s, t, u,
             Seq.Seq("ATGAAACTG"),
             "ATGAAACtg",
             # TODO - Fix ambiguous translation
             # Seq.Seq("ATGAARCTG"),
             # Seq.Seq("AWGAARCKG"),  # Note no U or T
             # Seq.Seq("".join(ambiguous_rna_values)),
             # Seq.Seq("".join(ambiguous_dna_values)),
def twin(km):
    """Return the reverse complement of ``km``."""
    reverse_complemented = Seq.reverse_complement(km)
    return reverse_complemented
Beispiel #49
0
#ORF: AUG
#CLOSE : UAA, UAG, UGA

#>Rosalind_99

#MLLGSFRLIPKETLIQVAGSSPCNLS
#M
#MGMTPRLGLESLLE
#MTPRLGLESLLE


# Read the whole input file and strip every newline so that DNA is a single
# unbroken string.
with open("rosalind_orf.txt", 'r') as Z:
    DNA1 = Z.read()
    DNA = DNA1.replace("\n", '')
# Reverse complement of the input, kept as a plain string in FinalReverse.
ReverseDNA = Seq(DNA, generic_dna)
FinalReverse = str(ReverseDNA.reverse_complement())

# NOTE(review): only initialised in this excerpt; presumably filled with the
# translated proteins further down -- confirm.
FinalizedProt = []

def ReadingFrameFinder(DNASTRING):
    """Scan DNASTRING for "AUG" triplets after replacing every T with U.

    NOTE(review): the rest of this function is missing from this excerpt,
    so what is done with each match and what is returned cannot be
    documented here.
    """
    # Drop trailing newlines only; newlines inside the string are kept.
    CleanDNA = DNASTRING.rstrip("\n")
    OpenLocations = []
    CloseLocations = []
    stringlen = len(CleanDNA)
    TtoU = CleanDNA.replace("T", 'U')
    # Every position is tried as a potential start, not just one frame.
    readingframeRange = xrange(0, stringlen)
    PossibleGenes = []
    for item in readingframeRange:
        if TtoU[item:item+3] == "AUG":
            # Positions stepping by three from this start to the string end.
            Newthing = xrange(item, stringlen, 3)
            storage = item
    # NOTE(review): this line belongs to a loop whose header is not part of
    # this excerpt; it stores one tab-joined coordinate record per header.
    coordinates_dict[header].append("\t".join(column_list))


#-----------------------------------------------------
# Step 3
# Iterate through the fasta accessions, extracting
# sequence data
#-----------------------------------------------------

# All records are loaded into memory at once.
seq_records = list(SeqIO.parse(conf.fasta_file, "fasta"))
# print(len(seq_records))

for accession in seq_records:
    header = accession.id
    # print header
    # NOTE(review): if coordinates_dict is a plain dict, an accession without
    # coordinates raises KeyError here -- confirm how it is built.
    for coordinates in coordinates_dict[header]:
        # print ("hello")
        # print coordinates
        coordinate_list = coordinates.split("\t")
        # print coordinate_list
        # Fields used below: [1] start, [2] stop, [3] strand marker,
        # [4] name printed in the fasta header line.
        start = int(coordinate_list[1])
        stop = int(coordinate_list[2])
        extracted_seq = accession.seq[start:stop]
        # A '-' anywhere in field 3 selects the reverse complement.
        if '-' in coordinate_list[3]:
            # print ("reversed")
            extracted_seq = Seq.reverse_complement(extracted_seq)
        print (">" + coordinate_list[4])
        print extracted_seq
    # print accession.id
    # print accession.seq[5:10]
def _main():
    parser = argparse.ArgumentParser(description="Search Unique Sequence")
    parser.add_argument('--gene2refseq', type=commonlib.FileType('r'), help="default: %(default)s", nargs='?', default=os.path.join(BASEDIR, '../gene2refseq_test.txt.bz2'))
    parser.add_argument('--refseq_rna', type=commonlib.FileType('r'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../refseq.rna.bz2'))
    parser.add_argument('--unique-ngram', type=commonlib.FileType('w'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../unique-10gram.txt.bz2'))
    parser.add_argument('--not-unique-ngram', type=commonlib.FileType('w'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../unique-10gram-not.txt.bz2'))
    parser.add_argument('--fragment-specific-unique-ngram', type=commonlib.FileType('w'), nargs="?", help="default: %(default)s", default=os.path.join(BASEDIR, '../unique-10gram-fragment-specific.txt.bz2'))
    parser.add_argument('--ngram', type=int, default=10)
    parser.add_argument('--coverage-output', type=commonlib.FileType('w'), help="default: %(default)s", default=os.path.join(BASEDIR, '../coverage.txt'))
    options = parser.parse_args()

    # load gene2refseq
    gene2refseq = collections.defaultdict(set)
    refseq2gene = dict()

    for row in csv.reader(options.gene2refseq, delimiter='\t', quotechar=None):
        if row[3] == '-': continue
        
        gene2refseq[row[1]].add(row[3])
        refseq2gene[row[3]] = row[1]

    print gene2refseq

    refseq2sequence = dict()
    ngram2refseq = collections.defaultdict(set)

    available_gene2refseq = collections.defaultdict(list)

    # load sequences and calculate uniqueness

    for record in SeqIO.parse(options.refseq_rna, 'fasta'):
        refseq_id = record.id.split('|')[3]
        refseq2sequence[refseq_id] = {'id': refseq_id, 'seq': str(record.seq), 'coverage': list(['.']*len(record.seq)), 'gene': 0, 'fragment': 0}
        available_gene2refseq[refseq2gene[refseq_id]].append(refseq_id)
        for i in xrange(len(record.seq)-options.ngram+1):
            fragment = record.seq[i:i+options.ngram]
            
            ngram2refseq[str(fragment)].add((refseq_id, refseq2gene[refseq_id]))
            ngram2refseq[str(Seq.reverse_complement(fragment))].add((refseq_id, refseq2gene[refseq_id]))

    # write out gene unique and non-unique, isoform specific n-grams

    unique_ngrams = []
    unique_no_common_ngrams = []
    output = csv.writer(options.unique_ngram, delimiter='\t', quotechar=None)
    output_notunique = csv.writer(options.not_unique_ngram, delimiter='\t', quotechar=None)
    output_specific = csv.writer(options.fragment_specific_unique_ngram, delimiter='\t', quotechar=None)
    for k, v in ngram2refseq.iteritems():
        geneset = set([x[1] for x in v])
        refseqset = set([x[0] for x in v])
        if len(geneset) == 1:
            geneid = geneset.pop()
            print refseqset, available_gene2refseq[geneid]
            if refseqset == set(available_gene2refseq[geneid]):
                unique_ngrams.append((k, geneid, ', '.join(refseqset)))
                output.writerow([k, geneid, ', '.join(refseqset)])
            else:
                unique_no_common_ngrams.append((k, geneid, ', '.join(refseqset)))
                output_specific.writerow([k, geneid, ', '.join(refseqset)])
        else:
            output_notunique.writerow([k, ', '.join(geneset), ', '.join(refseqset)])

    # isoform specific reads
    for oneunique in unique_no_common_ngrams:
        #print oneunique
        ngram = len(oneunique[0])
        for refseq in oneunique[2].split(', '):
            record = refseq2sequence[refseq]
            refseq2sequence[refseq]['gene'] = oneunique[1]
            pos = 0
            while True:
                pos = record['seq'].find(oneunique[0], pos)
                if pos < 0: break
                record['coverage'][pos] = '_'
                for i in xrange(ngram-1):
                    if record['coverage'][pos+1+i] == '.':
                        record['coverage'][pos+1+i] = ','
                pos += 1


    # calculate coverage
    for oneunique in unique_ngrams:
        #print oneunique
        ngram = len(oneunique[0])
        for refseq in oneunique[2].split(', '):
            record = refseq2sequence[refseq]
            refseq2sequence[refseq]['gene'] = oneunique[1]
            refseq2sequence[refseq]['fragment'] += 1
            pos = 0
            while True:
                pos = record['seq'].find(oneunique[0], pos)
                if pos < 0: break
                record['coverage'][pos] = 'X'
                for i in xrange(ngram-1):
                    if record['coverage'][pos+1+i] != 'X':
                        record['coverage'][pos+1+i] = '*'
                pos += 1

    # write out coverage
    coverage_result = csv.writer(options.coverage_output, delimiter='\t', quotechar=None)
    coverage_result.writerow(['GeneID', 'RefSeqID', 'Length', '# of covered base', '# of unique fragment', '# of found fragmnet', 'coverage', 'cover'])
    for k, v in refseq2sequence.iteritems():
        coverage = v['coverage']
        covered = len([x for x in v['coverage'] if x == 'X' or x == '*'])
        starts = len([x for x in v['coverage'] if x == 'X'])
        assert len(coverage) == len(v['seq'])
        percent = starts/float(len(coverage)-ngram+1)
        coverage_result.writerow([v['gene'], k, len(coverage), covered, v['fragment'], starts, str(percent), ''.join(coverage)])
Beispiel #52
0
 def test_reverse_complement_of_dna(self):
     """A plain DNA string is reverse-complemented to the expected DNA string."""
     dna = "ATGAAACTG"
     expected = "CAGTTTCAT"
     actual = Seq.reverse_complement(dna)
     self.assertEqual(expected, actual)