def generateSeqHandles(anIndexCfg):
    """Build every handle sequence and its reverse complement.

    The parsed YAML configuration is expected to look like::

        handles:
            prefix: "TTAGTCTCCGACGGCAGGCTTCAAT"
            postfix: "ACGCACCCACCGGGACTCAG"
        indexes: [
            "ACAGTC",
            "TGATGC",
            "TCTCAG"
        ]

    A handle is the concatenation ``prefix + index + postfix``::

        TTAGTCTCCGACGGCAGGCTTCAAT-ACAGTC-ACGCACCCACCGGGACTCAG
                  prefix         -index -      postfix

    :param anIndexCfg: mapping with a ``"handles"`` entry (itself holding
        ``"prefix"`` and ``"postfix"``) and an ``"indexes"`` iterable.
    :return: tuple ``(forward, reverse)``; ``forward`` lists the handles in
        index order, ``reverse`` lists their reverse complements as plain
        strings in the same order.
    """
    handle_cfg = anIndexCfg["handles"]
    lead = handle_cfg["prefix"]
    tail = anIndexCfg["handles"]["postfix"]

    forward = [lead + barcode + tail for barcode in anIndexCfg["indexes"]]
    reverse = [str(Seq(handle).reverse_complement()) for handle in forward]

    return (forward, reverse)
def translateSeq(DNASeq):
    """Translate a DNA string as a complete CDS, trying several orientations.

    Each attempt uses codon table 11 with ``cds=True`` and works on the
    sequence left behind by the previous attempt:

    1. the sequence as given;
    2. the result of ``reverseComplement`` on it;
    3. that sequence reversed;
    4. that sequence reversed once more and passed through
       ``reverseComplement`` again.

    :param DNASeq: nucleotide sequence as a string.
    :return: the translation produced by the first attempt that succeeds.
    :raises Exception: whatever the fourth attempt raises when every
        orientation fails.
    """
    seq = DNASeq

    # Only ``Exception`` is caught so that KeyboardInterrupt / SystemExit
    # are no longer swallowed by the retry cascade.
    try:
        return Seq.translate(Seq(seq), table=11, cds=True)
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=11, cds=True)
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = seq[::-1]
        return Seq.translate(Seq(seq), table=11, cds=True)
    except Exception:
        pass  # fall through to the last orientation

    # Last attempt: any failure propagates to the caller unchanged.
    seq = seq[::-1]
    seq = reverseComplement(seq)
    return Seq.translate(Seq(seq), table=11, cds=True)
Esempio n. 3
0
def stitch(fragments):
    """Join fragment records into a donor cassette and describe its primers.

    The fragments are concatenated in order.  The first fragment yields the
    ``Lup``/``Rup`` descriptions, the last one ``Ldown``/``Rdown`` and any
    fragment in between ``L``/``R``.  Only one ``L``/``R`` pair is returned:
    when several interior fragments exist, each overwrites the previous one.

    :param fragments: indexable collection of records exposing ``id`` and
        ``reverse_complement()``.
    :return: 8-tuple of strings: Lup, Rup, Ldown, Rdown, L, R,
        ``"Sequence Length: <n>"`` and ``"Sequence: <donor>"``.
    """
    fragment_total = len(fragments)
    final_position = fragment_total - 1

    print("")

    # Accumulate the donor cassette on top of an empty sequence.
    donor = Seq("")
    for piece in fragments:
        donor = donor + piece

    # Primers that never get assigned are reported as empty strings.
    up_left = up_right = down_left = down_right = mid_left = mid_right = ""

    for position in range(fragment_total):
        current = fragments[position]
        if position == 0:
            up_left = "Lup" + current.id + " " + getPrimer(donor)
            following = fragments[position + 1]
            up_right = (
                "Rup" + current.id + "(" + following.id + ") "
                + overhangPrimer(current.reverse_complement(),
                                 following.reverse_complement())
            )
        elif position == final_position:
            preceding = fragments[position - 1]
            down_left = (
                "Ldown" + current.id + "(" + preceding.id + ") "
                + overhangPrimer(current, preceding)
            )
            down_right = (
                "Rdown" + current.id + " "
                + getPrimer(donor.reverse_complement())
            )
        else:
            preceding = fragments[position - 1]
            mid_left = (
                "L" + current.id + "(" + preceding.id + ") "
                + overhangPrimer(current, preceding)
            )
            following = fragments[position + 1]
            mid_right = (
                "R" + current.id + "(" + following.id + ") "
                + overhangPrimer(current.reverse_complement(),
                                 following.reverse_complement())
            )

    length_text = "Sequence Length: " + str(len(donor.seq))
    sequence_text = "Sequence: " + str(donor.seq)

    return (
        str(up_left),
        str(up_right),
        str(down_left),
        str(down_right),
        str(mid_left),
        str(mid_right),
        length_text,
        sequence_text,
    )
Esempio n. 4
0
def translate(records, translate):
    """
    Lazily translate generic DNA/RNA records into protein records.

    Bio.Seq offers no back-translation because the choice of codons would
    be more or less arbitrary.  ``translate`` names the operation and must
    be one of:
        dna2protein
        dna2proteinstop
        rna2protein
        rna2proteinstop
    The ``stop`` variants translate only up to the first stop codon.

    This is a generator: nothing (including the log line) happens until the
    first item is requested.  The codon table looked up from ``CodonTable``
    is modified in place (its ``forward_table`` is wrapped); no copy is made.
    """
    message = ('Applying translation generator: '
               'operation to perform is ' + translate + '.')
    logging.info(message)

    stop_at_terminator = translate.endswith('stop')
    molecule = translate[:3]

    alphabets = {'dna': IUPAC.ambiguous_dna, 'rna': IUPAC.ambiguous_rna}
    alphabet = alphabets[molecule]

    # Pick the standard ambiguous codon table for the source molecule.
    tables = {'dna': CodonTable.ambiguous_dna_by_name["Standard"],
              'rna': CodonTable.ambiguous_rna_by_name["Standard"]}
    table = tables[molecule]

    # Ambiguous codons are handled by the warning wrapper ('X' replacement).
    # TODO: a deepcopy of the table used to be taken here, but it caused
    # infinite recursion with python3.6.
    table.forward_table = CodonWarningTable(table.forward_table)

    for record in records:
        nucleotides = Seq(str(record.seq), alphabet)
        yield SeqRecord(
            nucleotides.translate(table, to_stop=stop_at_terminator),
            id=record.id,
            description=record.description,
        )
def translateSeq(DNASeq):
    """Translate a DNA string as a complete CDS, trying several orientations.

    Each attempt uses codon table 11 with ``cds=True`` and works on the
    sequence left behind by the previous attempt:

    1. the sequence as given (flag ``False``);
    2. the result of ``reverseComplement`` on it (flag ``True``);
    3. that sequence reversed (flag ``True``);
    4. that sequence reversed once more and passed through
       ``reverseComplement`` again (flag ``False``).

    :param DNASeq: nucleotide sequence as a string.
    :return: tuple ``(protseq, reversedSeq)`` for the first attempt that
        succeeds.
    :raises Exception: whatever the fourth attempt raises when every
        orientation fails.
    """
    seq = DNASeq

    # Only ``Exception`` is caught so that KeyboardInterrupt / SystemExit
    # are no longer swallowed by the retry cascade.
    try:
        return Seq.translate(Seq(seq), table=11, cds=True), False
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=11, cds=True), True
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = seq[::-1]
        return Seq.translate(Seq(seq), table=11, cds=True), True
    except Exception:
        pass  # fall through to the last orientation

    # Last attempt: any failure propagates to the caller unchanged.
    seq = seq[::-1]
    seq = reverseComplement(seq)
    return Seq.translate(Seq(seq), table=11, cds=True), False
Esempio n. 6
0
def bio_seq_count_method(data):
    '''
    Count the four DNA bases with the ``count`` method of BioPython's Seq
    class; building the Seq object is part of the work done here.

    Returns the counts as a tuple in the order A, C, G, T.
    '''
    sequence = Seq(data)
    adenine = sequence.count('A')
    cytosine = sequence.count('C')
    guanine = sequence.count('G')
    thymine = sequence.count('T')
    return adenine, cytosine, guanine, thymine
Esempio n. 7
0
def myTranslate(nucl):
    """Translate a raw nucleotide string and return the raw amino-acid string."""
    return str(Seq(nucl).translate())
def get_reads_seqs(bamfile, rnames):
    """
    Return the sequences of the requested reads from the bam file.

    The file is scanned once and only reads whose name is in ``rnames`` are
    processed, so the whole file is never held in memory.  When several
    records share a name and mate, the last one in file order wins.

    Arguments:
    - `bamfile`: The pysam file
    - `rnames`: reads names

    Returns a tuple ``(r1_seqs, r2_seqs)`` of dicts mapping read name to
    sequence string.  Read-1 sequences are reverse complemented when the
    read is *not* flagged reverse; all other reads are reverse complemented
    when they *are* flagged reverse.
    """
    wanted = set(rnames)
    r1_seqs = {}
    r2_seqs = {}
    for read in bamfile.fetch(until_eof=True):
        if read.qname not in wanted:
            continue
        outseq = Seq(read.seq)
        if read.is_read1:
            if not read.is_reverse:
                outseq = outseq.reverse_complement()
            r1_seqs[read.qname] = str(outseq)
        else:
            if read.is_reverse:
                outseq = outseq.reverse_complement()
            r2_seqs[read.qname] = str(outseq)
    # r1_seqs is the 3' end of the second fused RNA, r2_seqs is the 5' of the
    # first fused RNA
    return r1_seqs, r2_seqs
def translateSeq(DNASeq):
    """Translate a DNA string as a complete CDS, trying several orientations.

    Each attempt uses codon table 11 with ``cds=True`` and works on the
    sequence left behind by the previous attempt:

    1. the sequence as given;
    2. the result of ``reverseComplement`` on it;
    3. that sequence reversed;
    4. that sequence reversed once more and passed through
       ``reverseComplement`` again.

    :param DNASeq: nucleotide sequence as a string.
    :return: the translation produced by the first attempt that succeeds.
    :raises Exception: the error of the fourth attempt, after printing
        ``translated error`` and the error itself.
    """
    seq = DNASeq

    # Only ``Exception`` is caught so that KeyboardInterrupt / SystemExit
    # are no longer swallowed by the retry cascade.
    try:
        return Seq.translate(Seq(seq), table=11, cds=True)
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=11, cds=True)
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = seq[::-1]
        return Seq.translate(Seq(seq), table=11, cds=True)
    except Exception:
        pass  # fall through to the last orientation

    try:
        seq = seq[::-1]
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=11, cds=True)
    except Exception as e:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("translated error")
        print(e)
        raise
def stitch(fragments):
    """Print the primers needed to stitch the fragments into one donor cassette.

    The fragments are concatenated in order.  For the first fragment the
    ``Lup``/``Rup`` primers are printed, for the last one ``Ldown``/``Rdown``
    and for every fragment in between ``L``/``R``.  The length and sequence
    of the assembled donor are printed afterwards.  Nothing is returned.

    :param fragments: indexable collection of records exposing ``name`` and
        ``reverse_complement()``.
    """
    fragment_total = len(fragments)
    final_position = fragment_total - 1

    print("")

    # Accumulate the donor cassette on top of an empty sequence.
    donor = Seq("")
    for piece in fragments:
        donor = donor + piece

    for position in range(fragment_total):
        current = fragments[position]
        if position == 0:
            print("Lup" + current.name + " " + getPrimer(donor))
            following = fragments[position + 1]
            print(
                "Rup" + current.name + "(" + following.name + ") "
                + overhangPrimer(current.reverse_complement(),
                                 following.reverse_complement())
            )
        elif position == final_position:
            preceding = fragments[position - 1]
            print(
                "Ldown" + current.name + "(" + preceding.name + ") "
                + overhangPrimer(current, preceding)
            )
            print("Rdown" + current.name + " "
                  + getPrimer(donor.reverse_complement()))
        else:
            preceding = fragments[position - 1]
            print(
                "L" + current.name + "(" + preceding.name + ") "
                + overhangPrimer(current, preceding)
            )
            following = fragments[position + 1]
            print(
                "R" + current.name + "(" + following.name + ") "
                + overhangPrimer(current.reverse_complement(),
                                 following.reverse_complement())
            )

    # Summary of the assembled cassette.
    print("")
    print("Your donor DNA cassette, has the following bp length and sequence:")
    print("")
    print(len(donor.seq))
    print("")
    print(donor.seq)
    print("")
    print("You might want to copy this entire prompt and save it for your records.")
def insert_element(pos, ref, ofile):
    """Insert a repeat element into a reference sequence and log it as GFF.

    ``pos`` is indexed as: 0 chromosome name, 1 start, 2 end (also the
    insertion point), 3 strand, 4 original id, 5 element sequence, 6 target
    site duplication (TSD), 7 element name.

    One GFF line describing the insertion is written to ``ofile`` and
    ``ref[chromosome]`` is replaced by the sequence carrying the insertion.
    On the ``'+'`` strand the element is inserted as given; otherwise its
    reverse complement is inserted.  The TSD placed after the element is
    copied from the reference immediately before the insertion point (only
    the *length* of the supplied TSD is used).

    :param pos: indexable record as described above.
    :param ref: mutable mapping of chromosome name to sequence string.
    :param ofile: writable text handle receiving the GFF line.
    """
    chrn = pos[0]
    chrseq = ref[chrn]
    half1 = chrseq[:pos[2]]
    half2 = chrseq[pos[2]:]
    repid = pos[4]
    repseq = pos[5]
    reptsd = pos[6]
    repname = pos[7]
    # Example of the source annotation this is derived from:
    # Chr1    not.give        transposable_element_attribute  1132975 1132977 -       .       .       ID=Chr1.1132977.spanners;avg_flankers=17;spanners=0;type=homozygous;TE=mping;TSD=TAA
    gff_newline = '%s\tPseudoGenome\tTransposable_element\t%s\t%s\t%s\t.\t.\tID=%s_%s_%s;Original_ID=%s;TE=%s;TSD=%s;' %(chrn, pos[1], pos[2], pos[3], chrn, pos[1], pos[2], repid, repname, reptsd)
    # ``print >> ofile`` only works on Python 2; write() emits the same
    # line and works on both Python 2 and 3.
    ofile.write(gff_newline + '\n')
    # The sequence found at the target site is used as TSD, not the one
    # supplied in ``pos``.
    tsdstart = pos[2] - len(reptsd)
    tsdseq = chrseq[tsdstart:pos[2]]
    if pos[3] == '+':
        newseq = half1 + repseq + tsdseq + half2
    else:
        repseq_rec = Seq(repseq).reverse_complement()
        newseq = half1 + str(repseq_rec) + tsdseq + half2
    ref[chrn] = newseq
Esempio n. 12
0
def findFragendSites(fasta, resite):
    ''' Function creates FragendDict object. The object contains
    the location of all fragends for each strand of all
    chromosomes within a FASTA file.

    :param fasta: path of the FASTA file to scan.
    :param resite: restriction site sequence (case-insensitive).
    :return: dict with the upper-cased site under ``'resite'`` and, for
        every record, a list under ``(name, '+')`` and ``(name, '-')``.
        ``'+'`` entries are the ``nt_search`` hit positions for the site
        plus the site length; ``'-'`` entries are the hit positions for the
        reverse complement of the site plus one.
    '''
    # Process restriction enzyme site and create output dictionary
    resite = resite.upper()
    frags = {'resite': resite}
    # Create sequence object for resite and reverse complement
    standard = Seq(resite)
    revcomp = standard.reverse_complement()
    site_length = len(resite)
    # The with-block guarantees the input file is closed even when parsing
    # or searching fails part-way through.
    with open(fasta) as fastaHandle:
        for record in SeqIO.parse(fastaHandle, 'fasta'):
            # Extract name and sequence
            fName, fSequence = str(record.id), str(record.seq).upper()
            # The first element returned by nt_search is dropped; the rest
            # are treated as hit positions and shifted as described above.
            forward = nt_search(fSequence, standard)[1:]
            frags[(fName, '+')] = [x + site_length for x in forward]
            reverse = nt_search(fSequence, revcomp)[1:]
            frags[(fName, '-')] = [x + 1 for x in reverse]
    return frags
Esempio n. 13
0
def translate(records, translate):
    """
    Lazily translate generic DNA/RNA records into protein records.

    Bio.Seq offers no back-translation because the choice of codons would
    be more or less arbitrary.  ``translate`` names the operation and must
    be one of:
        dna2protein
        dna2proteinstop
        rna2protein
        rna2proteinstop
    The ``stop`` variants translate only up to the first stop codon.

    This is a generator: nothing (including the log line) happens until the
    first item is requested.  The standard codon table object taken from
    ``CodonTable`` is modified in place (its ``forward_table`` is replaced).
    """
    message = ('Applying translation generator: '
               'operation to perform is ' + translate + '.')
    logging.info(message)

    stop_at_terminator = translate.endswith('stop')
    molecule = translate[:3]

    alphabets = {'dna': generic_dna, 'rna': generic_rna}
    alphabet = alphabets[molecule]

    # Pick the standard codon table for the source molecule.
    tables = {'dna': CodonTable.standard_dna_table,
              'rna': CodonTable.standard_rna_table}
    table = tables[molecule]

    # Ambiguous codons are handled by the warning dict ('X' replacement).
    warning_forward = CodonWarningDict()
    warning_forward.update(table.forward_table)
    table.forward_table = warning_forward

    for record in records:
        nucleotides = Seq(str(record.seq), alphabet)
        yield SeqRecord(
            nucleotides.translate(table, to_stop=stop_at_terminator),
            id=record.id,
            description=record.description,
        )
Esempio n. 14
0
def filter_low_complexity(s, o):
    """Copy reads from ``s`` to ``o``, dropping low-complexity and biased reads.

    A read counts as *low complexity* when two or more of the bases
    A, C, T, G do not occur in it, and as *biased* when a single base makes
    up 90% or more of its length.  All other reads are written with
    ``writeread``.  A summary of the counts is written to stderr, one
    message per line.

    :param s: input source handed to ``readread``.
    :param o: output target handed to ``writeread``.
    """
    biased = 0
    low_complexity = 0
    total_reads = 0
    remaining_reads = 0
    sread = readread(s)
    # The loop ends when the first item of the record is falsy; the second
    # item is used as the read sequence.
    while sread[0]:
        total_reads += 1
        my_read = Seq(sread[1], generic_dna)
        count_list = [my_read.count(base) for base in "ACTG"]
        if count_list.count(0) < 2:
            # float() forces true division so the 0.9 threshold also works
            # under Python 2 integer division.
            if max(count_list) / float(len(my_read)) < 0.9:
                writeread(sread, o)
                remaining_reads += 1
            else:
                biased += 1
        else:
            low_complexity += 1
        sread = readread(s)
    removed = biased + low_complexity
    # Real newlines: the previous literals emitted a backslash followed by
    # "n", which ran all five messages together on one line.
    sys.stderr.write("Total reads processed: %s\n" % total_reads)
    sys.stderr.write("Low complexity reads removed: %s\n" % low_complexity)
    sys.stderr.write("Biased reads removed: %s\n" % biased)
    sys.stderr.write("Total reads removed: %s\n" % removed)
    sys.stderr.write("Total reads remaining: %s\n" % remaining_reads)
Esempio n. 15
0
def get_seq_meta(g, request):
    """Return the sequence metadata of ``g`` as a JSON response.

    The payload holds the sequence length (``len``), one entry per feature
    (``feats``: start, end, strand, type and qualifiers) and a complement
    lookup table for the ambiguous DNA alphabet in both letter cases
    (``alpha``).  Strand is 1 for direction ``'f'``, -1 for ``'r'`` and
    ``None`` otherwise.  ``request`` is not used.
    """
    feats = []
    for feature in g.features.all():
        qualifiers = [
            {'name': qualifier.name, 'data': qualifier.data}
            for qualifier in feature.qualifiers.all()
        ]

        if feature.direction == 'f':
            strand = 1
        elif feature.direction == 'r':
            strand = -1
        else:
            strand = None

        feats.append({
            'start': feature.start,
            'end': feature.end,
            'strand': strand,
            'type': feature.type,
            'qualifiers': qualifiers,
        })

    # Ambiguous DNA is assumed when building the complement table.
    letters = Seq(IUPAC.IUPACAmbiguousDNA.letters, IUPAC.IUPACAmbiguousDNA())
    partners = letters.complement()
    alpha = {}
    for base, partner in zip(letters, partners):
        alpha[base.lower()] = partner.lower()
        alpha[base.upper()] = partner.upper()

    payload = {
        'len': len(g.sequence),
        'feats': feats,
        'alpha': alpha,
    }
    return JsonResponse(payload)
def translateSeq(DNASeq):
    """Translate a DNA string as a complete CDS, trying several orientations.

    Each attempt uses codon table 11 with ``cds=True`` and works on the
    sequence left behind by the previous attempt:

    1. the sequence as given (flag ``False``);
    2. the result of ``reverseComplement`` on it (flag ``True``);
    3. that sequence reversed (flag ``True``);
    4. that sequence reversed once more and passed through
       ``reverseComplement`` again (flag ``False``).

    :param DNASeq: nucleotide sequence as a string.
    :return: tuple ``(protseq, seq, reversedSeq)`` where ``seq`` is the
        sequence in the orientation that was translated.  When every
        attempt fails, ``translated error`` and the error are printed and
        ``protseq`` is the empty string.
    """
    seq = DNASeq
    tableid = 11

    # Only ``Exception`` is caught so that KeyboardInterrupt / SystemExit
    # are no longer swallowed by the retry cascade.
    try:
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, False
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, True
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = seq[::-1]
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, True
    except Exception:
        pass  # fall through to the last orientation

    try:
        seq = seq[::-1]
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, False
    except Exception as e:
        # Single-argument print() behaves the same on Python 2 and 3.
        print("translated error")
        print(e)
        return "", seq, False
Esempio n. 17
0
def translate(dna_seq, phase, mitochondrial=False, strip_stop=True, verbose=False):
    """Translate ``dna_seq``, searching for a reading phase without internal stops.

    Starting from ``phase`` (negative values are treated as 0), the phases
    ``phase``, ``phase + 1`` and ``phase + 2`` (modulo 3) are tried in
    turn, in case the phase assigned by ensembl is wrong.  The first phase
    whose translation is accepted is returned.

    :param dna_seq: nucleotide sequence (sliceable, e.g. a string).
    :param phase: initial phase; converted to an offset by ``phase2offset``.
    :param mitochondrial: use the "Vertebrate Mitochondrial" codon table.
    :param strip_stop: drop one trailing ``*`` and require that no ``*``
        remains; otherwise a ``*`` is only tolerated as the last character.
    :param verbose: print each attempted translation.
    :return: list ``[offset, translated_length, pepseq, phase]``.  When
        stripping the stop leaves an empty peptide the result is
        ``[0, len(dna_seq) - 3, "", phase]``.  ``[-1, 0, "", 0]`` when no
        phase is accepted.
    """
    pepseq = ""
    if phase < 0:
        phase = 0
    initial_phase = phase

    for phase_adjustment in range(3):
        # Offsets are always applied to the *initial* phase.  Accumulating
        # them on the running value visited p, p+1 and then p again, so the
        # third phase was never tried.
        phase = (initial_phase + phase_adjustment) % 3
        offset = phase2offset(phase)
        dnaseq = Seq(dna_seq[offset:], generic_dna)
        # str() replaces Seq.tostring(), which newer Biopython releases no
        # longer provide.
        if mitochondrial:
            pepseq = str(dnaseq.translate(table="Vertebrate Mitochondrial"))
        else:
            pepseq = str(dnaseq.translate())

        if verbose:
            # Single-argument print() behaves the same on Python 2 and 3.
            print(" ** translation for: %s" % dnaseq)
            print(" ** phase: %s" % phase)
            print(" **  %s" % pepseq)

        if strip_stop:
            if pepseq and pepseq[-1] == '*':
                pepseq = pepseq[:-1]
                # the case when we have only an ending piece of codon
                if not pepseq:
                    # offset is 0 - we are reading only one or two
                    # nucleotides from the left
                    length_translated = len(dna_seq) - 3
                    return [0, length_translated, pepseq, phase]
            if '*' not in pepseq:
                return [offset, 3 * len(pepseq), pepseq, phase]
        elif '*' not in pepseq[:-1]:
            return [offset, 3 * len(pepseq), pepseq, phase]

    return [-1, 0, "", 0]
Esempio n. 18
0
 def _process_single_end(self, input_fh, output_fh):
     """Process single-end reads from ``input_fh`` and write them as FASTA.

     For every parsed read the configured steps run in this order: quality
     trimming (FASTQ input with a minimum phred score only), reverse
     complementing, adapter clipping and poly-A clipping.  The counters in
     ``self._stats`` are updated along the way.  Reads shorter than
     ``self._min_read_length`` after processing are counted and skipped;
     all others are written to ``output_fh`` as encoded bytes.
     """
     for header, seq, qualities in self._parse_sequences(input_fh):
         raw_seq_len = len(seq)
         self._stats["total_no_of_reads"] += 1

         if self._fastq and self._min_phred_score is not None:
             seq = self._trim_by_quality(seq, qualities)
         if self._reverse_complement:
             seq = str(Seq(seq).reverse_complement())
         if self._adapter is not None:
             seq = self._clip_adapter(seq)
         if self._poly_a_clipping:
             clipper = self._poly_a_clipper
             seq = clipper.clip_poly_a_strech(seq)
             seq = self._poly_a_clipper.remove_3_prime_a(seq)

         clipped_seq_len = len(seq)
         # Classify the read by how many bases the processing removed.
         bases_removed = raw_seq_len - clipped_seq_len
         if bases_removed == 1:
             self._stats["single_a_removed"] += 1
         elif bases_removed > 1:
             self._stats["polya_removed"] += 1
         else:
             self._stats["unmodified"] += 1

         if clipped_seq_len < self._min_read_length:
             self._stats["too_short"] += 1
             continue

         self._stats["long_enough"] += 1
         before_key = "read_length_before_processing_and_freq"
         after_key = "read_length_after_processing_and_freq"
         self._stats[before_key][raw_seq_len] += 1
         self._stats[after_key][clipped_seq_len] += 1

         # The output is saved via gzip, hence bytes rather than text.
         fasta_entry = ">%s\n%s\n" % (header, seq)
         output_fh.write(fasta_entry.encode())
Esempio n. 19
0
def scanSequences(title,sequence,quality):
    """Run hmmscan on a read and trim it according to the regions found.

    The read and its reverse complement are written to a scratch FASTA file
    named ``hmm.seq<random digits>`` in the current directory, scanned with
    ``doHMMScan`` and the result interpreted by ``processHMMresult``.

    :param title: read title, returned unchanged.
    :param sequence: read sequence string.
    :param quality: quality string, sliced in parallel with ``sequence``.
    :return: tuple ``(title, s, q)``.  When neither ``ns5b_5prime`` nor
        ``ns5b_3prime`` is reported the read is returned untouched.
    """
    tmp = str(random.random())[2:]
    seq = Seq(sequence)
    seq_filename = "hmm.seq" + tmp
    # Write the full-length sequence in both orientations for hmmscan.  The
    # with-block closes the file even if writing fails; str() replaces
    # Seq.tostring(), which newer Biopython releases no longer provide.
    with open(seq_filename, "w") as seq_file:
        seq_file.write(">forward\n" + sequence + "\n>reverse\n" + str(seq.reverse_complement()))

    local_path = os.getcwd()+"/"
    hmmscan_bin = "/usr/local/bin/hmmscan"
    hmmresult_filename = doHMMScan(seq_filename, local_path, hmmscan_bin)
    target_cregions = processHMMresult(hmmresult_filename, local_path)
    s = q = ''
    # NOTE(review): target_cregions is indexed with string keys below, yet
    # it is also compared with 'forward' / 'reverse'.  If it is a mapping
    # those comparisons are never true and every branch but the first is
    # dead, so a read with a detected region comes back as empty strings.
    # The strand probably lives under some key of the result - confirm
    # against processHMMresult before changing this.
    if target_cregions['ns5b_5prime'] =='' and target_cregions['ns5b_3prime'] =='':
        s = sequence
        q = quality
    elif target_cregions['ns5b_5prime'] != '' and target_cregions=='forward':
        x = target_cregions['ns5b_5prime'][0]
        s = sequence[x:]
        q = quality[x:]
    elif target_cregions['ns5b_3prime'] != '' and target_cregions=='forward':
        x = target_cregions['ns5b_3prime'][0]
        s = sequence[:x]
        q = quality[:x]
    elif target_cregions['ns5b_5prime'] and target_cregions=='reverse':
        x = len(sequence)-target_cregions['ns5b_5prime'][0]
        s = sequence[:x]
        q = quality[:x]
    elif target_cregions['ns5b_3prime'] and target_cregions=='reverse':
        # The +1 compensates for the 0-based index of strings.
        x = len(sequence)-target_cregions['ns5b_3prime'][0]+1
        s = sequence[x:]
        q = quality[x:]

    return (title,s,q)
Esempio n. 20
0
def AA_sequence(refDNA_dic,cds_df,gene,seq_type='AA'):
    """Collect the coding or protein sequence of every protein of a gene.

    The rows of ``cds_df`` whose ``geneid`` equals ``gene`` are selected and
    their distinct ``access`` values sorted.  For each one the bases at the
    positions reported by ``trpr.get_trpr_pos`` (1-based) are taken from
    the matching chromosome of ``refDNA_dic``; the sequence is complemented
    when the first position is larger than the second, then translated.

    :return: ``(protein_sequences, accessions)`` when ``seq_type`` is
        ``'AA'``, otherwise ``(nucleotide_sequences, accessions)``.
    """
    # 1. all proteins of the gene
    gene_rows = cds_df[cds_df['geneid'].values==gene]
    prs = sorted(set(gene_rows['access'].tolist()))
    lookup = trpr(gene_rows)

    protein_seqs = []
    transcript_seqs = []
    # 2. one nucleotide / protein sequence per protein
    for accession in prs:
        chromosome = lookup.get_chrom(accession, id_type='access')
        positions = lookup.get_trpr_pos(accession)
        reference = refDNA_dic[chromosome].seq
        bases = ''.join(reference[p - 1] for p in positions)
        coding = Seq(bases, generic_dna)
        if positions[0] > positions[1]:
            # Descending positions: the bases need complementing.
            coding = coding.complement()
        protein = str(coding.translate())
        transcript_seqs.append(str(coding))
        protein_seqs.append(protein)

    chosen = protein_seqs if seq_type == 'AA' else transcript_seqs
    return chosen, prs
Esempio n. 21
0
 def __init__(self, _seq, _gene_id="unknown", _transcript_id=None, _orig_transcript=None, _vars=None):
     """
     Set up the sequence part and the metadata of the protein.

     :param str _seq: String of an IUPACProtein alphabet, representing the
                      protein; it is upper-cased before being stored
     :param str _gene_id: Identifier stored as ``gene_id``
     :param str _transcript_id: ID of the transcript the protein originated
                                from; when omitted, an ID of the form
                                ``Protein_<n>`` is generated
     :param Transcript _orig_transcript: Reference to the originating
                                         transcript
     :param dict(int,list(Variant)) _vars: Nonsynonymous variants that are
                                           associated with the protein.
                                           key=position within protein,
                                           value=list of variants at that pos
     """
     # Initialise both parent types explicitly.
     MetadataLogger.__init__(self)
     Seq.__init__(self, _seq.upper(), IUPAC.IUPACProtein)

     # Own members; {prot-position: list(variant)} for ``vars``.
     self.vars = dict() if _vars is None else _vars
     self.orig_transcript = _orig_transcript
     if _transcript_id is None:
         self.transcript_id = "Protein_%i" % Protein.newid()
     else:
         self.transcript_id = _transcript_id
     self.gene_id = _gene_id
Esempio n. 22
0
            def setup_transcript_data(cls, hdp, tx_ac, pro_ac):
                """Helper generating the RefTranscriptData used for c_to_p.

                :param hdp: data provider offering ``get_tx_identity_info``,
                    ``get_tx_seq`` and ``get_acs_for_protein_seq``.
                :param tx_ac: transcript accession.
                :param pro_ac: protein accession; looked up from the
                    translated sequence when ``None``.
                :return: ``RefTranscriptData`` built from the transcript
                    sequence, the translated protein, the 1-based CDS
                    start/stop and the protein accession.
                :raises hgvs.exceptions.HGVSError: when the transcript info
                    or sequence is missing.
                """
                # NOTE(review): ``var_c`` is not a parameter; it is
                # presumably taken from the enclosing scope and expected to
                # carry the same accession as ``tx_ac`` - confirm.
                tx_info = hdp.get_tx_identity_info(var_c.ac)
                tx_seq = hdp.get_tx_seq(tx_ac)

                if tx_info is None or tx_seq is None:
                    raise hgvs.exceptions.HGVSError("Missing transcript data for accession: {}".format(tx_ac))

                # use 1-based hgvs coords
                cds_start = tx_info["cds_start_i"] + 1
                cds_stop = tx_info["cds_end_i"]

                # Pad the CDS with "N" up to a multiple of three so
                # biopython won't complain during the conversion.  The
                # previous code called "".join() on the None returned by
                # list.extend(), which raised TypeError and would have
                # discarded the padded result anyway.
                tx_seq_to_translate = tx_seq[cds_start - 1 : cds_stop]
                remainder = len(tx_seq_to_translate) % 3
                if remainder != 0:
                    tx_seq_to_translate += "N" * (3 - remainder)

                tx_seq_cds = Seq(tx_seq_to_translate)
                protein_seq = str(tx_seq_cds.translate())

                if pro_ac is None:
                    # get_acs... will always return at least the MD5_ accession
                    pro_ac = hdp.get_acs_for_protein_seq(protein_seq)[0]

                transcript_data = RefTranscriptData(tx_seq, protein_seq, cds_start, cds_stop, pro_ac)

                return transcript_data
Esempio n. 23
0
def multiple_dna(*args):
    """
    Align several DNA sequences with mafft and return its clustal output.

    Each positional argument is a tuple ``(seq_name, seq_frame, seq)``.  A
    negative frame reverse complements the sequence and appends ``(-)`` to
    the name, a positive frame appends ``(+)``, frame 0 leaves both as
    given.  Sequences are upper-cased before being written.

    :return: whatever mafft wrote to standard output.
    :raises ValueError: when called without any sequence (``max`` of an
        empty list).
    """
    seq_name_lengths = []

    # Text mode so that str can be written on Python 3 as well; the
    # with-block removes the temporary file even when something fails.
    with NamedTemporaryFile(mode="w", prefix="mafft_") as input_file:
        for seq_name, seq_frame, seq in args:
            if seq_frame < 0:
                seq_name = "%s(%s)" % (seq_name, "-")
                # str() replaces Seq.tostring(), which newer Biopython
                # releases no longer provide.
                seq = str(Seq(seq).reverse_complement())
            elif seq_frame > 0:
                seq_name = "%s(%s)" % (seq_name, "+")

            input_file.write(">%s\n%s\n" % (seq_name, seq.upper()))
            seq_name_lengths.append(len(seq_name))

        input_file.flush()

        namelength = max(seq_name_lengths) + 4

        # Argument list instead of a shell string: no shell is spawned and
        # a temporary path containing spaces cannot split the command.
        mafft_cmd = [
            "mafft",
            "--genafpair",
            "--maxiterate", "1000",
            "--preservecase",
            "--clustalout",
            "--namelength", str(namelength),
            input_file.name,
        ]
        mafft_proc = Popen(mafft_cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = mafft_proc.communicate()

    return stdout
Esempio n. 24
0
def find_palindromes_variable(seq):
    """
    Go through each even length (starting with 2; even numbers are the only
    lengths capable of being palindromes) and determine if there are
    palindromes in seq.  A palindrome here is a window whose reverse equals
    its complement.

    Return once the entire sequence has been covered or once a length with
    no palindromes is reached (once you find no palindromes there won't be
    any in the future).

    :param seq: DNA sequence string.
    :return: dict mapping each examined length to a dict of
        ``{start index: palindromic substring}``; the last examined length
        may map to an empty dict.
    """
    palindromeDict = {}
    seq_len = len(seq)
    for size in range(2, seq_len + 1, 2):
        hits = palindromeDict[size] = {}
        for i in range(seq_len - size + 1):
            sub_seq = seq[i:i + size]
            biopy_seq = Seq(sub_seq, generic_dna)
            # str() replaces Seq.tostring(), which newer Biopython
            # releases no longer provide.
            if sub_seq[::-1] == str(biopy_seq.complement()):
                hits[i] = sub_seq
        if not hits:
            # stop at the first length without palindromes
            return palindromeDict
    return palindromeDict
Esempio n. 25
0
def translater(DNA,req):
    """Translate ``DNA`` and write the result to ``req`` as an HTML fragment.

    The response content type is set to ``text/html``; the translation is
    shown in a textarea followed by a form with a back button.
    """
    req.content_type = 'text/html'
    protein = Seq(DNA, generic_dna).translate()
    fragments = (
        "Hieronder staat de translatie:",
        "<BR><textarea>"+str(protein)+"</textarea><BR>",
        '<form action="http://cytosine.nl/~owe8_pg1/Thierry/afvink1.py"><BR><input type="submit" value="Terug"></form>',
    )
    for fragment in fragments:
        req.write(fragment)
def make_consensus( rev_string, for_string, seqfile):
    """Align a forward and a reverse read and return their consensus.

    Whitespace (newline, carriage return, space) is stripped from both
    strings and the reverse read is reverse complemented.  The pair is
    written to ``results/<seqfile>.fasta``, aligned with clustalw into
    ``results/<seqfile>.aln`` and summarised with ``dumb_consensus``.

    :param rev_string: reverse read sequence.
    :param for_string: forward read sequence.
    :param seqfile: base name used for the files under ``results/``.
    :return: the consensus produced by ``dumb_consensus``.
    """
    # make fasta file for each paired sequence
    rev_sequence = Seq(rev_string.replace("\n", "").replace('\r', '').replace(' ', ''), IUPAC.ambiguous_dna)
    rev_sequence = rev_sequence.reverse_complement()
    for_sequence = Seq(for_string.replace("\n", "").replace('\r', '').replace(' ', ''), IUPAC.ambiguous_dna)
    paired_sequences = [SeqRecord(rev_sequence, id="rev"), SeqRecord(for_sequence, id="for")]
    if not os.path.exists("results/"):
        os.makedirs("results/")
    fasta_file = "results/" + seqfile + ".fasta"
    SeqIO.write(paired_sequences, fasta_file, "fasta")
    # align the paired sequences
    aln_file = "results/" + seqfile + ".aln"
    clustalw_cline = ClustalwCommandline("clustalw", infile=fasta_file, outfile=aln_file, pwgapopen=100, gapopen=100)
    clustalw_cline()
    # hack so that dumb_consensus will accept 1 base call against N:
    # every "N" in the alignment file is rewritten as ".".  The with-blocks
    # close the handles even if reading or writing fails.
    with open(aln_file, 'r') as aln_handle:
        contents = aln_handle.read()
    with open(aln_file, 'w') as aln_handle:
        aln_handle.write(contents.replace('N', '.'))
    # read in alignment file and generate consensus
    alignment = AlignIO.read(aln_file, "clustal")
    summary_align = AlignInfo.SummaryInfo(alignment)
    return summary_align.dumb_consensus(ambiguous = "N", threshold=0.0, require_multiple=0)
Esempio n. 27
0
def translateSeq(DNASeq,transTable):
    """Translate a DNA string as a complete CDS, trying several orientations.

    Each attempt uses codon table ``transTable`` with ``cds=True`` and
    works on the sequence left behind by the previous attempt:

    1. the sequence as given (flag ``False``);
    2. the result of ``reverseComplement`` on it (flag ``True``);
    3. that sequence reversed (flag ``True``);
    4. that sequence reversed once more and passed through
       ``reverseComplement`` again (flag ``False``).

    :param DNASeq: nucleotide sequence as a string.
    :param transTable: translation table identifier handed to Biopython.
    :return: tuple ``(protseq, seq, reversedSeq)`` where ``seq`` is the
        sequence in the orientation that was translated.
    :raises ValueError: wrapping the error of the fourth attempt when every
        orientation fails.
    """
    seq = DNASeq
    tableid = transTable

    # Only ``Exception`` is caught so that KeyboardInterrupt / SystemExit
    # are no longer swallowed by the retry cascade.
    try:
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, False
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, True
    except Exception:
        pass  # fall through to the next orientation

    try:
        seq = seq[::-1]
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, True
    except Exception:
        pass  # fall through to the last orientation

    try:
        seq = seq[::-1]
        seq = reverseComplement(seq)
        return Seq.translate(Seq(seq), table=tableid, cds=True), seq, False
    except Exception as e:
        raise ValueError(e)
Esempio n. 28
0
def _translate_in_frame(name, seq, frame):
    """Return ``(name, seq)`` translated in ``frame``; frame 0 passes through."""
    if frame < 0:
        nucleotides = Seq(seq).reverse_complement()[-frame - 1:]
    elif frame > 0:
        nucleotides = Seq(seq)[frame - 1:]
    else:
        return name, seq
    # str() replaces Seq.tostring(), which newer Biopython releases no
    # longer provide.
    return name + "(" + str(frame) + ")", str(nucleotides.translate())


def pairwise_protein(query_name, query_seq, query_frame, subject_name, subject_seq, subject_frame):
    """Align the translations of two sequences with mafft.

    A non-zero frame translates the sequence starting at base
    ``abs(frame)`` (negative frames use the reverse complement) and appends
    ``(<frame>)`` to the name.  With frame 0 the sequence is used as given.
    Both sequences are upper-cased before being written.

    :return: whatever mafft wrote to standard output (clustal format).
    """
    query_name, query_seq = _translate_in_frame(query_name, query_seq, query_frame)
    subject_name, subject_seq = _translate_in_frame(subject_name, subject_seq, subject_frame)

    # Text mode so that str can be written on Python 3 as well; the
    # with-block removes the temporary file even when something fails.
    with NamedTemporaryFile(mode="w", prefix="mafft_") as input_file:
        input_file.write("\n".join([">" + query_name, query_seq.upper(), ">" + subject_name, subject_seq.upper()]))
        input_file.flush()

        namelength = max([len(query_name), len(subject_name)]) + 4

        # Argument list instead of a shell string: no shell is spawned and
        # a temporary path containing spaces cannot split the command.
        mafft_cmd = [
            "mafft",
            "--preservecase",
            "--clustalout",
            "--namelength", str(namelength),
            input_file.name,
        ]
        mafft_proc = Popen(mafft_cmd, stdout=PIPE, stderr=PIPE)
        stdout, stderr = mafft_proc.communicate()

    return stdout
Esempio n. 29
0
def prepend_barcode(seqfile, bcfile, rc, text=''):
    """Prepend each read's barcode to its sequence, rewriting ``seqfile``.

    Both FASTQ files are walked in parallel.  For every read of
    ``seqfile`` the barcode iterator is advanced until the first
    whitespace-delimited token of both headers matches; barcode sequence
    and quality are then written in front of the read's sequence and
    quality.  The result is written to ``<seqfile>.tmp`` and renamed over
    ``seqfile`` at the end.

    :param seqfile: path of the FASTQ file to rewrite.
    :param bcfile: path of the FASTQ file holding the barcodes.
    :param rc: when true, reverse complement the barcode and reverse its
        quality string.
    :param text: optional suffix appended to the read name after a ``.``.
    :raises StopIteration: when either file holds no record at all.
    """
    tmp_path = seqfile + '.tmp'
    # All three handles are closed by the with-block, also on error.
    with open(tmp_path, 'w') as tmph, open(seqfile) as seq_handle, open(bcfile) as bc_handle:
        itr1 = FastqGeneralIterator(seq_handle)
        itr2 = FastqGeneralIterator(bc_handle)
        # next() works on Python 2 and 3; the .next() method is Python 2 only.
        (h1, s1, q1) = next(itr1)
        (h2, s2, q2) = next(itr2)
        while 1:
            h1 = h1.split()[0]
            h2 = h2.split()[0]
            # NOTE(review): if the barcode file runs out while searching,
            # the read is still written with the last barcode seen.
            while h1 != h2:
                try:
                    (h2, s2, q2) = next(itr2)
                    h2 = h2.split()[0]
                except (StopIteration, IOError):
                    break
            if rc:
                rcs = Seq(s2, generic_dna)
                s2 = rcs.reverse_complement()
                q2 = q2[::-1]
            if text:
                h1 = h1+'.'+text
            tmph.write("@%s\n%s%s\n+\n%s%s\n" %(h1, s2, s1, q2, q1))
            try:
                (h1, s1, q1) = next(itr1)
                (h2, s2, q2) = next(itr2)
            except (StopIteration, IOError):
                break
    os.rename(tmp_path, seqfile)
Esempio n. 30
0
def gen_filter(pos,fpath) :
    """Yield column-filtered records from a FASTA file, skipping gappy ones.

    For every record of ``fpath`` only the residues at the indices in
    ``pos`` are kept.  Records whose fraction of ``-`` characters among
    those residues exceeds ``GAP_LIM`` are skipped; the others are yielded
    as new records carrying the original id.
    """
    for record in SeqIO.parse(fpath, "fasta"):
        residues = "".join([record.seq[index] for index in pos])
        columns = Seq(residues, generic_protein)
        gap_fraction = (columns.count('-') + 0.0) / len(columns)
        if not gap_fraction > GAP_LIM:
            yield SeqRecord(columns, id=record.id)
Esempio n. 31
0
from Bio.Seq import Seq

# Nucleotide Sequences
# Demonstrates complement(), reverse_complement() and transcribe() on Seq
# objects whose text contains a "_" character.  The trailing comments are
# the outputs recorded by the author; they were not re-verified here.
my_dna = Seq("AGTACACTGGTAGGCCTTACAG_T")
print(my_dna)  # AGTACACTGGTAGGCCTTACAG_T
print(my_dna.complement())  # TCATGTGACCATCCGGAATGTC_A
print(my_dna.reverse_complement())  # A_CTGTAAGGCCTACCAGTGTACT
print(my_dna.transcribe())  # AGUACACUGGUAGGCCUUACAG_U

# Same calls on a sequence written with U instead of T.
my_rna = Seq("GAC_U")
print(my_rna)  # GAC_U
# NOTE(review): reverse_complement() is printed twice in a row; one of the
# two calls was possibly meant to be complement() - confirm.
print(my_rna.reverse_complement())  # A_GUC
print(my_rna.reverse_complement())  # A_GUC
print(my_rna.transcribe())  # GAC_U
Esempio n. 32
0
    def test_count_overlap_start_end_GG(self):
        """Check our count_overlap method using GG with variable ends and starts."""
        # Testing Seq() and MutableSeq() with variable start and end arguments
        # Each tuple is (start, end, expected count of overlapping "GG").
        start_end_exp = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]

        testing_seq = "GTAGGGGAG"

        # Seq and MutableSeq must give the same count for every window.
        for start, end, exp in start_end_exp:
            self.assertEqual(
                Seq(testing_seq).count_overlap("GG", start, end), exp)
            self.assertEqual(
                MutableSeq(testing_seq).count_overlap("GG", start, end), exp)

        # Testing Seq() and MutableSeq() with a more heterogeneous sequence
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG"), 5)
        self.assertEqual(MutableSeq("GGGTGGTAGGG").count_overlap("GG"), 5)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("GG", 2, 8), 1)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("GG", -11, 6), 3)
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0)
        self.assertEqual(
            MutableSeq("GGGTGGTAGGG").count_overlap("GG", 7, 2), 0)
        # Only Seq is checked for this last window.
        self.assertEqual(Seq("GGGTGGTAGGG").count_overlap("GG", -2, -10), 0)

        # Testing UnknownSeq() with variable start and end arguments
        # Each tuple is (character, start, end, expected count); the "N"
        # rows appear twice in the table.
        char_start_end_exp = [
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ]

        for char, start, end, exp in char_start_end_exp:
            self.assertEqual(
                UnknownSeq(12, character=char).count_overlap("GG", start, end),
                exp)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # Testing UnknownSeq() with some more cases including unusual edge cases
        # Each tuple is (substring, start, end, expected count); every case
        # expects zero matches in a length-7 UnknownSeq of "N".
        substr_start_end_exp = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]

        for substr, start, end, exp in substr_start_end_exp:
            self.assertEqual(
                UnknownSeq(7, character="N").count_overlap(substr, start, end),
                exp)
        # Start given without an end.
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)
Esempio n. 33
0
 def test_str_count(self):
     """Check ``count`` matches the python string method and rejects non-strings."""
     self._test_method("count", start_end=True)
     sevens = Seq("AC777GT").count("7")
     self.assertEqual(sevens, 3)
     # Integer and None arguments must be refused with TypeError.
     for invalid_argument in (7, None):
         counter = Seq("AC777GT").count
         with self.assertRaises(TypeError):
             counter(invalid_argument)
Esempio n. 34
0
class StringMethodTests(unittest.TestCase):
    """Tests comparing Seq-like objects against plain ``str`` behaviour."""

    # Fixture sequences shared by the tests in this class.
    _examples = [
        # These are length 9, a multiple of 3 for translation tests:
        Seq("ACGTGGGGT"),
        Seq("ACGUGGGGU"),
        Seq("GG"),
        Seq("A"),
        UnknownSeq(1),
        UnknownSeq(1, character="n"),
        UnknownSeq(1, character="N"),
        UnknownSeq(12, character="N"),
        UnknownSeq(12, character="X"),
        UnknownSeq(12),
    ]
    # Append a MutableSeq copy of every entry that is a Seq instance.  The
    # loop iterates over a slice copy so the appends do not extend it.
    for seq in _examples[:]:
        if isinstance(seq, Seq):
            _examples.append(MutableSeq(seq))
    # Values tried as start/end arguments and as slice bounds by the tests.
    _start_end_values = [0, 1, 2, 1000, -1, -2, -999, None]

    def _test_method(self,
                     method_name,
                     pre_comp_function=None,
                     start_end=False):
        """Check this method matches the plain string's method.

        Every ordered pair drawn from ``self._examples`` is tried.  The
        method is called on the first example with the second as argument
        (once as a plain string, once as the object itself) and the outcome
        is compared with the same call made on ``str(first)``.  A ValueError
        raised by a call is treated as that call's outcome, so both sides
        must either raise it or return equal values.

        Arguments:
         - method_name - name of the method to call, e.g. "count".
         - pre_comp_function - optional callable applied to each return
           value before comparing (e.g. ``str``); defaults to a no-op.
         - start_end - if true, also repeat the string-argument call with
           each start, and each start/end pair, from
           ``self._start_end_values`` (skipped for MutableSeq examples).

        Raises ValueError describing the first call whose outcomes differ.
        """
        if pre_comp_function is None:
            # Define a no-op function:

            def pre_comp_function(x):
                return x

        def outcome(obj, *args):
            """Return the processed call result, or ValueError if raised."""
            try:
                return pre_comp_function(getattr(obj, method_name)(*args))
            except ValueError:
                return ValueError

        def compare(seq, text, sub, sub_text, *bounds):
            """Raise ValueError if seq and text disagree for this call."""
            got = outcome(seq, sub, *bounds)
            want = outcome(text, sub_text, *bounds)
            if got != want:
                # repr() is used for every argument (rather than "%i")
                # because start/end may be None.
                shown = ", ".join(repr(arg) for arg in (sub,) + bounds)
                raise ValueError("%r.%s(%s) = %r, not %r" %
                                 (seq, method_name, shown, got, want))

        self.assertIsInstance(method_name, str)
        for example1 in self._examples:
            if not hasattr(example1, method_name):
                # e.g. MutableSeq does not support find
                continue
            str1 = str(example1)

            for example2 in self._examples:
                if not hasattr(example2, method_name):
                    # e.g. MutableSeq does not support find
                    continue
                if (method_name in ("index", "rindex")
                        and isinstance(example1, MutableSeq)
                        and len(example2) > 1):
                    # MutableSeq index only supports single entries
                    continue
                str2 = str(example2)

                # Argument as a plain string, then as the object itself:
                compare(example1, str1, str2, str2)
                compare(example1, str1, example2, str2)

                if not start_end or isinstance(example1, MutableSeq):
                    # MutableSeq does not support start/end arguments
                    continue
                for start in self._start_end_values:
                    compare(example1, str1, str2, str2, start)
                    for end in self._start_end_values:
                        compare(example1, str1, str2, str2, start, end)

    def test_str_count(self):
        """Verify count() agrees with the built-in str.count()."""
        # Compare against str for all example pairs and start/end values.
        self._test_method("count", start_end=True)
        digits = Seq("AC777GT")
        self.assertEqual(digits.count("7"), 3)
        # Neither an int nor None is accepted as the search term.
        for bad_sub in (7, None):
            self.assertRaises(TypeError, digits.count, bad_sub)

    def test_count_overlap(self):
        """Contrast count_overlap() with count() and check bad arguments."""
        sevens = Seq("AC777GT")
        # count() finds one "77" here, count_overlap() finds two.
        self.assertEqual(sevens.count("77"), 1)
        self.assertEqual(sevens.count_overlap("77"), 2)
        self.assertEqual(sevens.count_overlap("7"), 3)
        # Neither an int nor None is accepted as the search term.
        for bad_sub in (7, None):
            with self.assertRaises(TypeError):
                sevens.count_overlap(bad_sub)

    def test_str_count_overlap_GG(self):
        """Check our count_overlap method using GG."""
        # Expected "GG" counts for the first ten entries of self._examples,
        # in order: only ACGTGGGGT, ACGUGGGGU and GG contain the motif.
        expected = [3, 3, 1] + [0] * 7
        expected *= 2  # MutableSeq() Tests

        # A unittest assertion (not a bare assert) keeps this guard active
        # under "python -O"; without it zip() below would silently stop at
        # the shorter of the two lists.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term GG as a string
            self.assertEqual(seq.count_overlap("GG"), exp)
            self.assertEqual(seq.count_overlap("G" * 5), 0)
            # Using search term GG as a Seq
            self.assertEqual(seq.count_overlap(Seq("GG")), exp)
            self.assertEqual(seq.count_overlap(Seq("G" * 5)), 0)

    def test_count_overlap_start_end_GG(self):
        """Check count_overlap("GG", ...) over a range of start/end values."""
        # Seq and MutableSeq: (start, end, expected count)
        bounded_cases = [
            (1, 7, 3),
            (3, None, 3),
            (3, 6, 2),
            (4, 6, 1),
            (4, -1, 2),
            (-5, None, 2),
            (-5, 7, 2),
            (7, -5, 0),
            (-100, None, 3),
            (None, 100, 3),
            (-100, 1000, 3),
        ]
        text = "GTAGGGGAG"
        for lo, hi, count in bounded_cases:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(text).count_overlap("GG", lo, hi), count)

        # Seq and MutableSeq with a more heterogeneous sequence; each entry
        # is (positional arguments after the motif, expected count).
        mixed = "GGGTGGTAGGG"
        mixed_cases = [((), 5), ((2, 8), 1), ((-11, 6), 3), ((7, 2), 0)]
        for bounds, count in mixed_cases:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("GG", *bounds), count)
        self.assertEqual(Seq(mixed).count_overlap("GG", -2, -10), 0)

        # UnknownSeq of length 12: (character, start, end, expected count)
        unknown12_cases = [
            ("N", 1, 7, 0),
            ("N", 1, 7, 0),
            ("N", -4, None, 0),
            ("N", -4, None, 0),
            ("X", 1, 7, 0),
        ]
        for letter, lo, hi, count in unknown12_cases:
            unknown = UnknownSeq(12, character=letter)
            self.assertEqual(unknown.count_overlap("GG", lo, hi), count)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("GG", 1, 7), 0)

        # UnknownSeq of length 7, including unusual edge cases:
        # (search term, start, end, expected count)
        unknown7_cases = [
            ("G", 100, 105, 0),
            ("G", -1, 4, 0),
            ("G", 4, -1, 0),
            ("G", -8, -2, 0),
            ("G", -2, -8, 0),
            ("G", 8, 2, 0),
            ("G", 2, 8, 0),
            ("GG", 8, 2, 0),
            ("GG", 2, 8, 0),
            ("GG", -5, -1, 0),
            ("GG", 1, 5, 0),
            ("GGG", None, None, 0),
            ("GGGGGGGGG", None, None, 0),
            ("GGG", 1, 2, 0),
        ]
        for sub, lo, hi, count in unknown7_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(sub, lo, hi), count)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("GG", 1), 0)

    def test_str_count_overlap_NN(self):
        """Check our count_overlap method using NN."""
        # Expected "NN" counts for the first ten entries of self._examples,
        # in order: only UnknownSeq(12, character="N") (the eighth entry)
        # contains the motif, at 11 overlapping positions.
        expected = [0] * 7 + [11, 0, 0]
        expected *= 2  # MutableSeq() Tests

        # A unittest assertion (not a bare assert) keeps this guard active
        # under "python -O"; without it zip() below would silently stop at
        # the shorter of the two lists.
        self.assertEqual(len(self._examples), len(expected))

        for seq, exp in zip(self._examples, expected):
            # Using search term NN as a string
            self.assertEqual(seq.count_overlap("NN"), exp)
            self.assertEqual(seq.count_overlap("N" * 13), 0)
            # Using search term NN as a Seq
            self.assertEqual(seq.count_overlap(Seq("NN")), exp)
            self.assertEqual(seq.count_overlap(Seq("N" * 13)), 0)

    def test_count_overlap_start_end_NN(self):
        """Check count_overlap("NN", ...) over a range of start/end values."""
        # Seq and MutableSeq: (start, end, expected count)
        bounded_cases = [
            (1, 7, 0),
            (3, None, 0),
            (3, 6, 0),
            (4, 6, 0),
            (4, -1, 0),
            (-5, None, 0),
            (-5, 7, 0),
            (7, -5, 0),
            (-100, None, 0),
            (None, 100, 0),
            (-100, 1000, 0),
        ]
        text = "GTAGGGGAG"
        for lo, hi, count in bounded_cases:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(text).count_overlap("NN", lo, hi), count)

        # Seq and MutableSeq with a more heterogeneous sequence; each entry
        # is (positional arguments after the motif, expected count).
        mixed = "GGGTGGTAGGG"
        mixed_cases = [((), 0), ((2, 8), 0), ((-11, 6), 0), ((7, 2), 0)]
        for bounds, count in mixed_cases:
            for seq_type in (Seq, MutableSeq):
                self.assertEqual(
                    seq_type(mixed).count_overlap("NN", *bounds), count)
        self.assertEqual(Seq(mixed).count_overlap("NN", -10, -2), 0)

        # UnknownSeq of length 12: (character, start, end, expected count)
        unknown12_cases = [
            ("N", 1, 7, 5),
            ("N", 1, 7, 5),
            ("N", -4, None, 3),
            ("N", -4, None, 3),
            ("X", 1, 7, 0),
        ]
        for letter, lo, hi, count in unknown12_cases:
            unknown = UnknownSeq(12, character=letter)
            self.assertEqual(unknown.count_overlap("NN", lo, hi), count)
        self.assertEqual(
            UnknownSeq(12, character="X").count_overlap("NN", 1, 7), 0)

        # UnknownSeq of length 7, including unusual edge cases:
        # (search term, start, end, expected count)
        unknown7_cases = [
            ("N", 100, 105, 0),
            ("N", -1, 4, 0),
            ("N", 4, -1, 2),
            ("N", -8, -2, 5),
            ("N", -2, -8, 0),
            ("N", 8, 2, 0),
            ("N", 2, 8, 5),
            ("NN", 8, 2, 0),
            ("NN", 2, 8, 4),
            ("NN", -5, -1, 3),
            ("NN", 1, 5, 3),
            ("NNN", None, None, 5),
            ("NNNNNNNNN", None, None, 0),
            ("NNN", 1, 2, 0),
        ]
        for sub, lo, hi, count in unknown7_cases:
            unknown = UnknownSeq(7, character="N")
            self.assertEqual(unknown.count_overlap(sub, lo, hi), count)
        self.assertEqual(
            UnknownSeq(7, character="N").count_overlap("NN", 1), 5)

    def test_str_find(self):
        """Verify find() gives the same answers as str.find()."""
        self._test_method("find", start_end=True)
        with_digit = Seq("AC7GT")
        self.assertEqual(with_digit.find("7"), 2)
        # An int or None search term must raise TypeError.
        with self.assertRaises(TypeError):
            with_digit.find(7)
        with self.assertRaises(TypeError):
            Seq("ACGT").find(None)

    def test_str_rfind(self):
        """Verify rfind() gives the same answers as str.rfind()."""
        self._test_method("rfind", start_end=True)
        with_digit = Seq("AC7GT")
        self.assertEqual(with_digit.rfind("7"), 2)
        # An int or None search term must raise TypeError.
        with self.assertRaises(TypeError):
            with_digit.rfind(7)
        with self.assertRaises(TypeError):
            Seq("ACGT").rfind(None)

    def test_str_index(self):
        """Verify index() gives the same answers as str.index()."""
        self._test_method("index", start_end=True)
        with_digit = Seq("AC7GT")
        self.assertEqual(with_digit.index("7"), 2)
        # An int or None search term must raise TypeError.
        with self.assertRaises(TypeError):
            with_digit.index(7)
        with self.assertRaises(TypeError):
            Seq("ACGT").index(None)

    def test_str_rindex(self):
        """Verify rindex() gives the same answers as str.rindex()."""
        self._test_method("rindex", start_end=True)
        with_digit = Seq("AC7GT")
        self.assertEqual(with_digit.rindex("7"), 2)
        # An int or None search term must raise TypeError.
        with self.assertRaises(TypeError):
            with_digit.rindex(7)
        with self.assertRaises(TypeError):
            Seq("ACGT").rindex(None)

    def test_str_startswith(self):
        """Verify startswith() behaves like str.startswith()."""
        self._test_method("startswith", start_end=True)
        self.assertTrue("ABCDE".startswith(("ABE", "OBE", "ABC")))
        with self.assertRaises(TypeError):
            Seq("ACGT").startswith(None)

        # Tuples of candidate prefixes, given as sequences and as strings.
        for seq in self._examples:
            if not hasattr(seq, "startswith"):
                # e.g. MutableSeq does not support this
                continue
            text = str(seq)
            # Two-letter slices taken at every third position.
            pieces = tuple(
                seq[pos:pos + 2] for pos in range(0, len(seq) - 2, 3))
            piece_strs = tuple(str(piece) for piece in pieces)

            # (start/end arguments, tuple handed to the sequence method)
            variants = [
                ((), pieces),
                ((), piece_strs),  # strings!
                ((3,), pieces),
                ((2, 6), pieces),
            ]
            for bounds, candidates in variants:
                self.assertEqual(
                    text.startswith(piece_strs, *bounds),
                    seq.startswith(candidates, *bounds))

    def test_str_endswith(self):
        """Check matches the python string endswith method."""
        self._test_method("endswith", start_end=True)
        self.assertTrue("ABCDE".endswith(("ABE", "OBE", "CDE")))
        self.assertRaises(TypeError, Seq("ACGT").endswith, None)

        # Now check with a tuple of sub sequences
        for example1 in self._examples:
            if not hasattr(example1, "endswith"):
                # e.g. MutableSeq does not support this
                continue
            # Two-letter slices taken at every third position.
            subs = tuple(example1[start:start + 2]
                         for start in range(0,
                                            len(example1) - 2, 3))
            subs_str = tuple(str(s) for s in subs)

            self.assertEqual(
                str(example1).endswith(subs_str), example1.endswith(subs))
            # Same check with a tuple of plain strings; this must call
            # endswith (not startswith) on both sides.
            self.assertEqual(
                str(example1).endswith(subs_str),
                example1.endswith(subs_str))  # strings!
            self.assertEqual(
                str(example1).endswith(subs_str, 3),
                example1.endswith(subs, 3))
            self.assertEqual(
                str(example1).endswith(subs_str, 2, 6),
                example1.endswith(subs, 2, 6))

    def test_str_strip(self):
        """Verify strip() behaves like str.strip()."""
        # Results are passed through str() before being compared.
        self._test_method("strip", pre_comp_function=str)
        padded = Seq(" ACGT ")
        self.assertEqual(padded.strip(), "ACGT")
        with self.assertRaises(TypeError):
            Seq("ACGT").strip(7)

    def test_str_rstrip(self):
        """Verify rstrip() behaves like str.rstrip()."""
        # Results are passed through str() before being compared.
        self._test_method("rstrip", pre_comp_function=str)
        padded = Seq(" ACGT ")
        self.assertEqual(padded.rstrip(), " ACGT")
        with self.assertRaises(TypeError):
            Seq("ACGT").rstrip(7)

    def test_str_lstrip(self):
        """Check matches the python string lstrip method."""
        # Compare lstrip itself (not rstrip) against the str method; results
        # are passed through str() before being compared.
        self._test_method("lstrip", pre_comp_function=str)
        self.assertEqual(Seq(" ACGT ").lstrip(), "ACGT ")
        self.assertRaises(TypeError, Seq("ACGT").lstrip, 7)

    def test_str_split(self):
        """Check matches the python string split method."""
        # Calling split should return a list of Seq-like objects, we'll
        # just apply str() to each of them so it matches the string method
        self._test_method(
            "split",
            pre_comp_function=lambda x: [str(y) for y in x]  # noqa: E731
        )
        # Exercise split on both sides (rsplit has its own test).
        self.assertEqual(Seq("AC7GT").split("7"), "AC7GT".split("7"))
        self.assertRaises(TypeError, Seq("AC7GT").split, 7)

    def test_str_rsplit(self):
        """Verify rsplit() behaves like str.rsplit()."""

        def as_strings(parts):
            # rsplit should return a list of Seq-like objects; convert each
            # with str() so the list can be compared with str.rsplit output.
            return [str(part) for part in parts]

        self._test_method("rsplit", pre_comp_function=as_strings)
        self.assertEqual(Seq("AC7GT").rsplit("7"), "AC7GT".rsplit("7"))
        with self.assertRaises(TypeError):
            Seq("AC7GT").rsplit(7)

    def test_str_length(self):
        """Verify len() of each example equals len() of its string form."""
        for seq in self._examples:
            text = str(seq)
            self.assertEqual(len(seq), len(text))

    def test_str_upper(self):
        """Verify upper() behaves like str.upper()."""
        # MutableSeq examples are skipped.
        for seq in self._examples:
            if not isinstance(seq, MutableSeq):
                text = str(seq)
                self.assertEqual(str(seq.upper()), text.upper())

    def test_str_lower(self):
        """Verify lower() behaves like str.lower()."""
        # MutableSeq examples are skipped.
        for seq in self._examples:
            if not isinstance(seq, MutableSeq):
                text = str(seq)
                self.assertEqual(str(seq.lower()), text.lower())

    def test_str_encode(self):
        """Verify encode() behaves like str.encode()."""
        # MutableSeq examples are skipped.
        for seq in self._examples:
            if not isinstance(seq, MutableSeq):
                text = str(seq)
                # Explicit "ascii" codec first, then the default codec.
                self.assertEqual(seq.encode("ascii"), text.encode("ascii"))
                self.assertEqual(seq.encode(), text.encode())

    def test_str_hash(self):
        """Check hash() of each example equals the hash of its string."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                # MutableSeq examples are skipped.
                continue
            with warnings.catch_warnings():
                # Silence change in behaviour warning
                warnings.simplefilter("ignore", BiopythonWarning)
                str1 = str(example1)
                # The message pairs each hash with the value it was computed
                # from: the string for the first, the object for the second.
                self.assertEqual(
                    hash(str1),
                    hash(example1),
                    "Hash mismatch, %r for %r vs %r for %r" %
                    (hash(str1), str1, hash(example1), example1),
                )

    def test_str_comparison(self):
        """Check rich comparisons agree with plain string comparisons."""
        # (operator symbol used in the message, comparison to apply)
        comparisons = [
            ("==", lambda a, b: a == b),
            ("!=", lambda a, b: a != b),
            ("<", lambda a, b: a < b),
            ("<=", lambda a, b: a <= b),
            (">", lambda a, b: a > b),
            (">=", lambda a, b: a >= b),
        ]
        for left in self._examples:
            for right in self._examples:
                with warnings.catch_warnings():
                    for symbol, compare in comparisons:
                        # String result first, then the objects themselves.
                        self.assertEqual(
                            compare(str(left), str(right)),
                            compare(left, right),
                            "Checking %r %s %r" % (left, symbol, right),
                        )

    def test_str_getitem(self):
        """Check slicing and indexing works like a string.

        Every value in ``self._start_end_values`` is used as an index (when
        its magnitude is below the sequence length) and as a slice bound,
        with steps from -3 to 3.  A zero step must raise ValueError.
        """
        for example1 in self._examples:
            str1 = str(example1)
            for i in self._start_end_values:
                if i is not None and abs(i) < len(example1):
                    self.assertEqual(str(example1[i]), str1[i])
                self.assertEqual(str(example1[:i]), str1[:i])
                self.assertEqual(str(example1[i:]), str1[i:])
                for j in self._start_end_values:
                    self.assertEqual(str(example1[i:j]), str1[i:j])
                    for step in range(-3, 4):
                        if step == 0:
                            # assertRaises reports a proper test failure if
                            # the slice unexpectedly succeeds.
                            with self.assertRaises(ValueError):
                                example1[i:j:step]
                        else:
                            self.assertEqual(str(example1[i:j:step]),
                                             str1[i:j:step])

    def test_tomutable(self):
        """Verify a MutableSeq can be built from every example."""
        for source in self._examples:
            mutable = MutableSeq(source)
            # Right type, and the same letters as the source.
            self.assertIsInstance(mutable, MutableSeq)
            self.assertEqual(str(mutable), str(source))

    def test_toseq(self):
        """Verify a Seq can be built from every example."""
        for source in self._examples:
            rebuilt = Seq(source)
            # Right type, and the same letters as the source.
            self.assertIsInstance(rebuilt, Seq)
            self.assertEqual(str(rebuilt), str(source))

    def test_the_complement(self):
        """Check obj.complement() method."""
        # The translation tables do not depend on the example, so build
        # them once instead of on every iteration.
        rna_mapping = str.maketrans("ACGUacgu", "UGCAugca")
        dna_mapping = str.maketrans("ACGTacgt", "TGCAtgca")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.complement()
            except ValueError as e:
                # The only ValueError accepted here is the protein one.
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            if "U" in str1 or "u" in str1:
                mapping = rna_mapping
            else:
                # Default to DNA, e.g. complement("A") -> "T" not "U"
                mapping = dna_mapping
            self.assertEqual(str1.translate(mapping), str(comp))

    def test_the_reverse_complement(self):
        """Check obj.reverse_complement() method."""
        # The translation tables do not depend on the example, so build
        # them once instead of on every iteration.
        rna_mapping = str.maketrans("ACGUacgu", "UGCAugca")
        dna_mapping = str.maketrans("ACGTacgt", "TGCAtgca")
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                comp = example1.reverse_complement()
            except ValueError as e:
                # The only ValueError accepted here is the protein one.
                self.assertEqual(str(e), "Proteins do not have complements!")
                continue
            str1 = str(example1)
            if "U" in str1 or "u" in str1:
                mapping = rna_mapping
            else:
                # Defaults to DNA, so reverse_complement("A") --> "T" not "U"
                mapping = dna_mapping
            # Complement each letter, then reverse the string.
            self.assertEqual(str1.translate(mapping)[::-1], str(comp))

    def test_the_transcription(self):
        """Check obj.transcribe() method."""
        # ValueError messages that mean "not applicable to this example".
        skippable = (
            "Proteins cannot be transcribed!",
            "RNA cannot be transcribed!",
        )
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.transcribe()
            except ValueError as e:
                if str(e) in skippable:
                    continue
                raise
            str1 = str(example1)
            if len(str1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                # NOTE(review): this skips examples whose length is not a
                # multiple of three - confirm that is intended here.
                continue
            self.assertEqual(
                str1.replace("T", "U").replace("t", "u"), str(tran))

    def test_the_back_transcription(self):
        """Check obj.back_transcribe() method."""
        # ValueError messages that mean "not applicable to this example".
        skippable = (
            "Proteins cannot be back transcribed!",
            "DNA cannot be back transcribed!",
        )
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            try:
                tran = example1.back_transcribe()
            except ValueError as e:
                if str(e) in skippable:
                    continue
                raise
            str1 = str(example1)
            self.assertEqual(
                str1.replace("U", "T").replace("u", "t"), str(tran))

    def test_the_translate(self):
        """Check obj.translate() method."""
        for example1 in self._examples:
            if isinstance(example1, MutableSeq):
                continue
            if len(example1) % 3 != 0:
                # TODO - Check for or silence the expected warning?
                continue
            try:
                # Only checks that the default call succeeds; the result
                # itself is not inspected.
                example1.translate()
            except ValueError as e:
                if str(e) == "Proteins cannot be translated!":
                    continue
                raise
            # Try with positional vs named argument:
            self.assertEqual(example1.translate(11),
                             example1.translate(table=11))

            # TODO - check the actual translation, and all the optional args

    def test_the_translation_of_stops(self):
        """Verify translate() output for stop codons under several tables."""
        nuc = Seq("TAATAGTGAAGAAGG")

        # Seq.translate(): (expected, positional args, keyword args)
        method_cases = [
            ("***RR", (), {}),
            ("***RR", (1,), {}),
            ("***RR", ("SGC0",), {}),
            ("**W**", (), {"table": 2}),
            ("**WRR", (), {"table": "Yeast Mitochondrial"}),
            ("**WSS", (), {"table": 5}),
            ("**WSS", (), {"table": 9}),
            ("**CRR", (), {"table": "Euplotid Nuclear"}),
            ("***RR", (), {"table": 11}),
            ("***RR", (), {"table": "11"}),
            ("***RR", (), {"table": "Bacterial"}),
            ("**GRR", (), {"table": 25}),
            ("", (), {"to_stop": True}),
            ("O*ORR", (), {"table": special_table}),
            ("*QWRR", (), {"table": Chilodonella_uncinata_table}),
        ]
        for protein, args, kwargs in method_cases:
            self.assertEqual(protein, str(nuc.translate(*args, **kwargs)))

        # These test the Bio.Seq.translate() function - move these?:
        function_cases = [
            ("*QWRR", {"table": Chilodonella_uncinata_table}),
            ("O*ORR", {"table": special_table}),
            ("", {"to_stop": True}),
            ("***RR", {"table": "Bacterial"}),
            ("***RR", {"table": "11"}),
            ("***RR", {"table": 11}),
            ("**W**", {"table": 2}),
        ]
        for protein, kwargs in function_cases:
            self.assertEqual(protein, translate(str(nuc), **kwargs))

        # Single codons in upper, mixed and lower case: (codon, expected)
        codon_cases = [
            ("TAT", "Y"),
            ("TAR", "*"),
            ("TAN", "X"),
            ("NNN", "X"),
            ("TAt", "Y"),
            ("TaR", "*"),
            ("TaN", "X"),
            ("nnN", "X"),
            ("tat", "Y"),
            ("tar", "*"),
            ("tan", "X"),
            ("nnn", "X"),
        ]
        for codon, amino_acid in codon_cases:
            self.assertEqual(str(Seq(codon).translate()), amino_acid)

    def test_the_translation_of_invalid_codons(self):
        """Verify translate() raises TranslationError for invalid codons."""
        for bad_codon in ("TA?", "N-N", "AC_", "Ac_"):
            nuc = Seq(bad_codon)
            try:
                nuc.translate()
            except TranslationError:
                # Expected outcome; move on to the next codon.
                continue
            self.fail("Translating %s should fail" % bad_codon)

    def test_the_translation_of_ambig_codons(self):
        """Verify translate() of ambiguous codons against their expansions."""
        # Ambiguous protein letters and the translations they must cover.
        # TODO - Use the Bio.Data.IUPACData module for the
        # ambiguous protein mappings?
        ambiguous_protein = {
            "Z": set("EQ"),
            "B": set("DN"),
            "J": set("LI"),
        }
        for letter_map in [ambiguous_dna_values, ambiguous_rna_values]:
            letters = set(letter_map.keys())
            letters.remove("X")
            for first in letters:
                for second in letters:
                    for third in letters:
                        codon = first + second + third
                        # Translations of every unambiguous codon that the
                        # ambiguous codon can stand for.
                        possible = {
                            str(Seq(a + b + c).translate())
                            for a in letter_map[first]
                            for b in letter_map[second]
                            for c in letter_map[third]
                        }
                        translated = str(Seq(codon).translate())
                        if translated == "X":
                            # "X" is only acceptable when the expansions
                            # disagree with each other.
                            self.assertGreater(
                                len(possible),
                                1,
                                "translate('%s') = '%s' not '%s'" %
                                (codon, translated, ",".join(possible)),
                            )
                        else:
                            wanted = ambiguous_protein.get(
                                translated, set(translated))
                            self.assertEqual(possible, wanted)

    def test_init_typeerror(self):
        """Check Seq() raises TypeError for a tuple, a list, an int and a float."""
        bad_arguments = (("A", "C", "G", "T"), ["A", "C", "G", "T"], 1, 1.0)
        for bad_argument in bad_arguments:
            self.assertRaises(TypeError, Seq, bad_argument)

    def test_MutableSeq_init_typeerror(self):
        """Check MutableSeq() raises TypeError for tuple, list, int and float."""
        bad_arguments = (("A", "C", "G", "T"), ["A", "C", "G", "T"], 1, 1.0)
        for bad_argument in bad_arguments:
            self.assertRaises(TypeError, MutableSeq, bad_argument)

    def test_join_Seq_TypeError(self):
        """Check Seq.join raises TypeError for an int or a list holding an int."""
        spacer = Seq("NNNNN")
        # First a non-iterable argument, then an iterable with a bad item.
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            self.assertRaises(TypeError, spacer.join, bad_argument)

    def test_join_UnknownSeq_TypeError_iter(self):
        """Check UnknownSeq.join raises TypeError for an int or a list holding one."""
        spacer = UnknownSeq(5, character="-")
        # First a non-iterable argument, then an iterable with a bad item.
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            self.assertRaises(TypeError, spacer.join, bad_argument)

    def test_join_MutableSeq_TypeError_iter(self):
        """Check MutableSeq.join raises TypeError for an int or a list holding one."""
        spacer = MutableSeq("MMMMM")
        # First a non-iterable argument, then an iterable with a bad item.
        for bad_argument in (5, ["ATG", "ATG", 5, "ATG"]):
            self.assertRaises(TypeError, spacer.join, bad_argument)

    def test_join_Seq(self):
        """Check Seq.join produces the same text as str.join with that spacer."""
        n_spacer = Seq("NNNNN")
        joined_n = n_spacer.join([Seq("NNNNN"), Seq("NNNNN")])
        self.assertEqual("N" * 15, joined_n)

        empty_spacer = Seq("")
        all_spacers = [empty_spacer, Seq("NNNNN"), Seq("GGG")]
        plain_items = ["ATG", "ATG", "ATG", "ATG"]
        mixed_items = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings joined with the empty spacer.
        joined_plain = empty_spacer.join(plain_items)
        self.assertEqual(str(joined_plain), "".join(plain_items))

        for spacer in all_spacers:
            spacer_text = str(spacer)
            joined_mixed = spacer.join(mixed_items)
            self.assertEqual(str(joined_mixed), spacer_text.join(plain_items))
            # A single sequence argument is joined letter by letter.
            for item in plain_items + mixed_items:
                self.assertEqual(
                    spacer_text.join(str(item)), str(spacer.join(item)))

    def test_join_UnknownSeq(self):
        """Check UnknownSeq.join produces the same text as str.join."""
        gap_spacer = UnknownSeq(5, character="-")
        empty_spacer = UnknownSeq(0, character="-")

        # Joining two gap-only sequences with a gap-only spacer.
        all_gaps = gap_spacer.join(
            [UnknownSeq(5, character="-"), UnknownSeq(5, character="-")])
        self.assertEqual("-" * 15, all_gaps)
        # Joining a Seq and a gap-only sequence with the same spacer.
        seq_then_gaps = gap_spacer.join(
            [Seq("NNNNN"), UnknownSeq(5, character="-")])
        self.assertEqual("N" * 5 + "-" * 10, seq_then_gaps)

        plain_items = ["ATG", "ATG", "ATG", "ATG"]
        mixed_items = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings joined with the empty spacer.
        joined_plain = empty_spacer.join(plain_items)
        self.assertEqual(str(joined_plain), "".join(plain_items))

        for spacer in (gap_spacer, empty_spacer):
            spacer_text = str(spacer)
            joined_mixed = spacer.join(mixed_items)
            self.assertEqual(str(joined_mixed), spacer_text.join(plain_items))
            # A single sequence argument is joined letter by letter.
            for item in plain_items + mixed_items:
                self.assertEqual(
                    spacer_text.join(str(item)), str(spacer.join(item)))

    def test_join_MutableSeq_mixed(self):
        """Check MutableSeq objects can be joined."""
        spacer = MutableSeq("NNNNN")
        # Two five-letter MutableSeq objects plus the five-letter spacer
        # must compare equal to fifteen "N" characters.
        self.assertEqual(
            "N" * 15,
            spacer.join([MutableSeq("NNNNN"),
                         MutableSeq("NNNNN")]),
        )
        # NOTE(review): spacer.join(...) below is evaluated *before*
        # assertRaises runs, so assertRaises receives join's return value
        # rather than a callable plus arguments. If join raises, the
        # exception escapes the assertion; if it returns, assertRaises then
        # tries to call that return value. Confirm whether join is really
        # expected to reject a mixed Seq/MutableSeq list, and if so pass
        # ``spacer.join`` and the list as separate arguments.
        self.assertRaises(
            TypeError,
            spacer.join([Seq("NNNNN"), MutableSeq("NNNNN")]),
        )

    def test_join_Seq_with_file(self):
        """Check Seq.join on sequences read from a FASTA file."""
        fasta_path = "Fasta/f003"
        parsed_seqs = [rec.seq for rec in SeqIO.parse(fasta_path, "fasta")]
        parsed_text = [str(one_seq) for one_seq in parsed_seqs]

        n_spacer = Seq("NNNNN")
        empty_spacer = Seq("")
        # Join the sequence objects with the "NNNNN" spacer, then with the
        # empty spacer.
        joined_with_n = n_spacer.join(parsed_seqs)
        joined_with_empty = empty_spacer.join(parsed_seqs)

        # Reference results built with plain str.join.
        expected_with_n = str(n_spacer).join(parsed_text)
        expected_with_empty = str(empty_spacer).join(parsed_text)

        self.assertEqual(str(joined_with_n), expected_with_n)
        self.assertEqual(str(joined_with_empty), expected_with_empty)
        # Handing join() the parser output itself (instead of the list of
        # .seq values) must raise TypeError.
        with self.assertRaises(TypeError):
            n_spacer.join(SeqIO.parse(fasta_path, "fasta"))

    def test_join_UnknownSeq_with_file(self):
        """Check UnknownSeq.join on sequences read from a FASTA file."""
        fasta_path = "Fasta/f003"
        parsed_seqs = [rec.seq for rec in SeqIO.parse(fasta_path, "fasta")]
        parsed_text = [str(one_seq) for one_seq in parsed_seqs]

        empty_spacer = UnknownSeq(0, character="-")
        gap_spacer = UnknownSeq(5, character="-")
        # Join the sequence objects with the zero-length spacer, then with
        # the five-character gap spacer.
        joined_with_empty = empty_spacer.join(parsed_seqs)
        joined_with_gaps = gap_spacer.join(parsed_seqs)

        # Reference results built with plain str.join.
        expected_with_empty = str(empty_spacer).join(parsed_text)
        expected_with_gaps = str(gap_spacer).join(parsed_text)

        self.assertEqual(str(joined_with_empty), expected_with_empty)
        self.assertEqual(str(joined_with_gaps), expected_with_gaps)
        # Handing join() the parser output itself (instead of the list of
        # .seq values) must raise TypeError.
        with self.assertRaises(TypeError):
            empty_spacer.join(SeqIO.parse(fasta_path, "fasta"))

    def test_join_MutableSeq(self):
        """Check MutableSeq.join produces the same text as str.join."""
        # Only expect it to take Seq objects and/or strings in an iterable!
        empty_spacer = MutableSeq("")
        all_spacers = [empty_spacer, MutableSeq("NNNNN"), MutableSeq("GGG")]
        plain_items = ["ATG", "ATG", "ATG", "ATG"]
        mixed_items = ["ATG", "ATG", Seq("ATG"), "ATG"]

        # Plain strings joined with the empty spacer.
        joined_plain = empty_spacer.join(plain_items)
        self.assertEqual(str(joined_plain), "".join(plain_items))

        # Strings mixed with a Seq, joined with each spacer in turn.
        for spacer in all_spacers:
            joined_mixed = spacer.join(mixed_items)
            expected = str(spacer).join(plain_items)
            self.assertEqual(str(joined_mixed), expected)

    def test_join_MutableSeq_with_file(self):
        """Check MutableSeq.join on sequences read from a FASTA file."""
        fasta_path = "Fasta/f003"
        parsed_seqs = [rec.seq for rec in SeqIO.parse(fasta_path, "fasta")]
        parsed_text = [str(one_seq) for one_seq in parsed_seqs]

        n_spacer = MutableSeq("NNNNN")
        empty_spacer = MutableSeq("")
        # Join the sequence objects with the "NNNNN" spacer, then with the
        # empty spacer.
        joined_with_n = n_spacer.join(parsed_seqs)
        joined_with_empty = empty_spacer.join(parsed_seqs)

        # Reference results built with plain str.join.
        expected_with_n = str(n_spacer).join(parsed_text)
        expected_with_empty = str(empty_spacer).join(parsed_text)

        self.assertEqual(str(joined_with_n), expected_with_n)
        self.assertEqual(str(joined_with_empty), expected_with_empty)
        # Handing join() the parser output itself (instead of the list of
        # .seq values) must raise TypeError.
        with self.assertRaises(TypeError):
            n_spacer.join(SeqIO.parse(fasta_path, "fasta"))

    def test_equality(self):
        """Test equality of sequence objects against str, int and None."""
        # Seq and MutableSeq take the same text arguments, so they share
        # one set of checks.
        for seq_type in (Seq, MutableSeq):
            self.assertEqual(seq_type("6"), "6")
            self.assertNotEqual(seq_type("6"), 6)
            self.assertEqual(seq_type(""), "")
            self.assertNotEqual(seq_type(""), None)
            self.assertEqual(seq_type("None"), "None")
            self.assertNotEqual(seq_type("None"), None)

        # UnknownSeq is built from a length (and optional character).
        single_six = UnknownSeq(1, character="6")
        self.assertEqual(single_six, "6")
        self.assertNotEqual(UnknownSeq(1, character="6"), 6)
        zero_length = UnknownSeq(0)
        self.assertEqual(zero_length, "")
        self.assertNotEqual(UnknownSeq(0), None)
Esempio n. 35
0
#!/usr/bin/env python3
# transcribe.py
# Build a short DNA sequence, transcribe it and print the result.
# Import Seq
from Bio.Seq import Seq
# Example DNA sequence to transcribe.
dna = Seq("AGTACACTGGTA")
# transcribe() returns the transcribed sequence; ``dna`` itself is not
# reassigned.
rna = dna.transcribe()
print(rna)
Esempio n. 36
0
# 4.5.4.calc_melting_temperature.py
# Print the melting temperature of a short DNA sequence as computed by
# MeltingTemp.Tm_Wallace.
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Seq import Seq

myseq = Seq("AGTCTGGGACGGCGCGGCAATCGCA")
print(mt.Tm_Wallace(myseq))  # prints 84.0
Esempio n. 37
0
def rev_comp(seq):
    """Return the reverse complement of ``seq`` as a plain string."""
    reverse_complemented = Seq(seq).reverse_complement()
    return str(reverse_complemented)
    def parseline(self, line):
        """Parse a line from the Rebase emboss_e.xxx file.

        ``line`` is a list of fields: element 0 is the enzyme name, element
        1 the recognition site (upper-cased here), elements 2 to 8 are
        converted to ``int`` (2 is the site size, 5 to 8 are the cut
        positions of the first and second cut on the sense and antisense
        strands) and any further elements are kept unchanged.

        Returns the rebuilt list with the cut positions in elements 5 to 8
        adjusted and five values appended, in this order: a palindrome
        flag, the site frequency, the regular expression for the site,
        ``ovhg1`` and the overhang sequence. ``ovhg1`` is the integer
        difference of the first cut positions, except when both cuts fall
        inside the site: then it is the overhang text itself, or the signed
        length of that text if it contains an ambiguous base.

        Raises OverhangError when the second cut has a non-zero overhang
        that differs from the first, and ValueError for a cut layout the
        code considers impossible.
        """
        line = [line[0]] + [line[1].upper()] + [int(i) for i in line[2:9]] + line[9:]
        # Hyphens and dots are replaced so the name can be used as a regex
        # group name below.
        name = line[0].replace("-", "_").replace(".", "_")
        site = line[1]  # sequence of the recognition site
        dna = Seq(site)
        size = line[2]  # size of the recognition site
        #
        #   Calculate the overhang.
        #
        fst5 = line[5]  # first site sense strand
        fst3 = line[6]  # first site antisense strand
        scd5 = line[7]  # second site sense strand
        scd3 = line[8]  # second site antisense strand

        #
        #   the overhang is the difference between the two cuts
        #
        ovhg1 = fst5 - fst3
        ovhg2 = scd5 - scd3

        #
        #   0 has the meaning 'do not cut' in rebase. So we get short of 1
        #   for the negative numbers so we add 1 to negative sites for now.
        #   We will deal with the record later.
        #   (Only the local copies change here; line[5] to line[8] are
        #   adjusted further down.)
        #

        if fst5 < 0:
            fst5 += 1
        if fst3 < 0:
            fst3 += 1
        if scd5 < 0:
            scd5 += 1
        if scd3 < 0:
            scd3 += 1

        if ovhg2 != 0 and ovhg1 != ovhg2:
            #
            #   different length of the overhang of the first and second cut
            #   it's a pain to deal with and at the moment it concerns only
            #   one enzyme which is not commercially available (HaeIV).
            #   So we don't deal with it but we check the progression
            #   of the affair.
            #   Should HaeIV become commercially available or other similar
            #   new enzymes be added, this might be modified.
            #
            print(
                "\nWARNING : %s cut twice with different overhang length each time."
                "\n\tUnable to deal with this behaviour. "
                "\n\tThis enzyme will not be included in the database. Sorry." % name
            )
            print("\tChecking...")
            raise OverhangError
        if 0 <= fst5 <= size and 0 <= fst3 <= size:
            #
            # cut inside recognition site
            #
            if fst5 < fst3:
                #
                #  5' overhang
                #
                ovhg1 = ovhgseq = site[fst5:fst3]
            elif fst5 > fst3:
                #
                #  3' overhang
                #
                ovhg1 = ovhgseq = site[fst3:fst5]
            else:
                #
                #  blunt
                #
                ovhg1 = ovhgseq = ""
            # If the overhang text holds any ambiguity letter, ovhg1 is
            # replaced by its signed length; otherwise it stays the text.
            for base in "NRYWMSKHDBV":
                if base in ovhg1:
                    #
                    #   site and overhang degenerated
                    #
                    ovhgseq = ovhg1
                    if fst5 < fst3:
                        ovhg1 = -len(ovhg1)
                    else:
                        ovhg1 = len(ovhg1)
                    break
                else:
                    continue
        elif 0 <= fst5 <= size:
            #
            #   5' cut inside the site 3' outside
            #
            if fst5 < fst3:
                #
                #   3' cut after the site
                #
                ovhgseq = site[fst5:] + (fst3 - size) * "N"
            elif fst5 > fst3:
                #
                #   3' cut before the site
                #
                ovhgseq = abs(fst3) * "N" + site[:fst5]
            else:
                #
                #   blunt outside
                #
                ovhg1 = ovhgseq = ""
        elif 0 <= fst3 <= size:
            #
            #   3' cut inside the site, 5' outside
            #
            if fst5 < fst3:
                #
                #   5' cut before the site
                #
                ovhgseq = abs(fst5) * "N" + site[:fst3]
            elif fst5 > fst3:
                #
                #   5' cut after the site
                #
                ovhgseq = site[fst3:] + (fst5 - size) * "N"
            else:
                #
                #   should not happen
                #
                raise ValueError("Error in #1")
        elif fst3 < 0 and size < fst5:
            #
            #   3' overhang. site is included.
            #
            ovhgseq = abs(fst3) * "N" + site + (fst5 - size) * "N"
        elif fst5 < 0 and size < fst3:
            #
            #   5' overhang. site is included.
            #
            ovhgseq = abs(fst5) * "N" + site + (fst3 - size) * "N"
        else:
            #
            #   5' and  3' outside of the site
            #
            ovhgseq = "N" * abs(ovhg1)
        #
        #   Now line[5] to [8] are the location of the cut but we have to
        #   deal with the weird mathematics of biologists.
        #
        #   EMBOSS sequence numbering gives:
        #                 DNA = 'a c g t A C G T'
        #                             -1 1 2 3 4
        #
        #   Biologists do not know about 0. Too much use of latin certainly.
        #
        #   To compensate, we add 1 to the positions if they are negative.
        #   No need to modify 0 as it means no cut and will not be used.
        #   Positive numbers should be ok since our sequence starts at 1.
        #
        #   Moreover line[6] and line[8] represent cuts on the reverse strand.
        #   They will be used for non palindromic sites and sre.finditer
        #   will detect the site in inverse orientation so we need to add the
        #   length of the site to compensate (+1 if they are negative).
        #
        for x in (5, 7):
            if line[x] < 0:
                line[x] += 1
        for x in (6, 8):
            if line[x] > 0:
                line[x] -= size
            elif line[x] < 0:
                line[x] = line[x] - size + 1
        #
        #   now is the site palindromic?
        #   produce the regular expression which corresponds to the site.
        #   tag of the regex will be the name of the enzyme for palindromic
        #   enzymes and two tags for the others, the name for the sense
        #   sequence and the name with '_as' at the end for the antisense
        #   sequence.
        #
        rg = ""
        if is_palindrome(dna):
            line.append(True)
            rg = "".join(["(?=(?P<", name, ">", regex(site.upper()), "))"])
        else:
            line.append(False)
            sense = "".join(["(?=(?P<", name, ">", regex(site.upper()), "))"])
            antisense = "".join(
                ["(?=(?P<", name, "_as>", regex(dna.reverse_complement()), "))"]
            )
            rg = sense + "|" + antisense
        #
        #   exact frequency of the site. (ie freq(N) == 1, ...)
        #
        freq = 1
        for base in site.upper():
            freq *= 4.0 / len(amb_dna[base])
        line.append(freq)
        #
        #   append regex and ovhg1, they have not been appended before not to
        #   break the factory class. simply too lazy to make the changes there.
        #
        line.append(rg)
        line.append(ovhg1)
        line.append(ovhgseq)
        return line
Esempio n. 39
0
def _mean_per_sample(values, sample_ids, column):
    """Average ``values`` within each sample id and return the means as an array."""
    df_temp = pd.DataFrame()
    df_temp[column] = values
    df_temp['sample_id'] = sample_ids
    df_temp = df_temp.groupby(['sample_id']).agg({column: 'mean'})
    return np.asarray(df_temp[column].tolist())


def Diff_Features(features, indices, sequences, type, sample_id,
                  p_val_threshold, idx_pos, idx_neg, directory_results, group,
                  kernel, sample_avg, top_seq):
    """Find features enriched in the positive group and write their motifs.

    Each column of ``features`` is compared between the rows selected by
    ``idx_pos`` and ``idx_neg`` with a Mann-Whitney U test. Features whose
    p-value is below ``p_val_threshold`` and whose positive mean exceeds the
    negative mean are kept. For every kept feature the ``top_seq`` sequences
    with the highest feature value are selected, a ``kernel``-long window
    starting at the matching entry of ``indices`` is cut from each one
    (padded with 'X' and lower-cased) and the windows are written to
    ``feature_<n>.fasta``.

    Args:
        features: 2-D array indexed as ``features[row, feature]``.
        indices: array indexed as ``indices[row, feature]`` giving the start
            offset of the window inside the matching sequence.
        sequences: array of sequence strings, indexable by an index array.
        type: label used only to build the output directory name.
        sample_id: array of sample ids aligned with the rows of ``features``;
            only read when ``sample_avg`` is not ``False``.
        p_val_threshold: features with a p-value at or above this are dropped.
        idx_pos: row selector for the positive group.
        idx_neg: row selector for the negative group.
        directory_results: parent directory of the motif directory.
        group: prefix of the motif directory name.
        kernel: window (motif) length.
        sample_avg: when not ``False``, values are averaged per sample id
            before the means and the test are computed.
        top_seq: number of top-scoring sequences kept per feature.

    Returns:
        A DataFrame with one column per kept feature holding the selected
        sequences.

    Side effects:
        Creates ``<directory_results>/<group>_<type>_Motifs`` if needed,
        deletes every entry already in it, then writes the FASTA files.
    """
    pos_mean = []
    neg_mean = []
    p_val = []
    feature_num = list(range(len(features.T)))
    for i in feature_num:
        if sample_avg is False:
            pos = features[idx_pos, i]
            neg = features[idx_neg, i]
        else:
            pos = _mean_per_sample(features[idx_pos, i], sample_id[idx_pos],
                                   'pos')
            neg = _mean_per_sample(features[idx_neg, i], sample_id[idx_neg],
                                   'neg')

        pos_mean.append(np.mean(pos))
        neg_mean.append(np.mean(neg))
        try:
            _, p = mannwhitneyu(pos, neg)
        except Exception:
            # Best effort: a feature the test cannot handle is treated as
            # not significant. Unlike a bare ``except``, this lets
            # KeyboardInterrupt and SystemExit propagate.
            p = 1.0
        p_val.append(p)

    df_features = pd.DataFrame()
    df_features['Feature'] = feature_num
    df_features['P_Val'] = p_val
    df_features['Pos'] = pos_mean
    df_features['Neg'] = neg_mean
    df_features['Mag'] = df_features['Pos'] - df_features['Neg']

    significant = df_features['P_Val'] < p_val_threshold
    df_features = df_features[significant].sort_values(by='Mag',
                                                       ascending=False)

    # Start from an empty motif directory for this group/type.
    motif_dir = os.path.join(directory_results, group + '_' + type + '_Motifs')
    os.makedirs(motif_dir, exist_ok=True)
    for stale_name in os.listdir(motif_dir):
        os.remove(os.path.join(motif_dir, stale_name))

    # Get motifs for positive features
    seq_cluster = []
    feature_keep = []
    ranked = zip(df_features['Feature'].tolist(), df_features['Mag'].tolist())
    for feature, magnitude in ranked:
        if not magnitude > 0:
            continue
        feature_keep.append(feature)
        # Rows ordered by decreasing feature value; keep the best top_seq.
        sel = np.flip(features[:, feature].argsort(), -1)[0:top_seq]
        seq_sel = sequences[sel]
        ind_sel = indices[sel, feature]
        seq_cluster.append(seq_sel)

        motifs = []
        for ii, start in enumerate(ind_sel):
            start = int(start)
            motif = seq_sel[ii][start:start + kernel]
            if len(motif) < kernel:
                # Window ran past the end of the sequence: pad it.
                motif = motif + 'X' * (kernel - len(motif))
            motif = motif.lower()
            motifs.append(SeqRecord(Seq(motif, IUPAC.protein), str(ii)))

        SeqIO.write(
            motifs,
            os.path.join(motif_dir, 'feature_' + str(feature) + '.fasta'),
            'fasta')

    seq_features_df_pos = pd.DataFrame()
    for kept_feature, cluster in zip(feature_keep, seq_cluster):
        seq_features_df_pos[kept_feature] = cluster

    return seq_features_df_pos
    def information_mixer(self, file1, file2, file3):
        """Combine extracted data from the three emboss_x.xxx files.

        ``file1`` supplies the methylation/supplier blocks, ``file2`` the
        site lines and ``file3`` the supplier list; each is first passed
        through ``self.removestart``. Every site line is extended with the
        methylation and supplier fields of the matching block, parsed with
        ``self.parseline`` and stored in the module-level ``enzymedict``
        under the sanitised enzyme name. The supplier lines are stored in
        the module-level ``suppliersdict``.

        Enzymes for which ``parseline`` raises OverhangError are reported
        and left out. Raises TypeError when a site line matches neither the
        current nor the previous block.
        """
        #
        #   Mix all the information from the 3 files and produce a coherent
        #   restriction record.
        #
        methfile = self.removestart(file1)
        sitefile = self.removestart(file2)
        supplier = self.removestart(file3)

        i1, i2 = 0, 0
        oldblock = None
        try:
            # The loop has no explicit end test: it stops when an
            # IndexError is raised inside it, which the handler below
            # swallows.
            while True:
                block, i1 = self.getblock(methfile, i1)
                bl = self.get(block)
                line = (sitefile[i2].strip()).split()
                name = line[0]
                if name == bl[0]:
                    line.append(bl[1])  # -> methylation
                    line.append(bl[2])  # -> suppliers
                else:
                    # The site line may belong to the previous block.
                    bl = self.get(oldblock)
                    if line[0] == bl[0]:
                        line.append(bl[1])
                        line.append(bl[2])
                        i2 += 1
                    else:
                        raise TypeError
                oldblock = block
                i2 += 1
                try:
                    line = self.parseline(line)
                except OverhangError:  # overhang error
                    n = name  # do not include the enzyme
                    if not bl[2]:
                        print(f"Anyway, {n} is not commercially available.\n")
                    else:
                        print(f"Unfortunately, {n} is commercially available.\n")

                    continue
                # Hyphens and dots can't be used as a Python name, nor as a
                # group name in a regular expression. e.g. 'CviKI-1',
                # 'R2.BceSIV'
                name = name.replace("-", "_").replace(".", "_")
                if name in enzymedict:
                    #
                    #   deal with TaqII and its two sites.
                    #
                    print(f"\nWARNING : {name} has two different sites.\n")
                    other = line[0].replace("-", "_").replace(".", "_")
                    dna = Seq(line[1])
                    sense1 = regex(dna)
                    antisense1 = regex(str(dna.reverse_complement()))
                    dna = Seq(enzymedict[other][0])
                    sense2 = regex(dna)
                    antisense2 = regex(dna.reverse_complement())
                    # Both alternatives must sit inside the named group, as
                    # they do for the antisense pattern; otherwise a match
                    # on the second site leaves the group unset.
                    sense = f"(?=(?P<{other}>{sense1}|{sense2}))"
                    antisense = f"(?=(?P<{other}_as>{antisense1}|{antisense2}))"
                    reg = sense + "|" + antisense
                    line[1] = line[1] + "|" + enzymedict[other][0]
                    # NOTE(review): this overwrites the LAST element of the
                    # list returned by self.parseline; confirm that the
                    # regex is the element stored there.
                    line[-1] = reg
                #
                #   the data to produce the enzyme class are then stored in
                #   enzymedict.
                #
                enzymedict[name] = line[1:]  # element zero was the name
        except IndexError:
            pass
        for i in supplier:
            #
            #   construction of the list of suppliers.
            #
            t = i.strip().split(" ", 1)
            suppliersdict[t[0]] = (t[1], [])
Esempio n. 41
0
def write_out_informative_fasta(compress_seq, alignment, stripFile=None):
    """Write the informative alignment columns to ``informative_sites.fasta``.

    ``compress_seq`` must provide 'sequences' (per-sequence mapping of
    position to base), 'reference' (base per position, used where a
    sequence has no entry) and 'positions' (the positions to examine).
    Positions returned by ``load_mask_sites(stripFile)`` are ignored. A
    column is kept when, after dropping '-' and 'N', it holds at least two
    distinct bases and is not just two bases of which one occurs once.

    The file is written next to ``alignment`` and its path is returned.
    """
    from Bio import SeqIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    sequences = compress_seq['sequences']
    ref = compress_seq['reference']
    positions = compress_seq['positions']

    # Sites to leave out of the initial tree build, if a file was given.
    masked_sites = load_mask_sites(stripFile) if stripFile else []

    names = list(sequences.keys())

    # When True, a file mapping FASTA column to real position is written too.
    printPositionMap = False
    informative_columns = []
    position_map = []

    for site in positions:
        if site in masked_sites:
            continue
        column = []
        for name in sequences.keys():
            # A sequence without an entry for this site carries the
            # reference base.
            try:
                column.append(sequences[name][site])
            except KeyError:
                column.append(ref[site])
        # Gaps and Ns do not count towards informativeness.
        called = [base for base in column if base != '-' and base != 'N']
        distinct, counts = np.unique(called, return_counts=True)
        n_distinct = len(distinct)
        lone_variant = n_distinct == 2 and min(counts) == 1
        if n_distinct > 1 and not lone_variant:
            informative_columns.append(list(column))
            fasta_column = str(len(position_map) + 1)
            position_map.append("\t".join([fasta_column, str(site)]))

    # Rotating the site-by-sequence table gives one row per sequence, with
    # the sequences in reverse order, hence the reversed name list.
    rotated = np.rot90(np.asarray(informative_columns))
    reversed_names = list(reversed(names))
    records = []
    for row_number in range(len(sequences.keys())):
        records.append(
            SeqRecord(id=reversed_names[row_number],
                      seq=Seq("".join(rotated[row_number])),
                      description=''))

    fasta_file = os.path.join(os.path.dirname(alignment),
                              'informative_sites.fasta')

    # Output as FASTA to read into raxml or iqtree.
    SeqIO.write(records, fasta_file, 'fasta')

    if printPositionMap:
        with open(fasta_file + ".positions.txt", 'w',
                  encoding='utf-8') as the_file:
            the_file.write("\n".join(position_map))

    return fasta_file
Esempio n. 42
0
        # Process only records in which 'N' makes up at most 1 percent of
        # the sequence.
        if (float(str(record.seq).count('N')) / float(len(str(record.seq))) *
                100) <= 1:
            # Only sequences longer than 2000 bp are chopped into blocks.
            if len(str(record.seq)) > 2000:
                # The block length (about 1000 bp) is the sequence length
                # divided by the rounded number of 1000 bp units it holds:
                # int(round(len(seq) / round(len(seq) / 1000))).
                # `blocks` is defined outside this view; presumably it
                # yields consecutive chunks of that length - TODO confirm.
                for pos, block in enumerate(
                        blocks(
                            str(record.seq),
                            int(
                                round(
                                    len(str(record.seq)) /
                                    round(len(str(record.seq)) / 1000))))):
                    # Save only blocks that are at least 800 bp long.
                    if len(block) >= 800:
                        # NOTE(review): the id below reads block_record.id
                        # inside the statement that assigns block_record.
                        # Unless block_record is bound before this view,
                        # the first block raises NameError, and later
                        # blocks extend the previous block's id. record.id
                        # may have been intended - confirm.
                        block_record = SeqRecord(
                            Seq(block, record.seq.alphabet),
                            id="%s_%i" % (block_record.id, pos),
                            name=record.name,
                            description="forward")
                        # Build the reverse-complement record for the same
                        # block; its id appends "_<pos>_rev" to the forward
                        # record's id.
                        reverse_record = SeqRecord(
                            Seq(str(block_record.seq.reverse_complement()),
                                block_record.seq.alphabet),
                            id="%s_%i_rev" % (block_record.id, pos),
                            description="reverse")

                        # NOTE(review): each call opens outfile in append
                        # mode and never closes the handle explicitly.
                        SeqIO.write(block_record, open(outfile, 'a'), "fasta")
                        SeqIO.write(reverse_record, open(outfile, 'a'),
                                    "fasta")
            else:
                # If the length is at most 2000 bp, just save both strands.
Esempio n. 43
0
        # "+" strand: the neighbours are the (up to) three genes that
        # follow gene i in its chromosome's gene list.
        if gene2strand[i] == "+":
            chromosome = gene2chromosome[i]
            genes = chromosome2gene[chromosome]
            i_index = genes.index(i)
            neighbour_index = i_index + 1
            downstream_neighbours = genes[neighbour_index:neighbour_index + 3]
            LOG.write(i + ":{0}\n".format(str(downstream_neighbours)))
            for j in downstream_neighbours:
                neighbour_id2neighbour_seq[j] = gene2seq[j]
        else:
            # Any other strand value: the neighbours are the (up to) three
            # genes that precede gene i in the list.
            chromosome = gene2chromosome[i]
            genes = chromosome2gene[chromosome]
            i_index = genes.index(i)
            neighbour_index = i_index - 3
            if neighbour_index >= 0:
                downstream_neighbours = genes[neighbour_index:i_index]
            else:
                # Fewer than three genes precede i: take all of them.
                downstream_neighbours = genes[0:i_index]
            LOG.write(i + ":{0}\n".format(str(downstream_neighbours)))
            for j in downstream_neighbours:
                neighbour_id2neighbour_seq[j] = gene2seq[j]
    LOG.close()

    # Translate every collected neighbour sequence and write the proteins
    # as FASTA, one record per neighbour id.
    # NOTE(review): OUT is opened and closed by hand, so it stays open if
    # translate() or write() raises.
    neighbour_ids = neighbour_id2neighbour_seq.keys()
    OUT = open("./data/neighbours.fa", "w")
    for i in neighbour_ids:
        coding_dna = Seq(neighbour_id2neighbour_seq[i], IUPAC.unambiguous_dna)
        # Every "*" in the translation is removed before writing.
        protein = str(coding_dna.translate()).replace("*", "")
        OUT.write(">" + i + "\n" + protein + "\n")
    OUT.close()
Esempio n. 44
0
#Intro to Binf Armory
#Josh Rudolph
#9/26/17

from Bio.Seq import Seq

# Read the DNA string from the first line of the Rosalind input file.
with open("rosalind_ini.txt", 'r') as f:
    sequence = Seq(f.readline().rstrip('\n'))

# Print the A, C, G and T counts on one space-separated line. A single
# pre-joined string is passed to print() so the output is the same whether
# print is a statement (Python 2) or a function (Python 3).
print(" ".join(str(sequence.count(base)) for base in "ACGT"))
Esempio n. 45
0
    def __next__(self):
        """Parse the next alignment from the handle.

        Lines are read from ``self.handle`` until a line starting with "="
        or the end of the file. Lines starting with ">" are headers that
        name a sequence and give its start, end and strand; every other
        line is appended to the most recent header's sequence.

        Returns a MultipleSeqAlignment with one record per id in
        ``self._ids`` that has a header in this block; ids without sequence
        text in this block get an all-gap sequence. ``self.ids`` and
        ``self.sequences`` are updated as a side effect.

        Raises:
            StopIteration: the handle is exhausted, or no ids/sequences
                were collected.
            ValueError: a header line matches neither header pattern,
                sequence text appears before any header, or the sequences
                of the block differ in length.
        """
        handle = self.handle
        line = handle.readline()

        if not line:
            raise StopIteration

        # Strip out header comments
        while line and line.strip().startswith("#"):
            line = handle.readline()

        seqs = {}
        seq_regions = {}

        latest_id = None
        # An empty string from readline() means end of file.
        while line:
            line = line.strip()

            if line.startswith("="):
                # There may be more data, but we've reached the end of this
                # alignment
                break
            elif line.startswith(">"):
                m = (XMFA_HEADER_REGEX_BIOPYTHON.match(line)
                     or XMFA_HEADER_REGEX.match(line))
                if not m:
                    raise ValueError("Malformed header line: %s" % line)

                parsed_id = m.group("id")
                parsed_data = {}
                for key in ("start", "end", "id", "strand", "name",
                            "realname"):
                    try:
                        value = m.group(key)
                    except IndexError:
                        # This will occur if we're asking for a group that
                        # doesn't exist. It's fine.
                        continue
                    if key == "start":
                        value = int(value)
                        # Convert to zero based counting
                        if value > 0:
                            value -= 1
                    elif key == "end":
                        value = int(value)
                    parsed_data[key] = value
                seq_regions[parsed_id] = parsed_data

                if parsed_id not in self._ids:
                    self._ids.append(parsed_id)

                seqs.setdefault(parsed_id, "")
                latest_id = parsed_id
            else:
                if latest_id is None:
                    raise ValueError("Saw sequence before definition line")
                seqs[latest_id] += line
            line = handle.readline()

        assert len(seqs) <= len(self._ids)

        self.ids = self._ids
        self.sequences = seqs

        if not (self._ids and seqs):
            raise StopIteration

        alignment_length = max(map(len, seqs.values()))
        records = []
        for seq_id in self._ids:
            # An id with no (or empty) sequence text in this block is
            # represented by gaps.
            seq = seqs.get(seq_id) or "-" * alignment_length

            if alignment_length != len(seq):
                raise ValueError(
                    "Sequences have different lengths, or repeated identifier"
                )

            # Sometimes we don't see a particular sequence in the
            # alignment, so we skip that record since it isn't present in
            # that LCB/alignment
            if seq_id not in seq_regions:
                continue
            region = seq_regions[seq_id]

            if "realname" in region:
                corrected_id = region["realname"]
            else:
                corrected_id = region["name"]
            if region["start"] != 0 or region["end"] != 0:
                # Append the coordinate suffix unless the name already
                # contains it.
                suffix = "/{start}-{end}".format(**region)
                if corrected_id.count(suffix) == 0:
                    corrected_id += suffix

            record = SeqRecord(Seq(seq), id=corrected_id, name=seq_id)

            record.annotations["start"] = region["start"]
            record.annotations["end"] = region["end"]
            record.annotations["strand"] = 1 if region["strand"] == "+" else -1

            records.append(record)
        return MultipleSeqAlignment(records)
Esempio n. 46
0
def simulate_read(record, ErrorModel, i, cpu_number):
    """Form a read pair from one genome (or sequence) according to an
    ErrorModel

    Each read is a SeqRecord object
    returns a tuple containing the forward and reverse read.

    Args:
        record (SeqRecord): sequence or genome of reference
        ErrorModel (ErrorModel): an ErrorModel class
        i (int): a number identifying the read
        cpu_number (int): cpu number. Is added to the read id.

    Returns:
        tuple: tuple containing a forward read and a reverse read

    Raises:
        AssertionError: if the reference is not longer than the read length
            of the ErrorModel.
    """
    logger = logging.getLogger(__name__)
    sequence = record.seq
    header = record.id
    reference_length = len(sequence)

    read_length = ErrorModel.read_length
    insert_size = ErrorModel.random_insert_size()

    # Explicit check instead of an `assert` statement so the validation still
    # runs under `python -O`. AssertionError is kept as the exception type
    # because that is what this function has always raised here.
    if read_length >= reference_length:
        raise AssertionError(
            '%s is not longer than the read length (%s)' %
            (header, read_length))

    # generate the forward read
    # Ideally the reference is longer than 2 * read_length + insert_size, so
    # that the whole template fits.
    try:
        forward_start = random.randrange(
            0,
            reference_length - (2 * read_length + insert_size))
    except ValueError as e:
        # randrange refuses an empty range: the template does not fit, so
        # only require that the forward read itself fits.
        logger.debug('%s shorter than template length for this ErrorModel:%s' %
                     (record.id, e))
        forward_start = random.randrange(0, reference_length - read_length)

    forward_end = forward_start + read_length
    bounds = (forward_start, forward_end)
    # create a perfect read
    forward = SeqRecord(Seq(str(sequence[forward_start:forward_end]),
                            IUPAC.unambiguous_dna),
                        id='%s_%s_%s/1' % (header, i, cpu_number),
                        description='')
    # add the indels, the qual scores and modify the record accordingly
    forward.seq = ErrorModel.introduce_indels(forward, 'forward', sequence,
                                              bounds)
    forward = ErrorModel.introduce_error_scores(forward, 'forward')
    forward.seq = ErrorModel.mut_sequence(forward, 'forward')

    # generate the reverse read
    reverse_start = forward_end + insert_size
    reverse_end = reverse_start + read_length
    if reverse_end >= reference_length:
        # we use random insert when the modelled template length distribution
        # is too large
        reverse_end = random.randrange(read_length, reference_length)
        reverse_start = reverse_end - read_length
    bounds = (reverse_start, reverse_end)
    # create a perfect read
    reverse = SeqRecord(Seq(rev_comp(str(sequence[reverse_start:reverse_end])),
                            IUPAC.unambiguous_dna),
                        id='%s_%s_%s/2' % (header, i, cpu_number),
                        description='')
    # add the indels, the qual scores and modify the record accordingly
    reverse.seq = ErrorModel.introduce_indels(reverse, 'reverse', sequence,
                                              bounds)
    reverse = ErrorModel.introduce_error_scores(reverse, 'reverse')
    reverse.seq = ErrorModel.mut_sequence(reverse, 'reverse')

    return (forward, reverse)
Esempio n. 47
0
def export_aa_record(gene_seq, gene_id, gene_description, output_handle):
    """Write a single protein sequence to ``output_handle`` in FASTA format.

    ``gene_seq`` is wrapped in a protein-alphabet ``Seq``; ``gene_id`` and
    ``gene_description`` become the id and description of the record.
    """
    protein_record = SeqRecord(Seq(gene_seq, IUPAC.protein))
    # Set as attributes (not constructor arguments), exactly as before.
    protein_record.id = gene_id
    protein_record.description = gene_description
    SeqIO.write(protein_record, output_handle, 'fasta')
Esempio n. 48
0
# Script: count the bases of a hard-coded DNA string, plot the counts together
# with the GC fraction, then split the sequence and its transcript into
# 3-letter chunks and translate each chunk.
# NOTE(review): this snippet uses Python 2 print statements.
#dpl=DNARenderer();
inpt='GACTCGGGGTGCCCTTCTGCGTGAAGGCTGAGAAATACCCGTATCACCTGATCTGGATAATGCCAGCGTAGGGAAGTT';
inpt=inpt.upper();
# Per-base occurrence counts.
Acnt,Gcnt,Tcnt,Ccnt=0,0,0,0;
Acnt=inpt.count('A');
Gcnt=inpt.count('G');
Tcnt=inpt.count('T');
Ccnt=inpt.count('C');
# GC fraction = (G + C) / (A + G + T + C).
# NOTE(review): all operands are ints; under Python 2 without
# `from __future__ import division` this is floor division - confirm.
GCPcnt=((Gcnt+Ccnt)/(Acnt+Gcnt+Tcnt+Ccnt));
print GCPcnt
# Bar plot of the four base counts plus the GC fraction.
x=['A','G','T','C','GCCount'];
y=[Acnt,Gcnt,Tcnt,Ccnt,GCPcnt];
sns.axes_style('white')
sns.set_style('white')
ax = sns.barplot(x, y)
mysq=Seq(inpt,IUPAC.unambiguous_dna);
print mysq.alphabet;
# Consecutive 3-letter slices of the DNA sequence (the last may be shorter).
mysq1=[mysq[i:i+3] for i in range(0, len(mysq), 3)]
mRna=mysq.transcribe();
print mRna;
# Same 3-letter slicing applied to the transcribed sequence.
mRna=[mRna[i:i+3] for i in range(0, len(mRna), 3)]
# Translate every slice on its own and collect the results.
# NOTE(review): finDna and finStr are not defined in the visible part of this
# script - presumably initialised elsewhere; verify.
for i in mRna:
	finDna.append(i.translate());
	finStr+=str(i.translate())
print "the number of codons is %s" %(str(len(mysq1)));
'''
print finDna;
print standard_table;
print mito_table
#finDna=mRna.translate();
#print finDna;
Esempio n. 49
0
 def motif_hit(self, line):
     """Add the first tab-separated field of ``line`` to the current motif.

     The field is wrapped in an unambiguous-DNA ``Seq`` and handed to
     ``self.current_motif.add_instance``.
     """
     first_field = line.partition("\t")[0]
     instance = Seq(first_field, IUPAC.unambiguous_dna)
     self.current_motif.add_instance(instance)
Esempio n. 50
0
def prodigal_parser(seq_file, sco_file, prefix, output_folder):
    """Turn Prodigal gene coordinates into .ffn, .faa and .gbk files.

    Reads the nucleotide sequences from ``seq_file`` (FASTA) and the gene
    coordinates from ``sco_file`` (Prodigal "sco" output), then writes, into
    ``output_folder``:

    * ``<prefix>.ffn`` - nucleotide sequence of every predicted gene,
    * ``<prefix>.faa`` - translation of every predicted gene,
    * ``<prefix>.gbk`` - one GenBank record per input sequence with a CDS
      feature for every gene.

    Genes are numbered consecutively across all sequences and get the locus
    tag ``<prefix>_NNNNN``.

    Args:
        seq_file: path to the FASTA file that was given to Prodigal.
        sco_file: path to the matching Prodigal sco file.
        prefix: basename of the output files and prefix of the locus tags.
        output_folder: existing folder the three files are written to.
    """
    pwd_bin_ffn_file = '%s/%s.ffn' % (output_folder, prefix)
    pwd_bin_faa_file = '%s/%s.faa' % (output_folder, prefix)
    pwd_bin_gbk_file = '%s/%s.gbk' % (output_folder, prefix)

    # get sequence id list
    id_to_sequence_dict = {}
    sequence_id_list = []
    for each_seq in SeqIO.parse(seq_file, 'fasta'):
        id_to_sequence_dict[each_seq.id] = str(each_seq.seq)
        sequence_id_list.append(each_seq.id)

    # get sequence to cds dict and sequence to transl_table dict
    current_seq_id = ''
    current_transl_table = ''
    current_seq_csd_list = []
    seq_to_cds_dict = {}
    seq_to_transl_table_dict = {}
    with open(sco_file) as sco_handle:
        for each_line in sco_handle:
            each_cds = each_line.strip()

            # a blank line carries no coordinates; skipping it avoids a
            # ValueError when the coordinates are converted to int below
            if not each_cds:
                continue

            if each_cds.startswith('# Sequence Data'):

                # add the previous sequence to the dicts
                if current_seq_id != '':
                    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
                    seq_to_transl_table_dict[
                        current_seq_id] = current_transl_table

                # reset value; the id is the first word of the quoted header
                # that follows the last '='
                current_seq_id = each_cds.split('=')[-1][1:-1].split(' ')[0]
                current_transl_table = ''
                current_seq_csd_list = []

            elif each_cds.startswith('# Model Data'):
                current_transl_table = each_cds.split(';')[-2].split('=')[-1]

            else:
                # drop the leading gene number, keep "start_end_strand"
                current_seq_csd_list.append('_'.join(each_cds.split('_')[1:]))

    seq_to_cds_dict[current_seq_id] = current_seq_csd_list
    seq_to_transl_table_dict[current_seq_id] = current_transl_table

    gene_index = 1
    # context managers close all three outputs even if a record fails
    with open(pwd_bin_gbk_file, 'w') as bin_gbk_file_handle, \
            open(pwd_bin_ffn_file, 'w') as bin_ffn_file_handle, \
            open(pwd_bin_faa_file, 'w') as bin_faa_file_handle:
        for seq_id in sequence_id_list:

            # create SeqRecord
            current_sequence = Seq(id_to_sequence_dict[seq_id], generic_dna)
            current_SeqRecord = SeqRecord(current_sequence, id=seq_id)

            # a sequence missing from the sco file simply has no features
            transl_table = seq_to_transl_table_dict.get(seq_id, '')

            # add SeqFeature to SeqRecord
            for cds in seq_to_cds_dict.get(seq_id, []):

                # define locus_tag id
                locus_tag_id = '%s_%s' % (prefix, "{:0>5}".format(gene_index))

                # coordinates in the sco file are 1-based and inclusive
                cds_split = cds.split('_')
                cds_start = int(cds_split[0])
                cds_end = int(cds_split[1])
                cds_strand = cds_split[2]
                current_strand = None
                if cds_strand == '+':
                    current_strand = 1
                if cds_strand == '-':
                    current_strand = -1

                # FeatureLocation is 0-based and end-exclusive, hence the
                # "- 1" on the start (same slice as used for the sequence)
                current_feature_location = FeatureLocation(
                    SF.ExactPosition(cds_start - 1),
                    SF.ExactPosition(cds_end),
                    strand=current_strand)

                # get nc sequence
                gene_slice = id_to_sequence_dict[seq_id][cds_start - 1:cds_end]
                sequence_nc = ''
                if cds_strand == '+':
                    sequence_nc = gene_slice
                if cds_strand == '-':
                    sequence_nc = str(
                        Seq(gene_slice, generic_dna).reverse_complement())

                # translate to aa sequence
                sequence_aa = str(
                    Seq(sequence_nc).translate(table=transl_table))

                # remove * at the end; a gene without a stop codon keeps its
                # last amino acid
                if sequence_aa.endswith('*'):
                    sequence_aa = sequence_aa[:-1]

                # export nc and aa sequences
                export_dna_record(sequence_nc, locus_tag_id, '',
                                  bin_ffn_file_handle)
                export_aa_record(sequence_aa, locus_tag_id, '',
                                 bin_faa_file_handle)

                # Define feature qualifiers
                current_qualifiers_dict = {
                    'locus_tag': locus_tag_id,
                    'transl_table': transl_table,
                    'translation': sequence_aa,
                }

                # Create a SeqFeature and append it to the SeqRecord
                current_feature = SeqFeature(
                    current_feature_location,
                    type='CDS',
                    qualifiers=current_qualifiers_dict)
                current_SeqRecord.features.append(current_feature)
                gene_index += 1

            # export to gbk file
            SeqIO.write(current_SeqRecord, bin_gbk_file_handle, 'genbank')
Esempio n. 51
0
def get_structure_seqrecords(model):
    """Get the sequence of every chain of a PDB model as SeqRecords.

    Only standard amino acids contribute a letter. Special cases include:
        - Insertion codes. In the case of residue numbers like "15A", "15B", both residues are written out. Example: 9LPR
        - Gaps in the residue numbering (e.g. skipped non-standard residues or HETATMs). Written as "X", or unknown amino acid.

    Each record carries a ``structure_resnums`` letter annotation holding the
    residue number behind every letter.

    Args:
        model: Biopython Model object of a Structure

    Returns:
        list: List of SeqRecords, one per chain

    """

    records = []

    for chain in model:
        last_seen = 0
        letters = []
        resnums = []

        for res in chain.get_residues():
            # Anything that is not a standard residue (ie. selenomethionine)
            # is skipped here; the numbering gap it leaves is filled with X
            # when the next standard residue is reached.
            if not Polypeptide.is_aa(res, standard=True):
                continue

            res_num = res.id[1]
            res_icode = res.id[2]
            one_letter = Polypeptide.three_to_one(res.get_resname())

            if res_num != last_seen + 1:
                if res_icode != ' ':
                    # Residue with an insertion code: write it out as is.
                    letters.append(one_letter)
                    resnums.append(res_num)
                    last_seen = res_num + 1
                    continue

                # Pad the numbering gap (a non-positive gap adds nothing).
                gap = res_num - last_seen - 1
                letters.append('X' * gap)
                # Residue numbers for unresolved or nonstandard residues are Infinite
                resnums.extend([float("Inf")] * gap)

            letters.append(one_letter)
            resnums.append(res_num)
            last_seen = res_num

        record = SeqRecord(Seq(''.join(letters), IUPAC.protein),
                           id=chain.get_id())
        record.letter_annotations['structure_resnums'] = resnums
        records.append(record)

    return records
Esempio n. 52
0
def main():
    """Convert a two-column tabular file into trimmed FASTA output.

    The input is a tab-separated file without a header row; column 0 holds the
    FASTA headers and column 1 the sequences. Every sequence is trimmed to the
    region given by ``--start`` (1-based) and, depending on ``--program``,
    either ``--stop`` (program 1) or the end of the sequence (any other
    value). With ``--type 1`` all records go to the single file ``--output``;
    otherwise each record is written to ``<header>.fasta``.
    """
    ap = GooeyParser(
        description=
        "convert each row of a tabular file with the fasta headers and sequences in each row in single-fasta files or a multi-fasta file, with trimmed sequences"
    )
    ap.add_argument("-in",
                    "--input",
                    required=True,
                    widget='FileChooser',
                    help="input txt file")
    ap.add_argument("-start",
                    "--start",
                    required=False,
                    default=1,
                    type=int,
                    help="region to start writing the fasta file(default 1)")
    ap.add_argument(
        "-stop",
        "--stop",
        required=False,
        type=int,
        help=
        "region to stop writing the fasta file(it can be both a positive and  a negative number)"
    )
    ap.add_argument(
        "-pro",
        "--program",
        required=False,
        default=1,
        type=int,
        help=
        "program to choose 1) add both start and stop location 2) the stop location with be that of the sequence length. Default is 1"
    )
    ap.add_argument(
        "-type",
        "--type",
        required=False,
        default=1,
        type=int,
        help=
        "type of fasta to export 1) 1 multi-fasta file 2)  many single-fasta files. Default is 1"
    )
    # The output is a file path, so it must stay a string (no type=int).
    ap.add_argument("-out",
                    "--output",
                    required=False,
                    widget='FileSaver',
                    help="output multi-fasta file")
    args = vars(ap.parse_args())

    # main
    # the start position is the same for every record: validate it once and
    # convert it to a 0-based index
    if args['start'] <= 0:
        print("-start parameter must be a positive integer")
        raise SystemExit(1)
    seq_start = args['start'] - 1

    # create function to trim fasta records
    def fastatrim(fastaseq):
        # program 1 stops at --stop (None means "until the end"),
        # any other program always stops at the end of the sequence
        if args['program'] == 1:
            seq_end = args['stop']
        else:
            seq_end = len(fastaseq)
        # subset each fasta record
        return fastaseq[seq_start:seq_end]

    multi_fasta = args['type'] == 1
    if multi_fasta and not args['output']:
        print("-out parameter is required to export a multi-fasta file")
        raise SystemExit(1)

    df = pd.read_csv(args['input'], header=None, sep="\t")
    # select ids and sequence columns, convert to lists
    headers = df.iloc[:, 0].values.tolist()
    sequences = df.iloc[:, 1].values.tolist()
    # choose fasta type to export
    if multi_fasta:
        # collect every trimmed record, then write the file a single time
        seqs_for_fasta = [
            SeqRecord(Seq(fastatrim(str(seq))), id=str(ids), description="")
            for (ids, seq) in zip(headers, sequences)
        ]
        SeqIO.write(seqs_for_fasta, args['output'], "fasta")
    else:
        # iter elements on pairs to export in single fasta files
        for (ids, seq) in zip(headers, sequences):
            seq_for_fasta = SeqRecord(Seq(fastatrim(str(seq))),
                                      id=str(ids),
                                      description="")
            SeqIO.write(seq_for_fasta, "".join([str(ids), ".fasta"]), "fasta")
Esempio n. 53
0
def parseData(fasta_file,
              bed_file,
              output_dir,
              output_name,
              promoter_size=40,
              sliding_step=1,
              tetranucleotide_size=4,
              test_with_sample=False,
              test_sample_size=5000,
              tokenizer_path=None,
              data_output_type=None,
              save_to_disk=True):
    """Cut a genome into sliding-window sequences and encode them.

    The first record header of ``fasta_file`` gives the genome id; every
    following line is joined into one genome string. The genome is cut into
    windows of ``promoter_size`` bases, each window is split into overlapping
    k-mers of ``tetranucleotide_size`` bases, and the same is done for the
    reverse complement of each window. If the four intermediate ``.data``
    files already exist under ``output_dir/output_name`` they are loaded with
    joblib instead of being recomputed.

    Args:
        fasta_file: path of the genome FASTA file.
        bed_file: not used by this function.
        output_dir: base directory for cached and saved files.
        output_name: sub-directory of ``output_dir`` used for this run.
        promoter_size: window length in bases.
        sliding_step: distance between two consecutive window starts.
        tetranucleotide_size: k-mer length.
        test_with_sample: keep only the first ``test_sample_size`` windows.
        test_sample_size: number of windows kept when testing.
        tokenizer_path: pickled tokenizer, required for the "RNN" output.
        data_output_type: one of "RNN", "RF-HOT" or "RF-TETRA".
        save_to_disk: dump ``X`` and ``X_inv`` with joblib when true.

    Returns:
        tuple: ``(X, X_inv)``, the encoded windows and the encoded
        reverse-complemented windows.

    Raises:
        ValueError: if ``fasta_file`` is empty or ``data_output_type`` is not
            one of the supported values.
    """
    start_time = time.time()

    def elapsed():
        # Time since the call started, formatted as HOUR:MIN:SEC.
        return time.strftime("%H:%M:%S",
                             time.gmtime(time.time() - start_time))

    def split_into_kmers(sequences):
        # One object-array entry per sequence, holding its overlapping k-mers.
        kmers = np.empty(len(sequences), dtype=object)
        for i_s in progressbar.progressbar(range(len(sequences))):
            sequence = sequences[i_s]
            kmers[i_s] = np.array([
                sequence[i_t:i_t + tetranucleotide_size]
                for i_t in range(len(sequence) - (tetranucleotide_size - 1))
            ])
        return kmers

    with open(fasta_file, 'r') as fasta_handle:
        seq_arr = fasta_handle.readlines()
    if not seq_arr:
        raise ValueError("ERROR: EMPTY FASTA FILE {}".format(fasta_file))
    seq_info = seq_arr[0]
    seq_id = seq_info.split(" ")[0][1:]
    all_genome = "".join(seq_arr[1:]).replace("\n", "")
    logs_arr = list()
    # report the genome length, not the number of lines of the file
    logs_arr = h1(
        logs_arr,
        '{} - PROMOTER PREDICTION OF GENOME {} WITH LENGTH {} USING SLIDING WINDOW FROM {}'
        .format(output_name, seq_id, len(all_genome), fasta_file))

    prom_arr, inv_prom_arr, prom_tetra_arr, inv_prom_tetra_arr, prom_tetra_arr_str, inv_prom_tetra_arr_str = None, None, None, None, None, None

    prom_arr_path = "{}/{}/{}.data".format(output_dir, output_name,
                                           "40BP_SEQUENCES")
    inv_prom_arr_path = "{}/{}/{}.data".format(output_dir, output_name,
                                               "40BP_SEQUENCES_INV")
    prom_tetra_arr_path = "{}/{}/{}.data".format(output_dir, output_name,
                                                 "TETRA_NUCLEOTIDES")
    inv_prom_tetra_arr_path = "{}/{}/{}.data".format(output_dir, output_name,
                                                     "TETRA_NUCLEOTIDES_INV")

    if os.path.exists(prom_arr_path) and os.path.exists(
            inv_prom_arr_path) and os.path.exists(
                prom_tetra_arr_path) and os.path.exists(
                    inv_prom_tetra_arr_path):
        logs_arr = h1(
            logs_arr,
            "(1-3/4) LOADING EXISTING SEQUENCES AND TETRANUCLEOTIDE DATA")

        prom_arr = joblib.load(prom_arr_path)
        inv_prom_arr = joblib.load(inv_prom_arr_path)
        prom_tetra_arr = joblib.load(prom_tetra_arr_path)
        inv_prom_tetra_arr = joblib.load(inv_prom_tetra_arr_path)

        logs_arr = h1(
            logs_arr, """
    SAMPLES:
    _______________________________________
    ORIGINAL : {} \n\t {} \n
    INVERSE  : {} \n\t {} \n
    -----------
    ORIGINAL JOINT : {} \n\t {} \n
    INVERSE  JOINT : {} \n\t {} \n
    """.format(prom_arr.shape, prom_arr[-1], inv_prom_arr.shape,
               inv_prom_arr[-1], prom_tetra_arr.shape, prom_tetra_arr[-1],
               inv_prom_tetra_arr.shape, inv_prom_tetra_arr[-1]))

        logs_arr = h1(
            logs_arr, "\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
                elapsed()))

        if (test_with_sample):
            logs_arr = h1(
                logs_arr,
                '(3.25/4) REDUCING DATASET FOR TESTING. FROM SIZE {} TO {}.'.
                format(prom_arr.shape, test_sample_size))
            prom_arr = prom_arr[:test_sample_size]
            inv_prom_arr = inv_prom_arr[:test_sample_size]
            prom_tetra_arr = prom_tetra_arr[:test_sample_size]
            inv_prom_tetra_arr = inv_prom_tetra_arr[:test_sample_size]

        logs_arr = h1(logs_arr,
                      "\t (3.5/4) CREATING JOINT TETRANUCLEOTIDE ARRAYS")

        prom_tetra_arr_str = tetranucleotide_list_to_string_list(
            prom_tetra_arr)
        inv_prom_tetra_arr_str = tetranucleotide_list_to_string_list(
            inv_prom_tetra_arr)

        logs_arr = h1(
            logs_arr, """
    SAMPLES:
    _______________________________________
    ORIGINAL : \n\t {} \n
    INVERSE  : \n\t {} \n
    """.format(prom_tetra_arr_str[-1], inv_prom_tetra_arr_str[-1]))
        logs_arr = h1(
            logs_arr, "\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
                elapsed()))

    else:
        # start positions of the windows that are actually cut out
        window_starts = range(0,
                              len(all_genome) - promoter_size - 1,
                              sliding_step)
        logs_arr = h1(
            logs_arr,
            "(1/4) CUTTING GENOME SEQUENCE INTO {} BP SEQUENCES USING SLIDING WINDOW OF STEP {}. TOTAL SAMPLES: {}. SAMPLES USING SLIDING STEP: {}"
            .format(promoter_size, sliding_step,
                    len(all_genome) - promoter_size - 1, len(window_starts)))
        prom_arr = np.array([
            all_genome[i:i + promoter_size]
            for i in progressbar.progressbar(window_starts)
        ])
        logs_arr = h1(
            logs_arr, "\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
                elapsed()))

        if (test_with_sample):
            logs_arr = h1(
                logs_arr,
                '(1.5/4) REDUCING DATASET FOR TESTING. FROM SIZE {} TO {}.'.
                format(prom_arr.shape, test_sample_size))
            prom_arr = prom_arr[:test_sample_size]
        else:
            logs_arr = h1(
                logs_arr,
                '\t DATA IS NOT FOR TESTING. RUNNING COMPLETE DATASET OF SIZE: {}. '
                .format(prom_arr.shape))
        logs_arr = h1(
            logs_arr, "\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
                elapsed()))

        logs_arr = h1(
            logs_arr,
            "(2/4) OBTAINING TETRANUCLEOTIDES FROM EACH 40 BP SEQUENCES")
        logs_arr = h1(
            logs_arr,
            """\t RESULTING SEQUENCES ARRAY OF SIZE {}. EACH SEQUENCE WITH SIZE OF {} bp. SAMPLE:  "{}"\n"""
            .format(len(prom_arr), len(prom_arr[-1]), prom_arr[-1]))
        prom_tetra_arr = split_into_kmers(prom_arr)
        prom_tetra_arr_str = tetranucleotide_list_to_string_list(
            prom_tetra_arr)
        logs_arr = h1(
            logs_arr, """
    SAMPLES:
    _______________________________________
    ORIGINAL: {} \n\t {} \n\n TETRANUCLEOTODES: {} \n\t {} \n
    """.format(prom_arr.shape, prom_arr[-1], prom_tetra_arr_str.shape,
               prom_tetra_arr_str[-1]))
        logs_arr = h1(
            logs_arr, "\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
                elapsed()))

        logs_arr = h1(logs_arr, "(3/4) GENERATING INVERSE SEQUENCES ")
        # reversing and then complementing gives the reverse complement
        inv_prom_arr = np.array(
            [str(Seq(seq[::-1]).complement()) for seq in prom_arr])
        inv_prom_tetra_arr = split_into_kmers(inv_prom_arr)
        inv_prom_tetra_arr_str = tetranucleotide_list_to_string_list(
            inv_prom_tetra_arr)
        logs_arr = h1(
            logs_arr, "\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
                elapsed()))
        logs_arr = h1(
            logs_arr, """
    SAMPLES:
    _______________________________________
    ORIGINAL : {} \n\t {} \n
    INVERSE  : {} \n\t {} \n
    -----------
    ORIGINAL JOINT : {} \n\t {} \n
    INVERSE  JOINT : {} \n\t {} \n
    """.format(prom_arr.shape, prom_arr[-1], inv_prom_arr.shape,
               inv_prom_arr[-1], prom_tetra_arr_str.shape,
               prom_tetra_arr_str[-1], inv_prom_tetra_arr_str.shape,
               inv_prom_tetra_arr_str[-1]))
        logs_arr = h1(
            logs_arr, "\t TIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
                elapsed()))

    X, X_inv = None, None
    if (data_output_type == "RNN"):
        logs_arr = h1(logs_arr, "\t LOADING RNN TOKENIZER")
        # NOTE(review): unpickling executes arbitrary code; only load
        # tokenizer files from a trusted source.
        with open(tokenizer_path, 'rb') as tokenizer_handle:
            tokenizer = pickle.load(tokenizer_handle)
        logs_arr = h1(logs_arr, "\t TOKENIZER   : {}".format(tokenizer))

        logs_arr = h1(
            logs_arr,
            "(4/5) PARSING PROMOTER 40BP SEQUENCES TO RNN TOKEN SEQUENCES")
        tetra_tokens = np.array(
            tokenizer.texts_to_sequences(prom_tetra_arr_str))
        inv_tetra_tokens = np.array(
            tokenizer.texts_to_sequences(inv_prom_tetra_arr_str))
        X, X_inv = tetra_tokens, inv_tetra_tokens
        logs_arr = h1(
            logs_arr, """
    TOKEN SAMPLES:
    _______________________________________
    ORIGINAL : {} \n\t {} \n\t {} \n
    -----------
    INVERSE  : {} \n\t {} \n\t {} \n
    """.format(X.shape, prom_arr[-1], X[-1], X_inv.shape, X_inv[-1],
               inv_prom_arr[-1]))
    elif (data_output_type == "RF-HOT"):
        logs_arr = h1(
            logs_arr,
            "(4/5)  PARSING PROMOTER 40BP SEQUENCES TO HOT ENCODING FORMAT")
        tetra_hot = tetraToHotEncoding(prom_arr)
        inv_tetra_hot = tetraToHotEncoding(inv_prom_arr)
        X, X_inv = tetra_hot, inv_tetra_hot
        logs_arr = h1(
            logs_arr, """
    HOT ENCODING SAMPLES:
    _______________________________________
    ORIGINAL : {} \n\t {} \n\t {} \n
    -----------
    INVERSE  : {} \n\t {} \n\t {} \n
    """.format(X.shape, prom_arr[-1], X[-1], X_inv.shape, X_inv[-1],
               inv_prom_arr[-1]))
    elif (data_output_type == "RF-TETRA"):
        logs_arr = h1(
            logs_arr,
            "(4/5) PARSING PROMOTER-40BP SEQUENCES TO TETRA-FREQUENCIES FORMAT"
        )
        tetra_freq = promoterToTetraFreq(prom_arr)
        inv_tetra_freq = promoterToTetraFreq(inv_prom_arr)
        X, X_inv = tetra_freq.values, inv_tetra_freq.values
    else:
        logs_arr = h1(logs_arr, "ERROR: NO DATA OUTPUT TYPE SPECIFIED.")
        # a plain string cannot be raised; use a real exception
        raise ValueError("ERROR: NO DATA OUTPUT TYPE SPECIFIED.")

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    if not os.path.exists("{}/{}".format(output_dir, output_name)):
        os.makedirs("{}/{}".format(output_dir, output_name))
    if (save_to_disk):
        logs_arr = h1(logs_arr, "(5/5) SAVING FILES")

        logs_arr = h1(logs_arr, "\tSAVING X: {}".format(X.shape))
        joblib.dump(
            X, "{}/{}/{}.data".format(output_dir, output_name,
                                      data_output_type))

        logs_arr = h1(logs_arr, "\tSAVING X_INV: {}".format(X_inv.shape))
        joblib.dump(
            X_inv, "{}/{}/{}_INV.data".format(output_dir, output_name,
                                              data_output_type))
    else:
        logs_arr = h1(logs_arr, "NOT SAVING TO DISK.")
    logs_arr = h1(
        logs_arr, "\tTIME ELAPSED FROM START (HOUR:MIN:SEC): {}".format(
            elapsed()))

    with open(
            "{}/{}/{}_LOGS.txt".format(output_dir, output_name,
                                       data_output_type), 'w') as f:
        for item in logs_arr:
            f.write("%s\n" % item)
    return X, X_inv
Esempio n. 54
0
def get_matches(seq_list, fwd_primer, rev_primer, expected_size,
                partial=False, strand="+"):
    """Search every record for a primer pair and collect full or possible hits.

    Both primers are compiled as case-insensitive regular expressions.  For
    ``strand == "+"`` the forward pattern is ``fwd_primer`` and the reverse
    pattern is the reverse complement of ``rev_primer``; for ``strand == "-"``
    the two primers swap roles.

    A record yields a hit when:

    * both patterns are found (reported as "Match found"), or
    * only the forward pattern is found and more than ``expected_size``
      bases lie between the start of that match and the end of the record, or
    * only the reverse pattern is found and its match starts at or beyond
      ``expected_size``.

    The last two cases are reported as "Possible match" and carry ``None``
    for the coordinates of the primer that was not found.  Progress messages
    are written to ``sys.stderr``.

    Args:
        seq_list: list of ``SeqRecord`` objects to scan.  An empty list
            yields an empty result.
        fwd_primer: forward primer, used as regex pattern text.
        rev_primer: reverse primer, used as regex pattern text.
        expected_size: amplicon size threshold used for one-primer hits.
        partial: value stored unchanged on every ``PcrHit``.
        strand: ``"+"`` or ``"-"``; stored as ``template_orientation``.

    Returns:
        A list of ``PcrHit`` objects, in the order the records were scanned.

    Raises:
        AssertionError: if ``strand`` is not ``"+"``/``"-"`` or the first
            element of ``seq_list`` is not a ``SeqRecord``.
    """
    # assert logger is not None, "must use logger!"
    assert strand in ["-", "+"], "strand must be either + or -"
    if not seq_list:
        # Nothing to scan; indexing seq_list[0] below would raise IndexError.
        return []
    assert isinstance(seq_list[0], SeqRecord), "must submit list of SeqRecords"

    if strand == "+":
        fwd_source, rev_source = fwd_primer, rev_primer
    else:
        fwd_source, rev_source = rev_primer, fwd_primer
    fwd = re.compile(fwd_source, re.IGNORECASE)
    rev = re.compile(str(Seq(rev_source).reverse_complement()), re.IGNORECASE)

    matches = []
    for record in seq_list:
        template = str(record.seq)

        # Pattern.search returns None when nothing matches, so test for that
        # explicitly instead of catching the AttributeError from None.span().
        fwd_hit = fwd.search(template)
        coords_F = fwd_hit.span() if fwd_hit is not None else None
        if coords_F is not None:
            sys.stderr.write("F match!\n")

        rev_hit = rev.search(template)
        coords_R = rev_hit.span() if rev_hit is not None else None
        if coords_R is not None:
            sys.stderr.write("R match!\n")

        if coords_F is not None and coords_R is not None:
            sys.stderr.write("Match found on %s (%s)\n" % (record.id, strand))
            matches.append(PcrHit(
                template_orientation=strand,
                template_id=record.id,
                F_start=coords_F[0],
                R_start=coords_R[0],
                F_end=coords_F[1],
                R_end=coords_R[1],
                partial=partial
            ))
        elif coords_F is not None:
            # Forward primer only: keep it if enough sequence follows the
            # match start for the amplicon to run off the end of the record.
            if len(template) - coords_F[0] > expected_size:
                sys.stderr.write(
                    "Possible match on %s (%s)\n" % (record.id, strand))
                matches.append(PcrHit(
                    template_orientation=strand,
                    template_id=record.id,
                    F_start=coords_F[0],
                    R_start=None,
                    F_end=coords_F[1],
                    R_end=None,
                    partial=partial
                ))
        elif coords_R is not None:
            # Reverse primer only: keep it unless it starts within the first
            # expected_size bases of the record.
            if coords_R[0] >= expected_size:
                sys.stderr.write(
                    "Possible match on %s (%s)\n" % (record.id, strand))
                matches.append(PcrHit(
                    template_orientation=strand,
                    template_id=record.id,
                    F_start=None,
                    R_start=coords_R[0],
                    F_end=None,
                    R_end=coords_R[1],
                    partial=partial
                ))
    return matches
Esempio n. 55
0
 def hash2():
     """Call ``__hash__`` directly on a freshly built one-base SeqRecord.

     NOTE(review): presumably handed to an assertRaises-style check by the
     enclosing test, which is not visible here -- confirm.
     """
     record = SeqRecord(Seq("A"))
     record.__hash__()
Esempio n. 56
0
    def setUp(self):
        """Build the shared fixture used by the tests in this class.

        Sets up, in order: the config, data-file paths derived from
        ``__file__``, the set of known signature names, fake HSP hits per
        gene, dummy CDS features per gene, the allowed rule identifiers,
        the parsed rules, the start-sorted feature list, and a 150000-base
        record carrying those features.
        """
        self.config = build_config([])

        # File locations are built from this module's __file__.
        self.rules_file = path.get_full_path(
            __file__, "..", "cluster_rules", "strict.txt")
        self.signature_file = path.get_full_path(
            __file__, "..", "data", "hmmdetails.txt")
        self.signature_names = {
            profile.name for profile in core.get_signature_profiles()}
        self.filter_file = path.get_full_path(
            __file__, "..", "filterhmmdetails.txt")

        # Two fake hits per gene; every hit shares the same numeric fields.
        models_by_gene = [
            ("GENE_1", ("modelA", "modelB")),
            ("GENE_2", ("modelC", "modelB")),
            ("GENE_3", ("modelC", "modelF")),
            ("GENE_4", ("modelA", "modelE")),
            ("GENE_5", ("modelA", "modelG")),
        ]
        self.results_by_id = {
            gene: [FakeHSPHit(model, gene, 0, 10, 50, 0) for model in models]
            for gene, models in models_by_gene
        }

        # (locus tag, start, end) for each dummy CDS.
        cds_spans = [
            ("GENE_1", 0, 30000),
            ("GENE_2", 30000, 50000),
            ("GENE_3", 70000, 90000),
            ("GENE_X", 95000, 100000),  # no hits
            ("GENE_4", 125000, 140000),
            ("GENE_5", 130000, 150000),
        ]
        self.feature_by_id = {
            tag: DummyCDS(begin, end, locus_tag=tag)
            for tag, begin, end in cds_spans
        }

        self.test_names = {
            "modelA", "modelB", "modelC", "modelF", "modelG",
            "a", "b", "c", "d",
        }

        rule_lines = [
            "RULE MetaboliteA CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS modelA",
            "RULE MetaboliteB CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS cds(modelA and modelB)",
            "RULE MetaboliteC CUTOFF 10 NEIGHBOURHOOD 5 CONDITIONS (modelA and modelB)",
            "RULE MetaboliteD CUTOFF 20 NEIGHBOURHOOD 5 CONDITIONS minimum(2,[modelC,modelB]) and modelA",
            "RULE Metabolite0 CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelF",
            "RULE Metabolite1 CUTOFF 1 NEIGHBOURHOOD 3 CONDITIONS modelG",
        ]
        rule_text = "\n".join(rule_lines)
        self.rules = rule_parser.Parser(rule_text, self.test_names).rules

        # Sorting by start is vital for py3 < 3.5 (dict order not guaranteed).
        self.features = sorted(
            self.feature_by_id.values(),
            key=lambda cds: cds.location.start)

        self.record = Record()
        self.record._record.seq = Seq("A" * 150000)
        for cds in self.features:
            self.record.add_cds_feature(cds)
Esempio n. 57
0
 def ge():
     """Evaluate ``>=`` between two separately built one-base SeqRecords.

     The comparison result is discarded.
     NOTE(review): presumably handed to an assertRaises-style check by the
     enclosing test, which is not visible here -- confirm.
     """
     left = SeqRecord(Seq("A"))
     right = SeqRecord(Seq("A"))
     left >= right
Esempio n. 58
0
 def test_valid_id(self):
     """Constructing a SeqRecord with a dict as ``id`` must raise TypeError."""
     bad_id = {}
     with self.assertRaises(TypeError):
         SeqRecord(Seq("ACGT"), id=bad_id)
Esempio n. 59
0
 def gt():
     """Evaluate ``>`` between two separately built one-base SeqRecords.

     The comparison result is discarded.
     NOTE(review): presumably handed to an assertRaises-style check by the
     enclosing test, which is not visible here -- confirm.
     """
     left = SeqRecord(Seq("A"))
     right = SeqRecord(Seq("A"))
     left > right
Esempio n. 60
0
 def hash1():
     """Pass a freshly built one-base SeqRecord to the built-in ``hash``.

     NOTE(review): presumably handed to an assertRaises-style check by the
     enclosing test, which is not visible here -- confirm.
     """
     record = SeqRecord(Seq("A"))
     hash(record)