def load_fasta_file(input_file: str) -> Tuple[str, List]:
    """
    Read a FASTA file and work out whether it holds DNA or protein.

    The records are first parsed as ambiguous DNA; if any record fails
    alphabet verification the handle is rewound and the file is parsed
    again as extended protein (stop codons allowed).

    :param input_file: The path to the input fasta file (optionally gzipped).
    :returns: A tuple of the sequence type ('protein' or 'dna'), and the
        list of upper-cased SeqRecords.
    :raises ValueError: If the records verify as neither DNA nor protein.
    """
    gzipped = _is_gzipped(input_file)
    opener = gzip.open if gzipped else open
    mode = 'rt' if gzipped else 'r'

    def _parse(handle, alphabet):
        # Upper-case every record so verification is case-insensitive.
        return [
            record.upper()
            for record in SeqIO.parse(
                handle=handle, format='fasta', alphabet=alphabet)
        ]

    def _all_valid(records):
        return all(_verify_alphabet(record.seq) for record in records)

    with opener(input_file, mode) as handle:
        seqs = _parse(handle, IUPAC.ambiguous_dna)
        if _all_valid(seqs):
            return 'dna', seqs

        # Not DNA: rewind and retry with the protein alphabet.
        handle.seek(0)
        seqs = _parse(handle, HasStopCodon(IUPAC.extended_protein))
        if not _all_valid(seqs):
            raise ValueError(
                'Invalid input file (neither DNA nor protein FASTA).')
        return 'protein', seqs
def __init__(self, strand1, strand2=None, align=(0, 0)):
    """
    Constructs a duplex from two strands.

    Parameters
    ----------
    strand1 : Biopython Seq object or string
        First strand
    strand2 : Biopython Seq object or string, optional
        Second strand. If not specified, the reverse complement of
        strand 1 is used.
    align : tuple of int, optional
        Intended to specify the beginning and end position of the duplex
        (position on strand 1 where the duplex begins, position on
        strand 2 where it ends). Default is (0,0).

    Raises
    ------
    ValueError
        If a strand fails alphabet verification after disambiguation.

    Notes
    -----
    If a hybrid DNA-RNA duplex is specified, the seq attribute will be
    the DNA strand and the cseq attribute will be the RNA strand,
    regardless of the order in which the strands were given.

    Dangling-end handling is not implemented yet, so 'align' is accepted
    but currently has no effect on the stored strands.
    """
    s1 = _disambiguate(strand1)
    if _verify_alphabet(s1) is False:
        raise ValueError("Couldn't identify strand 1 %s as "
                         "unambiguous DNA or RNA" % str(strand1))

    if strand2 is None:
        s2 = s1.reverse_complement()
    else:
        s2 = _disambiguate(strand2)
        if _verify_alphabet(s2) is False:
            raise ValueError("Couldn't identify strand 2 %s as "
                             "unambiguous DNA or RNA" % str(strand2))

    # TODO: calculate dangling ends from 'align' (not implemented).

    # For DNA-RNA hybrids, use the DNA strand as seq and the
    # RNA strand as cseq
    is_rna_dna_hybrid = (s1.alphabet == IUPAC.unambiguous_rna
                         and s2.alphabet == IUPAC.unambiguous_dna)
    if is_rna_dna_hybrid:
        self.seq = s2
        self.cseq = s1
    else:
        self.seq = s1
        self.cseq = s2
def read(self, input_handle):
    """Collect every pattern found in the given handle, one per line.

    Lines are pulled with ``readline`` until it returns a falsy value.
    Trailing whitespace is stripped; a line containing ``self.separator``
    is split into a tuple (a signature), any other line is kept as a
    plain string. When ``self._alphabet`` is set, every piece is checked
    against it and ValueError is raised on the first mismatch.
    """
    collected = []
    line = input_handle.readline()
    while line:
        entry = line.rstrip()

        # A separator marks a signature: keep its parts together as a tuple.
        if self.separator in entry:
            entry = tuple(entry.split(self.separator))

        if self._alphabet is not None:
            # Treat a lone pattern as a one-item signature so both shapes
            # go through the same check.
            pieces = entry if isinstance(entry, tuple) else [entry]
            for piece in pieces:
                if not _verify_alphabet(Seq(piece, self._alphabet)):
                    raise ValueError("Pattern %s not matching alphabet %s"
                                     % (entry, self._alphabet))

        collected.append(entry)
        line = input_handle.readline()
    return collected
def similarity_score(matrix, ref, query):
    """
    Similarity for pseudosequences using a substitution matrix.

    Args:
        matrix: subs matrix as dictionary of dictionaries, indexed as
            matrix[a][b]
        ref: reference sequence (string)
        query: query sequence (string)
    Returns:
        the ref/query score divided by the square root of the product of
        the two self-scores, or None when either input is not a string or
        the query fails protein alphabet verification.
    """
    if not isinstance(ref, str) or not isinstance(query, str):
        return None

    # Only the query is checked against the protein alphabet.
    if _verify_alphabet(Seq(query, alphabet=IUPAC.IUPACProtein)) is False:
        return None

    def _paired_score(first, second):
        # Positions where either side is a gap contribute nothing.
        return sum(matrix[a][b] for a, b in zip(first, second)
                   if a != '-' and b != '-')

    cross = _paired_score(ref, query)
    ref_self = _paired_score(ref, ref)
    query_self = _paired_score(query, query)

    # Normalise the score by the geometric mean of the self-scores.
    return cross / np.sqrt(ref_self * query_self)
def _get_signature_dict(self, seq_records, sig_size, max_gap):
    """Return a dictionary with all signatures and their counts.

    This internal function does all of the hard work for the
    find_signatures function. A signature is a pair of sig_size-long
    pieces; every pair found is handed to self._add_sig together with
    the running dictionary.

    When self._alphabet_strict is set, the alphabet of the first record
    is used as reference: every record must carry that same alphabet and
    only pairs whose two pieces verify against it are counted.
    """
    if self._alphabet_strict:
        alphabet = seq_records[0].seq.alphabet
    else:
        alphabet = None

    # loop through all records to find signatures
    all_sigs = {}
    for seq_record in seq_records:
        # if we are working with alphabets, make sure we are consistent
        if alphabet is not None:
            assert seq_record.seq.alphabet == alphabet, \
                "Working with alphabet %s and got %s" % \
                (alphabet, seq_record.seq.alphabet)

        # now start finding signatures in the sequence
        largest_sig_size = sig_size * 2 + max_gap
        for start in range(len(seq_record.seq) - (largest_sig_size - 1)):
            # find the first part of the signature; str() replaces the
            # removed Seq.tostring() method
            first_sig = str(seq_record.seq[start:start + sig_size])

            # now find all of the second parts of the signature
            for second in range(start + 1, (start + 1) + max_gap):
                second_sig = str(seq_record.seq[second:second + sig_size])

                # if we are being alphabet strict, make sure both parts
                # of the sig fall within the specified alphabet
                if alphabet is not None:
                    first_seq = Seq(first_sig, alphabet)
                    second_seq = Seq(second_sig, alphabet)
                    if _verify_alphabet(first_seq) \
                            and _verify_alphabet(second_seq):
                        all_sigs = self._add_sig(all_sigs,
                                                 (first_sig, second_sig))
                # if we are not being strict, just add the motif
                else:
                    all_sigs = self._add_sig(all_sigs,
                                             (first_sig, second_sig))
    return all_sigs
def _get_signature_dict(self, seq_records, sig_size, max_gap):
    """Return a dictionary with all signatures and their counts.

    Workhorse behind find_signatures. For every start position in every
    record, the sig_size-long piece at that position is paired with the
    sig_size-long pieces starting at each of the following max_gap
    positions, and each pair is passed to self._add_sig.

    In alphabet-strict mode the first record's alphabet is the reference:
    all records must share it, and a pair is only counted when both
    pieces verify against it.
    """
    alphabet = seq_records[0].seq.alphabet if self._alphabet_strict else None

    all_sigs = {}
    for record in seq_records:
        sequence = record.seq

        # Mixed alphabets are rejected when strict checking is on.
        if alphabet is not None:
            assert sequence.alphabet == alphabet, \
                "Working with alphabet %s and got %s" % \
                (alphabet, sequence.alphabet)

        # Longest stretch a signature may span.
        largest_sig_size = sig_size * 2 + max_gap
        last_start = len(sequence) - (largest_sig_size - 1)

        for start in range(last_start):
            first_sig = str(sequence[start:start + sig_size])

            for second in range(start + 1, start + 1 + max_gap):
                second_sig = str(sequence[second:second + sig_size])

                if alphabet is not None:
                    both_valid = (
                        _verify_alphabet(Seq(first_sig, alphabet))
                        and _verify_alphabet(Seq(second_sig, alphabet))
                    )
                    if not both_valid:
                        continue

                all_sigs = self._add_sig(all_sigs, (first_sig, second_sig))
    return all_sigs
def clean_dna_sequence(sequence, strict=False, alphabet=None):
    """Strip all whitespace from a sequence and optionally validate it.

    With strict set, the upper-cased sequence is verified against
    alphabet (IUPAC.unambiguous_dna when none is given) and an Exception
    is raised if verification fails. The returned sequence keeps its
    original case.
    """
    sequence = re.sub(r'\s+', '', sequence.strip())
    if not strict:
        return sequence

    if alphabet is None:
        alphabet = IUPAC.unambiguous_dna

    candidate = Seq(sequence.upper(), alphabet)
    if _verify_alphabet(candidate):
        return sequence

    # throws exception if DNA is not valid
    raise Exception("Sequence %s contains illegal character. Expecting %s only."
                    % (sequence, alphabet.letters))
def checkProtein(self):
    """Check that every record in every proteome file is a protein.

    Runs self.checkExtension() first, then parses each file listed in
    self.proteomeList as FASTA. '*' characters are removed from each
    sequence before it is verified against IUPAC.extended_protein.

    Raises TypeError naming the file and record id of the first sequence
    that fails verification.
    """
    self.checkExtension()
    for proteome_path in self.proteomeList:
        for record in SeqIO.parse(proteome_path, "fasta"):
            # str.replace behaves the same on Python 2 and 3, unlike the
            # two-argument str.translate(None, '*') form.
            residues = str(record.seq).replace('*', '')
            candidate = Seq(residues, IUPAC.extended_protein)
            if _verify_alphabet(candidate) is False:
                raise TypeError(
                    "In file %s, sequence %s is not a protein"
                    % (proteome_path, record.id))
def setSequenceRaw_slot(self, rawseq):
    """Set the current sequence to the given one.

    Whitespace is removed and the text upper-cased before it is checked
    against IUPAC.unambiguous_dna. On success the sequence is stored in
    self.currentSequence and announced through self.window.gotSequence;
    otherwise an error message showing the first 64 characters is
    reported through self.errorMessage.
    """
    rawseq = re.sub(r'\s', '', rawseq.upper())

    # Only the construction/verification is guarded, so a ValueError
    # raised while emitting the signal is not misreported as bad DNA.
    try:
        seq = Seq(rawseq, IUPAC.unambiguous_dna)
        is_dna = _verify_alphabet(seq)
    except ValueError:
        is_dna = False

    if not is_dna:
        self.errorMessage("This doesn't look like DNA!\n\n" +
                          str(rawseq[0:64]) + "...\n")
        return

    self.currentSequence = seq
    self.window.gotSequence.emit(str(self.currentSequence))
def single_prediction(seq):
    """Predict the optimal temperature and pH for one protein sequence.

    Sequences of 20 characters or fewer, or ones failing IUPAC.protein
    verification, yield a pair of 'Not a Valid Protein Sequence!'
    strings. Otherwise the sequence is repaired, features are extracted,
    and the pair (temperature, pH) is returned from the first prediction
    of each model.
    """
    rejection = ('Not a Valid Protein Sequence!',
                 'Not a Valid Protein Sequence!')

    # Length is tested first so short input never reaches verification.
    if len(seq) <= 20:
        return rejection
    if not _verify_alphabet(Seq(seq, IUPAC.protein)):
        return rejection

    repaired = seq_repair(seq)
    full_features = np.array(feature_extraction(repaired))

    ph_features = ph_f_selection(full_features)
    opt_ph = phpredict(ph_features)[0]

    temp_features = temp_f_selection(full_features)
    opt_temp = tempredict(temp_features)[0]

    return opt_temp[0], opt_ph[0]
def clean_sequence(sequence, strict=False, alphabet=None, exception=True):
    """Strip all whitespace from a sequence and optionally validate it.

    With strict set, the upper-cased sequence is verified against
    alphabet (IUPAC.unambiguous_dna when none is given). A failed check
    raises Exception when exception is exactly True and returns None
    otherwise. The returned sequence keeps its original case.
    """
    sequence = re.sub(r'\s+', '', sequence.strip())
    if not strict:
        return sequence

    if alphabet is None:
        alphabet = IUPAC.unambiguous_dna

    if _verify_alphabet(Seq(sequence.upper(), alphabet)):
        return sequence

    # Invalid sequence: either complain loudly or signal with None.
    if exception is True:
        raise Exception("Sequence %s contains illegal character. Expecting %s only."
                        % (sequence, alphabet.letters))
    return None
def __validate(self, seqdata):
    """Parse the user-supplied FASTA input and keep only valid proteins.

    Args:
        seqdata: a string (path to an existing file) or a file-like
            object exposing both `read` and `name`.

    Returns:
        self.sequences, extended with one (name, data) namedtuple per
        parsed record.

    Raises:
        TypeError: if the input is neither a string nor file-like.
        OSError: if a string is given that is not an existing file.
        ParseError: if a record fails IUPACProtein alphabet verification.
    """
    _Sequence = namedtuple('Seq', ['name', 'data'])

    # isinstance(obj, file) does not hold in Py3, so duck-type instead.
    looks_like_file = hasattr(seqdata, 'read') and hasattr(seqdata, 'name')
    if looks_like_file:
        self.logger.debug('Reading data from file-like object {}'.format(seqdata.name))
        fname = seqdata.name
    elif isinstance(seqdata, basestring):
        self.logger.debug('Reading data from file path {}'.format(seqdata))
        fname = seqdata
        if not os.path.isfile(fname):
            raise OSError('Sequence file not found: {}'.format(seqdata))
    else:
        raise TypeError('Sequence input format not recognized: {}'.format(seqdata))

    # Kept as named values in case more alphabets/formats are supported later.
    _seq_alphabet = IUPACProtein()
    _seq_format = 'fasta'

    parsed = SeqIO.parse(seqdata, _seq_format, alphabet=_seq_alphabet)
    for position, record in enumerate(parsed, start=1):
        if not _verify_alphabet(record.seq):
            msg = 'Entry #{} ({}) in {} is not a valid protein sequence'
            raise ParseError(msg.format(position, record.name, fname))
        self.sequences.append(_Sequence(record.name, str(record.seq)))

    return self.sequences
def obtain_seq_from_ent(ent):
    """Fetch the protein sequence for an entry id as a string.

    The entry is first requested from the Entrez "protein" database; if
    anything goes wrong there, pydpi's GetProteinSequence is used
    instead. The sequence is returned even when it fails IUPAC.protein
    verification; in that case a warning is printed first.
    """
    from Bio import Entrez, SeqIO
    Entrez.email = "*****@*****.**"
    try:
        handle = Entrez.efetch(db="protein", id=ent, rettype="gp")
        try:
            record = SeqIO.read(handle, "gb")
        finally:
            # Release the network handle whether or not parsing worked.
            handle.close()
        seq = str(record.seq)
    except Exception:
        # Best-effort fallback; KeyboardInterrupt/SystemExit now propagate.
        from pydpi import pypro
        seq = str(pypro.GetProteinSequence(ent))

    if not _verify_alphabet(Seq(seq, IUPAC.protein)):
        print('Inserted Entry Is Not Valid!')
    return seq
def site2dna(site):
    """Turn "site" into a DNA sequence.

    site is a str
    returns a Bio.Seq.Seq

    A name found in enzymedict is replaced by that enzyme's recognition
    site. Anything else is taken as literal DNA and must pass the strict
    (unambiguous) alphabet check, otherwise ValueError is raised.
    """
    if site in enzymedict:
        return Seq(enzymedict[site]["site"], unambiguous_dna)

    literal = Seq(site, unambiguous_dna)
    if _verify_alphabet(literal):
        return literal
    raise ValueError("site is not recognized enzyme and not strict DNA")
def _user_submits(
        self, seq_str, island_size_str, min_gc_ratio_str,
        min_obs_exp_cpg_ratio_str, algo_index):
    """Handle a form submission: validate the fields, then run the model.

    Every problem is reported through ``self.view.show_error`` and stops
    processing; the fields are checked in the order sequence, island
    size, GC ratio, observed/expected CpG ratio.

    :param seq_str: the sequence as a string
    :type seq_str: :class:`str`
    :param island_size_str: number of bases which an island may contain
    :type island_size_str: :class:`str`
    :param min_gc_ratio_str: the ratio of GC to other bases
    :type min_gc_ratio_str: :class:`str`
    :param min_obs_exp_cpg_ratio_str: minimum observed/expected CpG ratio
    :type min_obs_exp_cpg_ratio_str: :class:`str`
    :param algo_index: the algorithm chosen
    :type algo_index: :class:`int`
    """
    seq = Seq(seq_str, IUPAC.unambiguous_dna).upper()

    # `_verify_alphabet' is private to Biopython, but there is no other
    # documented way to verify the sequence.
    if not _verify_alphabet(seq):
        # NOTE(review): message literal reproduced as it appears in this
        # view; confirm its internal line breaks against the real file.
        self.view.show_error(
            '''Sequence letters not within alphabet: Alphabet: {0} Sequence: {1}'''.format(
                seq.alphabet.letters, str(seq)))
        return

    # (converter, raw text, complaint) for each numeric field, in order.
    numeric_fields = (
        (int, island_size_str,
         'Invalid integer for island size: {0}'),
        (float, min_gc_ratio_str,
         'Invalid ratio for GC: {0}'),
        (float, min_obs_exp_cpg_ratio_str,
         'Invalid ratio for minimum observed/expected '
         'CpG ratio: {0}'),
    )
    parsed = []
    for convert, raw_text, complaint in numeric_fields:
        try:
            parsed.append(convert(raw_text))
        except ValueError:
            self.view.show_error(complaint.format(raw_text))
            return

    island_size, min_gc_ratio, min_obs_exp_cpg_ratio = parsed
    self.model.compute_islands(
        SeqRecord(seq), island_size, min_gc_ratio,
        min_obs_exp_cpg_ratio, algo_index)
def get_Short(genesList):
    """Creates a short version of each fasta file with only the 1st allele.

    For every path in genesList a sibling "short" directory is created
    if needed and "<name>_short.fasta" is written there, holding the id
    and upper-cased sequence of the first record.

    Returns True once all files are written.

    Raises Exception when genesList is empty, when a file holds no FASTA
    record, or when the first record fails IUPAC.unambiguous_dna
    verification.
    """
    if not genesList:
        raise Exception("An empty list was provided. Stopping execution...")

    for gene in genesList:
        gene_dir, gene_file = os.path.split(gene)
        short_dir = os.path.join(gene_dir, "short")
        if not os.path.exists(short_dir):
            os.makedirs(short_dir)

        # Rename only the file name, so a directory that happens to
        # contain ".fasta" is left untouched.
        shortgene = os.path.join(
            short_dir, gene_file.replace(".fasta", "_short.fasta"))

        first_allele = next(
            SeqIO.parse(gene, "fasta", IUPAC.unambiguous_dna), None)
        if first_allele is None:
            raise Exception(
                "No FASTA records found in %s. Stopping execution..." % gene)

        allele_seq = first_allele.seq.upper()
        if not _verify_alphabet(allele_seq):
            message = ("The DNA sequence has invalid nucleotides. "
                       "Stopping execution...")
            print(message)
            raise Exception(message)

        with open(shortgene, "w") as fG:
            fG.write('>' + str(first_allele.id) + '\n' +
                     str(allele_seq) + '\n')
    return True
def clean(self):
    """Validate the uploaded fasta file as a set of reference genomes.

    An upload without any parsable record is attached to the 'fasta'
    field and raised immediately. Otherwise three checks run and their
    failures are raised together: reference lengths within 500 nt of the
    first reference, between 1 and 100 references, and every sequence
    verifying against the unambiguous-DNA-plus-'N' alphabet.
    """
    cleaned_data = super(JobForm, self).clean()
    fasta = cleaned_data['fasta']
    problems = []

    def _flag(message):
        problems.append(ValidationError(message, code='invalid'))

    alphabet = AlphabetEncoder(IUPAC.unambiguous_dna, 'N')
    references = list(SeqIO.parse(fasta, 'fasta', alphabet=alphabet))

    if not references:
        # Nothing to inspect further: report on the field and stop here.
        missing = ValidationError(
            "File does not contain any valid fasta records. Descriptor line "
            "should start with >"
        )
        self.add_error('fasta', missing)
        problems.append(missing)
        raise ValidationError(problems)

    primary_ref_len = len(references[0])
    length_mismatch = any(
        abs(len(ref) - primary_ref_len) > 500 for ref in references)
    if length_mismatch:
        _flag(
            "One or more of your references is too different in length to "
            "the primary (first) reference. The maximum difference is 500 nt")

    if not (1 <= len(references) <= 100):
        _flag(
            "Between 1 and 100 reference genomes are required in your fasta file. "
            "We recommend selecting a candidate reference from each lineage "
            "of interest, rather than many similar references.")

    has_invalid_codes = any(
        not _verify_alphabet(ref.seq) for ref in references)
    if has_invalid_codes:
        _flag(
            "One or more of your fasta sequences contain invalid nucleotide codes. "
            "The supported alphabet is '{}'. Ambiguity codes and gaps are not "
            "currently supported.".format(alphabet.letters))

    if problems:
        raise ValidationError(problems)
    return cleaned_data
def _user_submits(self, seq_str, island_size_str, min_gc_ratio_str,
                  min_obs_exp_cpg_ratio_str, algo_index):
    """React to a submitted form.

    The sequence and the three numeric fields are validated in turn; the
    first failure is shown via ``self.view.show_error`` and ends the
    call. When everything parses, the model is asked to compute islands.

    :param seq_str: the sequence as a string
    :type seq_str: :class:`str`
    :param island_size_str: number of bases which an island may contain
    :type island_size_str: :class:`str`
    :param min_gc_ratio_str: the ratio of GC to other bases
    :type min_gc_ratio_str: :class:`str`
    :param min_obs_exp_cpg_ratio_str: minimum observed/expected CpG ratio
    :type min_obs_exp_cpg_ratio_str: :class:`str`
    :param algo_index: the algorithm chosen
    :type algo_index: :class:`int`
    """

    def _convert(caster, text, complaint):
        # Returns (ok, value); on failure the complaint has been shown.
        try:
            return True, caster(text)
        except ValueError:
            self.view.show_error(complaint.format(text))
            return False, None

    seq = Seq(seq_str, IUPAC.unambiguous_dna).upper()

    # `_verify_alphabet' is marked private, but no other documented way
    # of verifying the sequence exists.
    if not _verify_alphabet(seq):
        # NOTE(review): message literal reproduced as it appears in this
        # view; confirm its internal line breaks against the real file.
        self.view.show_error('''Sequence letters not within alphabet: Alphabet: {0} Sequence: {1}'''.format(
            seq.alphabet.letters, str(seq)))
        return

    ok, island_size = _convert(
        int, island_size_str, 'Invalid integer for island size: {0}')
    if not ok:
        return

    ok, min_gc_ratio = _convert(
        float, min_gc_ratio_str, 'Invalid ratio for GC: {0}')
    if not ok:
        return

    ok, min_obs_exp_cpg_ratio = _convert(
        float, min_obs_exp_cpg_ratio_str,
        'Invalid ratio for minimum observed/expected '
        'CpG ratio: {0}')
    if not ok:
        return

    self.model.compute_islands(SeqRecord(seq), island_size, min_gc_ratio,
                               min_obs_exp_cpg_ratio, algo_index)
def process_fasta(file_path):
    """
    Parse and validate the fasta file.

    Each record is rebuilt with '-' characters removed, upper-cased and
    tagged with the unambiguous-DNA-plus-'N' alphabet. ValueError is
    raised if there are not between 1 and 100 references, if any
    reference differs from the first by more than 500 in length, or if
    any sequence fails alphabet verification. Returns the rebuilt list.
    """
    alphabet = AlphabetEncoder(IUPAC.unambiguous_dna, 'N')
    records = SeqIO.parse(file_path, 'fasta')  # may raise

    # Remove gaps, set alphabet
    references = [
        SeqRecord(
            Seq(str(record.seq).replace('-', '').upper(), alphabet),
            id=record.id,
            description=record.id)
        for record in records
    ]

    # Too few or too many references
    count = len(references)
    if count < 1 or count > 100:
        raise ValueError('Between 1 and 100 reference genomes are required.')

    # Max difference in length relative to the primary (first) reference
    primary_ref_len = len(references[0])
    for reference in references:
        if abs(len(reference) - primary_ref_len) > 500:
            raise ValueError(
                'One or more of your references is too different in length to '
                'the primary (first) reference. The maximum difference is '
                '500 nt.')

    # Valid alphabet
    for reference in references:
        if not _verify_alphabet(reference.seq):
            raise ValueError(
                'One or more of your fasta sequences contain invalid '
                "nucleotide codes. The supported alphabet is '{}'. "
                'Ambiguity codes and gaps are not currently supported.'.format(
                    alphabet.letters))

    return references
def verify_alphabet(records):
    """Report records whose sequence is not extended-protein.

    Each record's sequence is verified against IUPAC.extended_protein.
    For every record that fails, one line is printed: the record name, a
    tab, and the offending characters joined by '-' in order of
    appearance. Nothing is returned.
    """
    from Bio.Seq import Seq
    from Bio.Data import IUPACData
    from Bio.Alphabet import _verify_alphabet
    from Bio.Alphabet import IUPAC

    legal_letters = set(IUPACData.extended_protein_letters)

    for record in records:
        residues = str(record.seq)
        if _verify_alphabet(Seq(residues, IUPAC.extended_protein)):
            continue

        illegal_char = [letter for letter in residues
                        if letter not in legal_letters]
        # Parenthesised single argument prints the same on Python 2 and 3.
        print('%s\t%s' % (record.name, '-'.join(illegal_char)))
def _get_motif_dict(self, seq_records, motif_size):
    """Return a dictionary with information on motifs.

    Does the real work of motif finding: every motif_size-long window of
    every record is handed to self._add_motif together with the running
    dictionary, which is returned at the end. Kept internal so that
    find_motif_differences can reuse it.

    In alphabet-strict mode the first record's alphabet is the
    reference: all records must share it, and only windows that verify
    against it are counted.
    """
    alphabet = seq_records[0].seq.alphabet if self.alphabet_strict else None

    all_motifs = {}
    for record in seq_records:
        sequence = record.seq

        # Mixed alphabets are rejected when strict checking is on.
        if alphabet is not None:
            assert sequence.alphabet == alphabet, "Working with alphabet %s and got %s" % (
                alphabet,
                sequence.alphabet,
            )

        window_count = len(sequence) - (motif_size - 1)
        for start in range(window_count):
            motif = str(sequence[start:start + motif_size])

            # Skip windows that fall outside the alphabet when strict.
            if alphabet is not None and not _verify_alphabet(Seq(motif, alphabet)):
                continue

            all_motifs = self._add_motif(all_motifs, motif)
    return all_motifs
def _get_motif_dict(self, seq_records, motif_size):
    """Return a dictionary with information on motifs.

    Slides a motif_size-long window over every record and passes each
    window, with the running dictionary, to self._add_motif. The
    resulting dictionary is returned. This lives in its own internal
    function so find_motif_differences can share it.

    When self.alphabet_strict is set, the alphabet of the first record
    is enforced on all records and windows failing verification against
    it are left out.
    """
    if self.alphabet_strict:
        alphabet = seq_records[0].seq.alphabet
    else:
        alphabet = None

    def _acceptable(text):
        # Without strict checking every window is accepted as-is.
        if alphabet is None:
            return True
        return _verify_alphabet(Seq(text, alphabet))

    all_motifs = {}
    for seq_record in seq_records:
        if alphabet is not None:
            # make sure the records are consistent with each other
            assert seq_record.seq.alphabet == alphabet, \
                "Working with alphabet %s and got %s" % \
                (alphabet, seq_record.seq.alphabet)

        start = 0
        stop = len(seq_record.seq) - (motif_size - 1)
        while start < stop:
            motif = str(seq_record.seq[start:start + motif_size])
            if _acceptable(motif):
                all_motifs = self._add_motif(all_motifs, motif)
            start += 1
    return all_motifs
def count_alignment(alignment, columns=None, refidx=None, limit=100):
    """Count letters per column of an alignment.

    A leading batch of records is buffered to guess the alphabet (DNA,
    RNA, then amino acid); the alphabet is assigned onto each buffered
    record's ``seq`` while guessing. All records are then streamed,
    upper-cased, into ``_count`` together with a weight matrix that
    spreads each ambiguity code evenly over the letters it stands for.

    Returns ``(counts.transpose(), (letters, colors))``.
    Raises RuntimeError when no alphabet fits the buffered records.
    """
    aln = iter(alignment)
    records = []
    # Buffer records (index, record) for alphabet detection, skipping the
    # reference row.
    # NOTE(review): the size test runs before the append and uses `>`, so
    # up to limit + 1 records are buffered, and the record pulled from
    # `aln` on the iteration that breaks is never stored -- confirm.
    for i, r in enumerate(aln, start=len(records)):
        if len(records) > limit:
            break
        if i == refidx:
            continue
        records.append((i, r))
    alph = None
    # Try each alphabet in turn; the first one every buffered record
    # verifies against wins. The alphabet stays assigned on the records.
    for alph_ in (DNA_ALPHABET, RNA_ALPHABET, AMINO_ALPHABET):
        for _, r in records:
            r.seq.alphabet = alph_
        if all(_verify_alphabet(r.seq.upper()) for _, r in records):
            alph = alph_
            break
    if alph is None:
        raise RuntimeError('unknown alphabet')
    # NOTE(review): `skips` is built below but not used in this function.
    skips = _GAP
    if alph in (DNA_ALPHABET, RNA_ALPHABET):
        # DNA and RNA share everything except the T/U letter.
        T = 'T' if alph == DNA_ALPHABET else 'U'
        letters = 'ACG' + T
        ambigs = {
            'M': 'AC',
            'R': 'AG',
            'W': 'A' + T,
            'S': 'CG',
            'Y': 'C' + T,
            'K': 'G' + T,
            'V': 'ACG',
            'H': 'AC' + T,
            'D': 'AG' + T,
            'B': 'CG' + T,
        }
        skips += 'N'
        colors = DNA_COLORS
    elif alph == AMINO_ALPHABET:
        letters = 'ACDEFGHIKLMNPQRSTVWY'
        ambigs = {
            'B': 'DN',
            'J': 'IL',
            'Z': 'EQ',
        }
        skips += _STOP + 'X' + 'OU'
        colors = AMINO_COLORS
    else:
        raise RuntimeError("sequences with indeterminable alphabet provided")

    def b(r):
        # Upper-case helper applied to sequences and to the alphabet.
        return r.upper()

    def allrecords():
        # Replay the buffered records, then continue with what is left
        # in the shared iterator.
        for i, r in records:
            yield b(str(r.seq))
        # NOTE(review): `i` here is this generator's own local, i.e. the
        # index of the last buffered record (unbound if `records` is
        # empty), so numbering resumes from that value -- confirm the
        # refidx comparison still lines up.
        for i, r in enumerate(aln, start=i):
            if i == refidx:
                continue
            yield b(str(r.seq))

    # Rows: plain letters first, then ambiguity codes in sorted order.
    alphabet = letters + ''.join(sorted(ambigs.keys()))
    nchar = len(alphabet)
    values = np.zeros((nchar, len(letters)), dtype=float)
    for i, c in enumerate(alphabet):
        if i < len(letters):
            # A plain letter counts fully towards itself.
            values[i, i] = 1.
        else:
            # An ambiguity code is split evenly over its letters.
            v = 1. / len(ambigs[c])
            for d in ambigs[c]:
                values[i, alphabet.index(d)] = v
    if columns is not None:
        columns = np.array(columns, dtype=int)
    counts = _count(
        allrecords(),
        columns,
        b(alphabet),
        values
    )
    return counts.transpose(), (letters, colors)
def test_all_even_numbered_lines_are_dna_sequences(self):
    """Every second line, starting with the second, uses only GATCN."""

    class DNAdict():
        # Minimal stand-in alphabet: only the `letters` attribute is set.
        letters = 'GATCN'

    sequence_lines = self.lines[1::2]
    seq_list = [Seq(text, DNAdict) for text in sequence_lines]
    everything_valid = all(_verify_alphabet(candidate) for candidate in seq_list)
    self.assertTrue(everything_valid)
filename = infl.split("\t")[-1] name = filename.split(".")[0] dirpath = os.path.join(cudir + "/" + name) if os.path.exists(dirpath) and os.path.isdir(dirpath): shutil.rmtree(dirpath) os.mkdir(cudir + "/" + name) subprocess.call(["cp", infl, cudir + "/input_file/"]) fasta_sequence = SeqIO.parse(open(cudir + "/input_file/" + filename), "fasta") for fasta in fasta_sequence: name1, sequence = fasta.id, str(fasta.seq) my_prot = Seq(sequence, IUPAC.protein) if _verify_alphabet(my_prot) == False: with open(cudir + "/false_sequence444.fasta", "a") as handle: count = SeqIO.write(fasta, handle, "fasta") continue else: ##print(my_prot, _verify_alphabet(my_prot)) with open(cudir + "/" + name + "/" + name + ".fasta", "a") as handle: count = SeqIO.write(fasta, handle, "fasta") fasta_sequence = SeqIO.parse( open(cudir + "/" + name + "/" + name + ".fasta"), "fasta") for fasta in fasta_sequence: with open(name + "_Final_Resultant.txt", "a") as final_fl: with open(name + "_Final_Resultant_values.txt", "a") as inter_fl: #print fasta
def global_align(seq_record1, seq_record2):
    """Global alignment of two records using EMBOSS needle.

    Both records are upper-cased in place and written to temporary FASTA
    files. Their alphabet is then set to ambiguous DNA and, if that does
    not verify for both, to protein (using the _verify_alphabet function
    from the Bio.Alphabet module). The first alphabet that verifies for
    both records is left assigned and needle is run with gapopen=10 and
    gapextend=0.5.

    Returns the alignment parsed from needle's "srspair" output.
    Raises ValueError when the records verify as neither DNA nor protein.
    """
    from Bio.Alphabet import IUPAC
    from Bio.Alphabet import _verify_alphabet

    seq_record1.seq = seq_record1.seq.upper()
    seq_record2.seq = seq_record2.seq.upper()

    # The context managers make sure both temp files are closed (and so
    # removed) on every exit path.
    with NamedTemporaryFile() as seq1_file, NamedTemporaryFile() as seq2_file:
        SeqIO.write(seq_record1, seq1_file, "fasta")
        seq1_file.flush()
        SeqIO.write(seq_record2, seq2_file, "fasta")
        seq2_file.flush()

        for alphabet in (IUPAC.ambiguous_dna, IUPAC.protein):
            seq_record1.seq.alphabet = alphabet
            seq_record2.seq.alphabet = alphabet
            if not (_verify_alphabet(seq_record1.seq)
                    and _verify_alphabet(seq_record2.seq)):
                continue

            needle_cline = NeedleCommandline(asequence=seq1_file.name,
                                             bsequence=seq2_file.name,
                                             stdout=True,
                                             gapopen=10,
                                             gapextend=0.5,
                                             auto=True,
                                             aformat="srspair")
            stdout, stderr = needle_cline()
            return AlignIO.read(StringIO.StringIO(stdout), "emboss")

    # Raising a plain string is itself a TypeError; use a real exception.
    raise ValueError("unknown alphabet!")
def test_all_sequences_are_legitimate_dna_sequences(self):
    """Every value in self.seqs uses only the letters GATCN."""

    class DNAdict():
        # Minimal stand-in alphabet: only the `letters` attribute is set.
        letters = 'GATCN'

    seq_list = []
    for text in self.seqs.values():
        seq_list.append(Seq(text, DNAdict))

    everything_valid = all(_verify_alphabet(candidate) for candidate in seq_list)
    self.assertTrue(everything_valid)
import Bio.Seq
from Bio.Alphabet import _verify_alphabet

# Directory holding the FASTA files to screen.
part_name = '/home/nastia/Desktop/fasta/'
list_fasta = os.listdir(part_name)

# (title, sequence) pairs that verify as protein but as neither
# unambiguous RNA nor unambiguous DNA.
list_finish = []
for entry in list_fasta:
    # Open each file in a context manager so its handle is closed.
    with open(part_name + entry) as fasta_handle:
        for val in Bio.SeqIO.FastaIO.SimpleFastaParser(fasta_handle):
            my_seq = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_rna)
            my_seq_1 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_dna)
            my_seq_2 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.protein)
            if (not _verify_alphabet(my_seq)
                    and not _verify_alphabet(my_seq_1)
                    and _verify_alphabet(my_seq_2)):
                list_finish.append(val)
print(len(list_finish))
def build_fasta(self, organism, process_type, output_file):
    """Write a FASTA file of validated peptides from the IEDB input file.

    Rows of self.input_file are kept when "Name" contains organism,
    "Object Type" is "Linear peptide", "Process Type" equals
    process_type, "Qualitative Measure" is "Positive" and "Class" is
    "I". Peptides failing IUPAC.protein verification are dropped, and
    for duplicated peptides only the last row is kept.

    Each record is written to output_file as a header line
    ">epitope id|Antigen Name|antigen_id|Organism Name|organism_id"
    followed by the peptide on its own line.
    """
    # read IEDB input file
    iedb = pd.read_csv(self.input_file, skiprows=1)

    # filter entries; copy so the column assignments below act on an
    # independent frame rather than a slice of `iedb`
    wanted = ((iedb["Name"].str.contains(organism)) &
              (iedb["Object Type"] == "Linear peptide") &
              (iedb["Process Type"] == process_type) &
              (iedb["Qualitative Measure"] == "Positive") &
              (iedb["Class"] == "I"))
    filtered_iedb = iedb[wanted].copy()

    # parses peptides and validates them, non-valid peptides are filtered out
    filtered_iedb["seq"] = filtered_iedb["Description"].transform(
        lambda x: x.strip())
    filtered_iedb["valid_peptide"] = filtered_iedb["seq"].transform(
        lambda x: _verify_alphabet(Seq(x, IUPAC.protein)))
    filtered_iedb = filtered_iedb[filtered_iedb.valid_peptide].copy()

    # build fasta header: 449|FL-160-2 protein - Trypanosoma cruzi|JH0823|Trypanosoma cruzi|5693
    # epitope id|Antigen Name|antigen_id|Organism Name|organism_id
    # The ids are the IRIs with their URL prefixes stripped.
    filtered_iedb["epitope_id"] = filtered_iedb["Epitope IRI"].replace(
        "http://www.iedb.org/epitope/", "", regex=True)
    filtered_iedb["antigen_id"] = filtered_iedb["Antigen IRI"].replace(
        "http://www.ncbi.nlm.nih.gov/protein/", "", regex=True).replace(
        "https://ontology.iedb.org/ontology/", "", regex=True)
    filtered_iedb["organism_id"] = filtered_iedb["Organism IRI"].replace(
        "http://purl.obolibrary.org/obo/NCBITaxon_", "", regex=True)

    # The leading ">" is added once, when the record is written below.
    filtered_iedb["fasta_header"] = filtered_iedb.apply(
        lambda row: "{epitope_id}|{antigen_name}|{antigen_id}|{organism_name}|{organism_id}"
        .format(
            epitope_id=str(row["epitope_id"]),
            antigen_name=row["Antigen Name"],
            antigen_id=str(row["antigen_id"]),
            organism_name=row["Organism Name"],
            organism_id=str(row["organism_id"]),
        ),
        axis=1,
    )
    filtered_iedb.drop_duplicates(subset="seq", keep="last", inplace=True)

    # writes output FASTA file
    with open(output_file, "w") as fasta:
        for _, row in filtered_iedb.iterrows():
            fasta.write(
                ">{header}\n".format(header=str(row["fasta_header"])))
            fasta.write("{sequence}\n".format(sequence=str(row["seq"])))
from Bio.Alphabet import _verify_alphabet

# Directory holding the FASTA files to screen.
part_name = '/home/nastia/Desktop/fasta/'
list_fasta = os.listdir(part_name)

# (title, sequence) pairs that verify as protein but as neither
# unambiguous RNA nor unambiguous DNA.
list_finish = []
for entry in list_fasta:
    # Open each file in a context manager so its handle is closed.
    with open(part_name + entry) as fasta_handle:
        for val in Bio.SeqIO.FastaIO.SimpleFastaParser(fasta_handle):
            my_seq = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_rna)
            my_seq_1 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.unambiguous_dna)
            my_seq_2 = Bio.Seq.Seq(val[1], Bio.Alphabet.IUPAC.protein)
            if (not _verify_alphabet(my_seq)
                    and not _verify_alphabet(my_seq_1)
                    and _verify_alphabet(my_seq_2)):
                list_finish.append(val)
print(len(list_finish))
def graph_logo(
    alignment,
    columns,
    filename=None,
    dpi=None,
    edgecolor='k',
    figsize=None,
    format='pdf',
    labels=None,
    linewidth=0.,
    transparent=True,
    refidx=-1
):
    """
    Draw a sequence logo for selected alignment columns and save it to a file.

    Parameters
    ----------
    alignment : alignment object
        Must support ``len()``, iteration over records exposing ``.seq``,
        slicing, ``extend`` and column access ``alignment[:, i]``
        (presumably a Biopython MultipleSeqAlignment; confirm with callers).
        The ``.seq.alphabet`` of the iterated records is overwritten while
        the alphabet is being detected.
    columns : sequence of int
        Zero-based alignment columns to draw, in plotting order.
    filename : str, optional
        Output path. When omitted a temporary file is created with
        ``mkstemp`` and its path is used.
    dpi : optional
        Passed to ``plt.figure``.
    edgecolor : optional
        Edge colour applied to every letter glyph.
    figsize : tuple, optional
        Figure size passed to ``plt.figure``; defaults to ``(3, 3)``.
    format : str, optional
        Output format passed to ``savefig``.
    labels : sequence of str, optional
        One x-axis label per column; defaults to the 1-based column numbers.
    linewidth : float, optional
        Line width applied to every letter glyph.
    transparent : bool, optional
        Make the figure and axes backgrounds fully transparent.
    refidx : int, optional
        Index of a record to leave out of the logo; negative keeps them all.

    Returns
    -------
    str
        The path the logo was written to.

    Raises
    ------
    RuntimeError
        If the sequences validate against none of the DNA, RNA or amino
        acid alphabets.
    """
    if filename is None:
        fd, filename = mkstemp()
        close(fd)

    if figsize is None:
        figsize = (3, 3)

    if labels is None:
        labels = ['%d' % (idx + 1) for idx in columns]

    if refidx >= 0:
        # rebuild the alignment without the reference record
        msa = alignment
        alignment = msa[:refidx]
        alignment.extend(msa[refidx + 1:])

    M = len(alignment)
    N = len(columns)

    # try the candidate alphabets in order and keep the first one under
    # which every sequence validates
    alph = None
    for _alph in (_DNA_ALPHABET, _RNA_ALPHABET, _AMINO_ALPHABET):
        for r in alignment:
            r.seq.alphabet = _alph
        if all(_verify_alphabet(r.seq.upper()) for r in alignment):
            alph = _alph
            break
    if alph is None:
        raise RuntimeError("sequences with indeterminable alphabet provided")

    # one motif instance per record, made of the selected columns only
    motif = Motif(alphabet=alph)
    instances = (''.join(z).upper()
                 for z in zip(*[alignment[:, i] for i in columns]))
    for instance in instances:
        motif.add_instance(Seq(instance, alph))

    # set laplace = True to include the backgrounds
    pwm = _fix_ambigs(motif.pwm(laplace=False), alph)

    # heuristic to determine whether nucleotide or protein alphabet
    # need to use either base 4 or 20 depending
    alphlen, _alphkeys = max(((len(pwm[i]), pwm[i].keys()) for i in range(N)),
                             key=itemgetter(0))
    s, colors = (4, _DNA_COLORS) if alphlen < 20 else (20, _AMINO_COLORS)

    # index 0 is reserved for "no letter" so it tests false in `identities`
    alphkeys = ['']
    alphkeys.extend(_alphkeys)
    alphmap = dict(zip(alphkeys, range(len(alphkeys))))

    # compute the information content at each position
    maxbits = np.log2(s)
    e_n = (s - 1) / (2. * np.log(2) * M)
    R = maxbits * np.ones((N,), dtype=float)
    R -= [-sum(v * np.log2(v) for _, v in pwm[i].items() if v > 0.)
          for i in range(N)]
    R -= e_n

    # per column, stack the letters from least to most frequent
    heights = np.zeros((alphlen, N), dtype=float)
    identities = np.zeros((alphlen, N), dtype=int)
    for j in range(N):
        for i, (k, v) in enumerate(sorted(pwm[j].items(), key=itemgetter(1))):
            heights[i, j] = R[j] * v
            identities[i, j] = alphmap[k]

    font = Basefont(join(_HY454_FONT_PATHS[0], 'Roboto-Black.ttf'))

    fig = plt.figure(figsize=figsize, dpi=dpi)
    try:
        # make each column a vertical golden rect
        rect = 0.2, 0.2, 0.382 * N, 0.618
        ax = fig.add_axes(rect)
        _adjust_spines_outward(ax, ('left',), 9)
        ax.set_ylabel('bits', fontproperties=_ROBOTO_REGULAR)

        # NOTE(review): a former `if figsize is None: fig.set_figwidth(N)`
        # was removed here; it could never run because figsize has already
        # been defaulted to (3, 3) above.

        if transparent:
            fig.patch.set_alpha(0.)
            ax.patch.set_alpha(0.)

        # remove the top and right ticks; the tick lines are hidden directly
        # because the tick1On/tick2On attributes are gone from recent
        # Matplotlib releases
        for tick in ax.xaxis.get_major_ticks() + ax.yaxis.get_major_ticks():
            tick.tick2line.set_visible(False)
        # remove the bottom ticks
        for tick in ax.xaxis.get_major_ticks():
            tick.tick1line.set_visible(False)

        # rotate the x-axis labels by 45 degrees to enhance packing
        for label in ax.xaxis.get_ticklabels():
            label.set_rotation(45)

        # set font properties
        for label in ax.xaxis.get_ticklabels() + ax.yaxis.get_ticklabels():
            label.set_fontproperties(_ROBOTO_REGULAR)

        # hide every spine except the left one
        ax.spines['bottom'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.spines['right'].set_visible(False)

        def format_xlabel(x, pos=None):
            # map a tick position back to its (clamped) column label
            idx = np.clip(int(x)-1, 0, N-1)
            return labels[idx]

        ax.xaxis.set_major_formatter(FuncFormatter(format_xlabel))
        # avoid too much precision
        ax.yaxis.set_major_formatter(FormatStrFormatter('%1.1f'))

        # set the ticks; the last y tick always sits exactly on maxbits
        ysep = 0.5 if alphlen < 20 else 1.0
        yticks = np.arange(0, maxbits, ysep, dtype=float)
        if maxbits - yticks[-1] < ysep:
            yticks[-1] = maxbits
        else:
            yticks = np.append(yticks, maxbits)
        ax.set_yticks(yticks)
        ax.set_xticks(np.arange(1, N+1, dtype=float) + 0.5)

        # set the axes limits here AFTER the ticks, otherwise borkage
        ax.set_xlim((1, N+1))
        ax.set_ylim((0, maxbits))

        idxs = np.arange(1, N+1)
        bottoms = np.zeros((N,), dtype=float)
        for i in range(alphlen):
            bars = ax.bar(idxs, heights[i, :], width=1., bottom=bottoms)
            bottoms += heights[i, :]
            for j, bar in enumerate(bars):
                if identities[i, j]:
                    # swap the bar for a letter glyph occupying the same box
                    l = alphkeys[identities[i, j]]
                    glyph = font[l]
                    ax.add_patch(glyph)
                    glyph.set_transform(bar.get_transform())
                    bar.set_visible(False)
                    glyph.set_edgecolor(edgecolor)
                    glyph.set_facecolor(colors[l])
                    glyph.set_linewidth(linewidth)
                    glyph.set_zorder(-1)

        # set the remaining spine to show the maximum value
        ax.spines['left'].set_bounds(0, max(bottoms))

        fig.savefig(filename, format=format, transparent=transparent,
                    bbox_inches='tight', pad_inches=0.25)
    finally:
        # pyplot keeps a reference to every figure it creates; release it so
        # repeated calls do not accumulate open figures
        plt.close(fig)

    return filename
def count_alignment(alignment, columns='all', refidx=None, limit=100, embedded_counts = None):
    """
    Count (optionally weighted) letter occurrences per alignment column.

    Parameters
    ----------
    alignment : iterable of records
        Each record must expose ``.seq`` (with a settable ``alphabet``),
        ``.name`` and integer indexing that returns one letter. The
        iterable is traversed exactly once, so single-pass iterators and
        re-iterable containers give the same result.
    columns : sequence of int, 'all' or None, optional
        Zero-based columns to count. 'all' or None means every column of
        the first record.
    refidx : int, optional
        Zero-based position of a record to leave out of the counts.
    limit : int, optional
        Up to ``limit + 1`` leading records (the reference excluded) are
        buffered and used to detect the alphabet.
    embedded_counts : compiled pattern or pattern string, optional
        Searched in each record name; when it matches, ``float(group(1))``
        is used as the weight of that record. Records without a match, and
        all records when this is None, weigh 1.

    Returns
    -------
    counts : numpy.ndarray
        Float array of shape ``(len(letters), len(columns))``; ambiguous
        letters spread their weight evenly over the letters they stand for.
    (letters, colors) : tuple
        The row order of ``counts`` and the colour table of the alphabet.

    Raises
    ------
    RuntimeError
        If the buffered records validate against none of the alphabets.
    ValueError
        If a counted column holds a letter unknown to the alphabet.

    Notes
    -----
    The ``.seq.alphabet`` of the buffered records is overwritten during
    alphabet detection.
    """
    if embedded_counts is not None:
        import re
        # accept a plain pattern string as well as a pre-compiled pattern
        if isinstance(embedded_counts, str):
            embedded_counts = re.compile(embedded_counts)

    # one shared (index, record) stream: every record is seen once and keeps
    # its true position, whatever kind of iterable `alignment` is
    indexed = enumerate(alignment)

    records = []
    if columns is None or columns == 'all':
        first = next(indexed)
        columns = list(range(len(first[1])))
        # the first record fixes the column count but is only counted when
        # it is not the reference
        if first[0] != refidx:
            records.append(first)

    N = len(columns)

    # buffer the leading records for alphabet detection; the size is checked
    # BEFORE fetching so that no record is consumed and then thrown away
    while len(records) <= limit:
        item = next(indexed, None)
        if item is None:
            break
        if item[0] != refidx:
            records.append(item)

    alph = None
    for alph_ in (DNA_ALPHABET, RNA_ALPHABET, AMINO_ALPHABET):
        for _, r in records:
            r.seq.alphabet = alph_
        if all(_verify_alphabet(r.seq.upper()) for _, r in records):
            alph = alph_
            break
    if alph is None:
        raise RuntimeError('unknown alphabet')

    skips = _GAP
    if alph in (DNA_ALPHABET, RNA_ALPHABET):
        T = 'T' if alph == DNA_ALPHABET else 'U'
        letters = 'ACG' + T
        ambigs = {
            'M': 'AC',
            'R': 'AG',
            'W': 'A' + T,
            'S': 'CG',
            'Y': 'C' + T,
            'K': 'G' + T,
            'V': 'ACG',
            'H': 'AC' + T,
            'D': 'AG' + T,
            'B': 'CG' + T,
        }
        skips += 'N'
        colors = DNA_COLORS
    elif alph == AMINO_ALPHABET:
        letters = 'ACDEFGHIKLMNPQRSTVWY'
        ambigs = {
            'B': 'DN',
            'J': 'IL',
            'Z': 'EQ',
        }
        skips += _STOP + 'X' + 'OU'
        colors = AMINO_COLORS
    else:
        raise RuntimeError("sequences with indeterminable alphabet provided")

    # letter -> row of `counts`
    row_of = dict((ltr, i) for i, ltr in enumerate(letters))
    counts = np.zeros((len(letters), N), dtype=float)

    def allrecords():
        # buffered prefix first, then whatever is still left in the stream
        for _, rec in records:
            yield rec
        for idx, rec in indexed:
            if idx != refidx:
                yield rec

    for r in allrecords():
        # the weight depends on the record only, so compute it once per record
        weight = 1.
        if embedded_counts is not None:
            m = embedded_counts.search(r.name)
            if m is not None:
                weight = float(m.group(1))

        for j, c in enumerate(columns):
            ltr = r[c].upper()
            if ltr in skips:
                continue
            elif ltr in ambigs:
                frac = weight / len(ambigs[ltr])
                for ltr_ in ambigs[ltr]:
                    counts[row_of[ltr_], j] += frac
            elif ltr in row_of:
                counts[row_of[ltr], j] += weight
            else:
                raise ValueError('unknown letter: {0}'.format(ltr))

    return counts, (letters, colors)