def gene_feature(Y):
    """Build gene-level features for every row of ``Y``.

    For each distinct name in ``Y["Target gene"]`` the gene sequence is
    fetched with ``util.get_gene_sequence`` and four values are derived from
    it: its length, ``SeqUtil.GC``, ``Tm.Tm_staluc(seq, rna=False)`` and
    ``SeqUtil.molecular_weight(seq, "DNA")``.  Each value is written to all
    rows that target that gene.

    Returns:
        A DataFrame indexed like ``Y["Target gene"]`` with the columns
        "gene length", "gene GC content", "gene temperature" and
        "gene molecular weight".
    """
    columns = [
        "gene length",
        "gene GC content",
        "gene temperature",
        "gene molecular weight",
    ]
    gene_names = Y["Target gene"]
    features = np.zeros((gene_names.shape[0], len(columns)))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Boolean mask of the rows that target this gene.
        rows = gene_names.values == gene
        features[rows, 0] = len(seq)
        features[rows, 1] = SeqUtil.GC(seq)
        features[rows, 2] = Tm.Tm_staluc(seq, rna=False)
        features[rows, 3] = SeqUtil.molecular_weight(seq, "DNA")

    return pd.DataFrame(data=features, index=gene_names.index, columns=columns)
def run_pam_finder(target_fa, seq, PAM, abs_start_pos, chr):
    """Locate ``seq + PAM`` on both strands of ``target_fa``.

    The pattern is searched with ``SeqUtils.nt_search`` in ``target_fa`` and
    in ``revcomp(target_fa)``.  Every hit becomes one row:
    ``[chr, start, end, matched seq, matched seq + PAM, strand]`` where
    ``start``/``end`` are shifted by ``abs_start_pos``; reverse-strand hits
    are mapped back to forward-strand coordinates.

    Returns:
        A ``pd.DataFrame`` built from the rows (forward hits first, then
        reverse hits); it has no rows when nothing matches.
    """
    pattern = seq + PAM
    spacer_len = len(seq)
    site_len = spacer_len + len(PAM)
    target_len = len(target_fa)
    rev_seq = revcomp(target_fa)

    # nt_search echoes the pattern as element 0; the hit offsets follow.
    fwd_hits = SeqUtils.nt_search(target_fa, pattern)[1:]
    rev_hits = SeqUtils.nt_search(rev_seq, pattern)[1:]

    rows = [
        [
            chr,
            s + abs_start_pos,
            s + abs_start_pos + spacer_len,
            target_fa[s:(s + spacer_len)],
            target_fa[s:(s + site_len)],
            "+",
        ]
        for s in fwd_hits
    ]
    rows.extend(
        [
            chr,
            (target_len - s) + abs_start_pos - spacer_len,
            (target_len - s) + abs_start_pos,
            rev_seq[s:(s + spacer_len)],
            rev_seq[s:(s + site_len)],
            "-",
        ]
        for s in rev_hits
    )
    return pd.DataFrame(rows)
def align(seq1, seq2, debug: bool = False):
    """Align two residue sequences and return their negated leading-gap offsets.

    Both inputs are joined into one string, passed through ``seq.seq1`` and
    have every 'X' replaced by '-'.  The second flattened sequence is then
    locally aligned against the first with ``pairwise2.align.localxs`` and,
    for each sequence, the number of gaps the alignment inserted at the
    beginning is counted.

    Args:
        seq1: iterable of strings that are joined before conversion.
        seq2: iterable of strings that are joined before conversion.
        debug: when true, print the formatted alignment of the trimmed
            sequences.

    Returns:
        ``(-offset[0], -offset[1])``, the negated gap counts for ``seq1``
        and ``seq2`` respectively.
    """
    flat1 = seq.seq1(''.join(seq1)).replace('X', '-')
    flat2 = seq.seq1(''.join(seq2)).replace('X', '-')
    flats = [flat1, flat2]
    # aligning 2 to 1 seems to give better results
    align = pairwise2.align.localxs(flat2, flat1, -1000, -1000,
                                    one_alignment_only=True)
    # NOTE(review): `start` is never read again in this function.
    start = align[0][3]
    offset = [0, 0]
    # compute how many gaps had to be inserted at beginning to align
    for i in range(2):
        assert len(align[0][0]) == len(align[0][1])
        for j in range(len(align[0][0])):
            # account for the fact that 2 and 1 are switched in alignment results
            # if there is a gap in 1
            if align[0][(i + 1) % 2][j] == '-':
                # but not the other
                if flats[i][j - offset[i]] != '-':
                    offset[i] += 1
            # NOTE(review): the source arrived with its indentation lost; this
            # `else` is assumed to pair with the outer `if` (stop at the first
            # aligned column that is not a gap) -- confirm against history.
            else:
                break
    if debug:
        print(
            pairwise2.format_alignment(flat2[offset[0]:], flat1[offset[1]:],
                                       10, 0,
                                       len(flat1) - offset[1]))
    return -offset[0], -offset[1]
def create_res_df_from_bam(input_file, reference):
    """Summarise a BAM file per reference sequence.

    For every record of the FASTA file ``reference`` the reads aligned to
    the contig of the same name are fetched from ``input_file`` and the
    following values are collected: contig name, contig length, GC of the
    reference sequence, GC of the concatenated reads, number of reads and
    total number of read bases.

    The BAM file is opened once and closed on exit (it used to be opened
    twice per contig and never closed).

    Args:
        input_file: path of the BAM file (``fetch``/``count`` by contig are
            used, which pysam documents as requiring an index).
        reference: path of the reference FASTA file.

    Returns:
        A DataFrame with the columns 'species', 'chr_length', 'gc_ref',
        'gc_reads', 'read_count' and 'basecount', one row per contig.
    """
    columns = {
        'species': [],
        'chr_length': [],
        'gc_ref': [],
        'gc_reads': [],
        'read_count': [],
        'basecount': [],
    }
    with pysam.AlignmentFile(input_file, 'rb') as bam:
        for seq_record in SeqIO.parse(reference, 'fasta'):
            contig = seq_record.name
            # Concatenate all reads of the contig into one string.
            joined_reads = ''.join(
                read.query_sequence for read in bam.fetch(contig=contig))

            columns['species'].append(contig)
            columns['chr_length'].append(len(seq_record.seq))
            columns['read_count'].append(bam.count(contig=contig))
            columns['gc_ref'].append(SeqUtils.GC(seq_record.seq))
            columns['gc_reads'].append(SeqUtils.GC(joined_reads))
            columns['basecount'].append(len(joined_reads))

    return pd.DataFrame(data=columns)
def count_amplicons(in_name, fprimer, rc):
    """Count where a primer (and optionally its reverse complement) occurs.

    Every FASTQ record of ``in_name`` is searched for ``fprimer`` with
    ``SeqUtils.nt_search``.  The offset of the first hit is tallied in a
    ``Counter``.  When ``rc`` is true the reverse complement of ``fprimer``
    is searched as well and the number of bases that follow its last hit is
    tallied; reads containing both are counted too.  The tallies are printed
    and nothing is returned.

    Args:
        in_name: path of the FASTQ file.
        fprimer: primer sequence, interpreted with ``IUPAC.ambiguous_dna``.
        rc: whether to also look for the reverse complement.
    """
    forward_pattern = str(Seq(fprimer, IUPAC.ambiguous_dna))
    pre_length = Counter()
    if rc:
        post_length = Counter()
        bothfound = 0
        reverse_pattern = str(
            Seq(fprimer, IUPAC.ambiguous_dna).reverse_complement())
        reverse_len = len(reverse_pattern)

    with open(in_name, 'r') as fastq_handle:
        for record in SeqIO.parse(fastq_handle, "fastq"):
            read = str(record.seq)
            # nt_search returns the pattern followed by the hit positions.
            forward_hits = SeqUtils.nt_search(read, forward_pattern)[1:]
            if forward_hits:
                pre_length[forward_hits[0]] += 1
            if not rc:
                continue
            reverse_hits = SeqUtils.nt_search(read, reverse_pattern)[1:]
            if reverse_hits:
                tail = len(record) - reverse_hits[-1] - reverse_len
                post_length[tail] += 1
                if forward_hits:
                    bothfound += 1

    print("Primers found:", sum(pre_length.values()))
    print("Counts of pre_length:", pre_length)
    if rc:
        print("Reverse primers found:", sum(post_length.values()))
        print("Counts of post_length", post_length)
        print("Both primer and reverse_complement found:", bothfound)
def gene_feature(Y, X, learn_options):
    '''
    Build gene-level features for every row of ``Y``.

    For each distinct name in ``Y['Target gene']`` the gene sequence is
    fetched with ``util.get_gene_sequence`` and its length, ``SeqUtil.GC``,
    ``Tm.Tm_NN`` melting temperature and ``SeqUtil.molecular_weight`` (as
    'DNA') are written to all rows that target that gene.

    ``X`` and ``learn_options`` are accepted for signature compatibility and
    are not used here.

    Returns a DataFrame indexed like ``Y['Target gene']`` with the columns
    'gene length', 'gene GC content', 'gene temperature' and
    'gene molecular weight'.
    '''
    gene_names = Y['Target gene']
    n_rows = gene_names.shape[0]

    gene_length = np.zeros((n_rows, 1))
    gc_content = np.zeros((n_rows, 1))
    temperature = np.zeros((n_rows, 1))
    molecular_weight = np.zeros((n_rows, 1))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Rows of Y that target this gene (computed once per gene).
        rows = gene_names.values == gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        # Biopython's Tm_NN has no `rna` keyword (that belonged to the older
        # Tm_staluc); passing it raises TypeError.  Its documented default
        # nearest-neighbour table is a DNA one.
        # NOTE(review): assumes `Tm` is Bio.SeqUtils.MeltingTemp -- confirm.
        temperature[rows] = Tm.Tm_NN(seq)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    # Named `features` rather than `all` so the builtin is not shadowed.
    features = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    df = pandas.DataFrame(data=features,
                          index=gene_names.index,
                          columns=[
                              'gene length', 'gene GC content',
                              'gene temperature', 'gene molecular weight'
                          ])
    return df
def filtByPrimer(self, fwd_primer, rvs_primer):
    """Keep read pairs whose reads both start with their primer.

    ``self.input_forward`` and ``self.input_reverse`` are read in lockstep
    as FASTQ.  A pair is kept when the first ``SeqUtils.nt_search`` hit of
    ``fwd_primer`` in the forward read and of ``rvs_primer`` in the reverse
    read are both at position 0; kept pairs are appended to
    ``self.output_forward`` / ``self.output_reverse``.  All other pairs
    (including pairs where a primer is not found at all) are discarded.
    The number of kept and discarded pairs is printed.

    The output files are opened once in append mode instead of once per
    kept pair; as a consequence they are created even when no pair is kept.
    """
    count_keep = 0
    count_discard = 0
    with open(self.input_forward) as fh, \
            open(self.input_reverse) as rh, \
            open(self.output_forward, 'a') as ofh, \
            open(self.output_reverse, 'a') as orh:
        pairs = zip(FastqGeneralIterator(fh), FastqGeneralIterator(rh))
        for (title_f, seq_f, qual_f), (title_r, seq_r, qual_r) in pairs:
            # nt_search returns [pattern, hit, hit, ...]; an empty slice
            # means the primer does not occur in the read.
            fwd_hits = SeqUtils.nt_search(seq_f, fwd_primer)[1:]
            keep = bool(fwd_hits) and fwd_hits[0] == 0
            if keep:
                rvs_hits = SeqUtils.nt_search(seq_r, rvs_primer)[1:]
                keep = bool(rvs_hits) and rvs_hits[0] == 0

            if not keep:
                count_discard += 1
                continue

            ofh.write('@' + '\n'.join([title_f, seq_f, '+', qual_f]) + '\n')
            orh.write('@' + '\n'.join([title_r, seq_r, '+', qual_r]) + '\n')
            count_keep += 1

    print(' Number of reads saved: ' + str(count_keep))
    print(' Number of reads discard :' + str(count_discard))
def search_motif(sequence):
    """Find protospacer coordinates next to every PAM hit on both strands.

    The PAM (``args.pam``) is searched with ``SeqUtils.nt_search`` in
    ``sequence.seq`` and in its reverse complement.  For each hit the
    protospacer is the ``args.length_protospacer`` bases that precede the
    PAM on the searched strand.  Reverse-strand hits are converted to
    forward-strand coordinates.

    A hit is kept only when the whole protospacer lies inside the sequence.
    The bounds are inclusive: a protospacer starting at position 0 or ending
    exactly at the last base fits and is reported (the previous strict
    comparisons silently dropped those two cases).

    Args:
        sequence: object exposing ``.seq`` (with ``reverse_complement()``).

    Returns:
        ``(coordinates_fw, coordinates_rv)``: two lists of ``[start, end]``
        pairs (0-based start, end = position of the PAM-side boundary).
    """
    motif = str(args.pam)
    len_protospacer = int(args.length_protospacer)
    len_dna = len(sequence.seq)

    # nt_search returns [motif, hit, hit, ...] with 0-based hit positions.
    # Forward strand: the protospacer ends where the PAM starts.
    coordinates_fw = []
    for pam_start in SeqUtils.nt_search(str(sequence.seq), motif)[1:]:
        start = pam_start - len_protospacer
        if start >= 0:
            coordinates_fw.append([start, pam_start])

    # Reverse strand: search the reverse complement, then mirror the
    # coordinates back onto the forward strand.
    reverse_seq = str(sequence.seq.reverse_complement())
    coordinates_rv = []
    for pam_start in SeqUtils.nt_search(reverse_seq, motif)[1:]:
        start = len_dna - pam_start
        end = len_dna - (pam_start - len_protospacer)
        if start >= 0 and end <= len_dna:
            coordinates_rv.append([start, end])

    # Return a tuple of lists for fw and rv matches
    return coordinates_fw, coordinates_rv
def compute_stats(seq):
    """Return a fresh ``SeqStats`` object describing ``seq``.

    The returned object carries ``length`` (``len(seq)``), ``gc``
    (``SeqUtils.GC(seq)``) and ``weight`` (``SeqUtils.molecular_weight(seq)``
    or ``None`` when that call raises ``ValueError``).

    A new instance is created for every call.  Previously the attributes
    were set on the ``SeqStats`` class itself, so all callers shared one
    object and each call overwrote the results of the previous one.
    """
    # NOTE(review): assumes SeqStats can be constructed without arguments.
    stats = SeqStats()
    stats.length = len(seq)
    stats.gc = SeqUtils.GC(seq)
    try:
        stats.weight = SeqUtils.molecular_weight(seq)
    except ValueError:
        stats.weight = None
    return stats
def target_genes_stats(
        genes=(
            'HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12', 'CCDC101'
        )):
    """Print length, GC content, melting temperature and weight per gene.

    For each name in ``genes`` the sequence is looked up with
    ``get_gene_sequence``; genes for which ``None`` is returned are skipped.
    One tab-separated line is printed per remaining gene.

    Args:
        genes: iterable of gene names.  The default is an immutable tuple so
            it cannot be modified between calls.
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        if seq is not None:
            print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (
                gene, len(seq), SeqUtil.GC(seq), Tm.Tm_staluc(
                    seq, rna=False), SeqUtil.molecular_weight(seq, 'DNA')))
def var1_to_var3(var1):
    """Rewrite a variant so both amino acids are passed through ``seq3``.

    The reference residue, new residue and position are extracted with
    ``variant.get_refAA`` / ``get_newAA`` / ``get_pos``.  Each residue is
    converted with ``SeqUtils.seq3`` unless it is "*", which is kept as is.

    Returns:
        The string ``<ref><pos><new>`` built from the converted parts.
    """
    def to_three(residue):
        # "*" is passed through unchanged instead of being converted.
        if residue == "*":
            return "*"
        return SeqUtils.seq3(residue)

    ref_one = variant.get_refAA(var1)
    new_one = variant.get_newAA(var1)
    position = variant.get_pos(var1)

    ref_three = to_three(ref_one)
    new_three = to_three(new_one)
    return ''.join([ref_three, str(position), new_three])
def var3_to_var1(var3):
    """Rewrite a variant so both amino acids are passed through ``seq1``.

    The reference residue, new residue and position are extracted with
    ``variant.get_refAA`` / ``get_newAA`` / ``get_pos``.  Each residue is
    converted with ``SeqUtils.seq1`` unless it is "*", which is kept as is.

    Returns:
        The string ``<ref><pos><new>`` built from the converted parts.
    """
    def to_one(residue):
        # "*" is passed through unchanged instead of being converted.
        if residue == "*":
            return "*"
        return SeqUtils.seq1(residue)

    ref_three = variant.get_refAA(var3)
    new_three = variant.get_newAA(var3)
    position = variant.get_pos(var3)

    ref_one = to_one(ref_three)
    new_one = to_one(new_three)
    return ''.join([ref_one, str(position), new_one])
def computeGCContent(seq_file=None, sequence=None):
    """Compute the GC content of a sequence or of a FASTA file.

    Exactly one of the two arguments must be given.  ``SeqUtils.GC123`` is
    used instead of ``GC`` to cope with dashes; only its first element is
    used.  For a file, the value is the mean over all sequences in it.

    Args:
        seq_file: path (or handle) of a FASTA file.
        sequence: a single sequence.

    Returns:
        The GC content in %.

    Raises:
        Exception: if both or neither of the arguments are specified.
        ValueError: if ``seq_file`` contains no sequence (this used to
            surface as a ZeroDivisionError).
    """
    if (seq_file is None) == (sequence is None):
        raise Exception("Error in computeGCContent: Either seq_file or sequence must be specified")

    if sequence is not None:
        return SeqUtils.GC123(sequence)[0]

    gc_contents = [SeqUtils.GC123(s.seq)[0]
                   for s in SeqIO.parse(seq_file, format="fasta")]
    if not gc_contents:
        raise ValueError(
            "Error in computeGCContent: no sequences found in %s" % seq_file)
    if len(gc_contents) > 1:
        logging.debug("gc_content is averaged over all sequences found in %s", seq_file)
    return sum(gc_contents) / len(gc_contents)
def get_distances(res_pairs, get_coords):
    '''
    Get distances for all pairs of residues between two chains

    res_pairs: iterable over tuples ((res_a, res_b), ...)
    get_coords: function to get residue coordinates, forwarded to
                distances.calc_residue_distance

    Returns a list of 5-tuples, in this order:
    [(resn_a, resn_b, dist, aa_a, aa_b), ...]
    where resn_* is res.id[1] and aa_* is SeqUtils.seq1(res.resname).
    '''
    rows = []
    for res_a, res_b in res_pairs:
        resn_a = res_a.id[1]
        resn_b = res_b.id[1]
        dist = distances.calc_residue_distance(res_a, res_b, get_coords)
        aa_a = SeqUtils.seq1(res_a.resname)
        aa_b = SeqUtils.seq1(res_b.resname)
        rows.append((resn_a, resn_b, dist, aa_a, aa_b))
    return rows
def writePBS():
    """Annotate ``newRecord`` with features for every primer variation found.

    Works entirely on module-level state: it iterates
    ``featureStatistic_container[feature]``, searches ``record.seq`` and its
    reverse complement, and appends ``SeqFeature`` objects to
    ``newRecord.features``.  It takes no arguments and returns nothing.

    For each variation only the tail of the primer (from index
    ``len(primerSeq) - 15`` onwards) is searched with
    ``SeqUtils.nt_search``.  For every hit, the full primer is compared with
    the slice of the searched string that starts ``difference`` characters
    before the hit:

    * equal  -> a feature with an exact end position is added;
    * differ -> a feature whose end is wrapped in ``AfterPosition`` is added.

    Hits in ``record.seq`` get ``strand=1``; hits in the reverse complement
    get ``strand=-1``.  Each feature carries the variation's ``note`` in
    ``qualifiers['note']``.
    """
    # The loop variable and several temporaries are deliberately declared
    # global, so they remain visible at module level after the call.
    global variation, seqRecordToCheck, seqRecordToCheckComplement, difference, newFeature
    for variation in featureStatistic_container[feature]:
        primerSeq = str(variation.seq)
        primerName = variation.note
        # NOTE(review): for primers shorter than 15 characters the start
        # index is negative, so this still slices from the end of the string.
        partialPrimerSeq = primerSeq[len(primerSeq) - 15::]
        seqRecordToCheck = str(record.seq)
        seqRecordToCheckComplement = str(reverse_complement(record.seq))

        # --- hits in record.seq (strand=1) ---
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheck, partialPrimerSeq)
        if (len(matchingPrimerPositions) > 1):
            # Number of primer characters in front of the searched tail.
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            # Index 0 is skipped; the hit positions start at index 1.
            for j in range(1, length):
                if primerSeq == seqRecordToCheck[matchingPrimerPositions[j] - difference:
                                                 matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    # Only the tail matched: the end position is wrapped in
                    # AfterPosition instead of being given as an exact value.
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)

        # --- hits in the reverse complement (strand=-1) ---
        # NOTE(review): the source arrived with its indentation lost; this
        # second search is assumed to run for every variation, independent of
        # whether the first search had hits -- confirm.
        matchingPrimerPositions = SeqUtils.nt_search(seqRecordToCheckComplement, partialPrimerSeq)
        if (len(matchingPrimerPositions) > 1):
            difference = len(primerSeq) - len(partialPrimerSeq)
            length = len(matchingPrimerPositions)
            for j in range(1, length):
                if primerSeq == seqRecordToCheckComplement[matchingPrimerPositions[j] - difference:
                                                           matchingPrimerPositions[j] - difference + len(primerSeq)]:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j],
                                                            matchingPrimerPositions[j] + len(primerSeq),
                                                            strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
                else:
                    newFeature = SeqFeature(FeatureLocation(matchingPrimerPositions[j], AfterPosition(
                        matchingPrimerPositions[j] + len(primerSeq)), strand=-1), type=str(feature))
                    newFeature.qualifiers['note'] = primerName
                    newRecord.features.append(newFeature)
def get_gen_stats(gbk_list):
    """Compute GC content, coding density and total length per GenBank file.

    NOTE: for now, the coding density does not take overlapping genes into
    account.  Depending on how many of them are present in a genome, this
    may overestimate the coding density, because every feature is counted
    separately (so a shared region is counted several times).

    Features of type CDS, tmRNA, rRNA, ncRNA and tRNA are counted unless
    they carry a "pseudo" qualifier.

    Args:
        gbk_list: iterable of GenBank file names.

    Returns:
        A dict mapping the file name with ".gbk" removed to
        ``(length-weighted GC, coding density, total length)``.
    """
    counted_types = ("CDS", "tmRNA", "rRNA", "ncRNA", "tRNA")
    stats_by_file = {}

    for gbk_file in gbk_list:
        total_len = 0
        weighted_gc = 0
        coding_len = 0

        for record in SeqIO.parse(gbk_file, "genbank"):
            record_len = len(record)
            total_len += record_len
            weighted_gc += SeqUtils.GC(record.seq) * record_len

            for feat in record.features:
                if feat.type not in counted_types:
                    continue
                if "pseudo" in feat.qualifiers:
                    continue
                # Iterating over .parts also covers compound locations.
                for part in feat.location.parts:
                    coding_len += part.end - part.start

        key = gbk_file.replace(".gbk", "")
        stats_by_file[key] = (
            float(weighted_gc) / total_len,
            float(coding_len) / total_len,
            total_len,
        )

    return stats_by_file
def _find_iseq(self,
               seq: Seq,
               iseq_str: str,
               iseq_id: str = "integrated sequence") -> int:
    """Return the position of the single occurrence of ``iseq_str`` in ``seq``.

    Args:
        seq: Sequence to search.
        iseq_str: The subsequence to look for.
        iseq_id (optional): Name of the subsequence used in error messages.
            Defaults to "integrated sequence".

    Returns:
        int: The position reported by ``SeqUtils.nt_search`` for the match.

    Raises:
        PartException: If ``iseq_str`` does not occur in ``seq`` or occurs
            more than once.
    """
    # nt_search returns the pattern first; the hit positions follow.
    hits = SeqUtils.nt_search(str(seq), iseq_str)[1:]
    if not hits:
        raise PartException(f"{self.id} lacks {iseq_id}")
    if len(hits) > 1:
        raise PartException(f"{self.id} contains multiple {iseq_id}")
    return hits[0]
def my_own_filtering(input_file, output_file, filt_gc=45, filt_arom=0.01):
    """Filter FASTA records by GC content and aromaticity, write proteins.

    Every record of ``input_file`` is translated; a record is kept when
    ``SeqUtils.GC`` of its nucleotide sequence is at least ``filt_gc`` and
    the aromaticity of its translation (``ProteinAnalysis.aromaticity``) is
    at least ``filt_arom``.  The translations of the kept records are
    written as FASTA to ``output_file`` and the kept fraction is printed.

    Fixes over the previous version:
      * ``record.se`` (AttributeError on the first kept record) is now
        ``record.seq``;
      * the result is written to ``output_file`` instead of the hard-coded
        name 'my_fasta';
      * the output file is closed even if writing fails;
      * an input without records prints 0.0 instead of dividing by zero.

    Args:
        input_file: path of the nucleotide FASTA file.
        output_file: path of the protein FASTA file to write.
        filt_gc: minimum GC content, on the scale returned by ``SeqUtils.GC``.
        filt_arom: minimum aromaticity.
    """
    total = 0
    # Keyed by record id, so a repeated id keeps only its last record.
    translated = {}
    with open(input_file, "r") as content:
        for record in SeqIO.parse(content, "fasta"):
            total += 1
            calc_gc = SeqUtils.GC(record.seq)
            prot_seq = record.seq.translate()
            calc_arom = ProteinAnalysis(str(prot_seq)).aromaticity()
            if calc_gc >= filt_gc and calc_arom >= filt_arom:
                translated[record.id] = prot_seq

    # write a new fasta file with aminoacids
    records = [
        SeqRecord(prot_seq, id=seq_id, description="")
        for seq_id, prot_seq in translated.items()
    ]
    with open(output_file, 'w') as write_file:
        SeqIO.write(records, write_file, 'fasta')

    # print the kept fraction (kept records / all records)
    print(len(records) / total if total else 0.0)
def main():
    """Main application body.

    Loads the genome and annotation files, reads the motif list from
    'najafabadi_table_s1_2013.csv', and for every gene prints its id
    followed by the motifs that ``SeqUtils.nt_search`` finds in the
    sequence returned by ``get_3utr_seq``.
    """
    # Genome sequence and annotations
    genome_source = 'http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/fasta/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like_Genome.fasta'
    annotation_source = 'http://tritrypdb.org/common/downloads/release-6.0/TcruziCLBrenerEsmeraldo-like/gff/data/TriTrypDB-6.0_TcruziCLBrenerEsmeraldo-like.gff'
    genome = load_file(genome_source)
    annotations = load_file(annotation_source)

    # 3'UTR motifs from the supplementary table in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    chromosomes = load_fasta(genome)
    genes = get_utr_coords(annotations, utr_length=500)

    for gene in genes:
        utr_seq = get_3utr_seq(chromosomes, gene)
        # nt_search returns the motif followed by its hit positions, so a
        # result longer than one element means the motif is present.
        utr3_motifs = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)) > 1
        ]
        print("%s: %s" % (gene['id'], ", ".join(utr3_motifs)))
def gdps(str_list):
    """Pair each string's GC value with a running x coordinate.

    The k-th string (counting from 0) is paired with ``50 + 100 * k``.

    Returns:
        A list of ``(x, SeqUtils.GC(string))`` tuples in input order.
    """
    return [
        (50 + 100 * index, SeqUtils.GC(strng))
        for index, strng in enumerate(str_list)
    ]
def get33original_seq(self, aa_code_string):
    """Map a string of three-letter residue codes back to standard codes.

    The input is read three characters at a time:

    * three gap characters (``self.gap_char``) are kept as is;
    * ``self.unknown3`` is kept as is;
    * a code found in ``Raf.to_one_letter_code`` is converted to its
      one-letter code and back with ``SeqUtils.seq3``; if the result is in
      ``self.blankseq3`` it becomes ``self.unknown3``;
    * anything else becomes ``self.unknown3``.

    A ``self.separator`` that directly follows a code is copied to the
    output.  When ``self.upper == 1`` the input is upper-cased first; the
    result is always upper-cased.

    ``in`` replaces ``dict.has_key`` (which no longer exists in Python 3)
    and the result is assembled with ``join`` instead of repeated string
    concatenation.

    Returns:
        The converted code string.
    """
    if self.upper == 1:
        aa_code_string = aa_code_string.upper()

    gap3 = 3 * self.gap_char
    len_sep = len(self.separator)
    pieces = []
    i = 0
    while i < len(aa_code_string):
        aa_code = aa_code_string[i:i + 3]
        if aa_code == gap3:
            aa_code_original = gap3
        elif aa_code == self.unknown3:
            aa_code_original = self.unknown3
        elif aa_code in Raf.to_one_letter_code:
            aa_code_original = SeqUtils.seq3(Raf.to_one_letter_code[aa_code])
            if aa_code_original in self.blankseq3:
                aa_code_original = self.unknown3
        else:
            aa_code_original = self.unknown3
        pieces.append(aa_code_original)
        i += 3
        # Copy a separator that directly follows the code just consumed.
        if aa_code_string[i:i + len_sep] == self.separator:
            i += len_sep
            pieces.append(self.separator)

    return "".join(pieces).upper()
def generate_wide_table(all_fastas):
    """Write one row per sequence with the counts of every possible k-mer.

    Reads the module-level ``args`` (``kmer_length``, ``append``) and writes
    tab-separated text to the module-level ``output_handle``.  Unless
    ``args.append`` is set, a header row is written first.  Each data row
    holds the file path, the record description, the sequence length,
    ``SeqUtils.GC`` of the record, and one count per k-mer over A/T/G/C
    (k-mers in sorted order; absent k-mers are reported as 0).

    Fixes over the previous version:
      * Python 3 ``print(..., file=...)`` instead of ``print >>``;
      * the per-record key is a tuple (a list cannot be a dict key);
      * the window loop now includes the last k-mer of every sequence
        (``len(seq) - k + 1`` windows instead of ``len(seq) - k``).

    Args:
        all_fastas: iterable of FASTA file paths.
    """
    global args, output_handle
    k = args.kmer_length
    all_kmers = sorted("".join(x) for x in itertools.product("ATGC", repeat=k))

    records_to_kmer = {}
    for f in all_fastas:
        logger.debug("Processing file %s", f)
        for record in SeqIO.parse(f, "fasta", generic_dna):
            logger.debug("Processing sequence %s", record.description)
            seq = str(record.seq)
            # Path, description, length plus additional sequence features.
            fasta_keys = (
                f,
                record.description,
                str(len(seq)),
                str(SeqUtils.GC(record.seq)),
            )
            counts = collections.defaultdict(int)
            for start in range(len(seq) - k + 1):
                counts[seq[start:start + k]] += 1
            records_to_kmer[fasta_keys] = counts

    if not args.append:
        header = ["path", "sequence_description", "sequence_length", "GC"]
        print("\t".join(header + all_kmers), file=output_handle)

    for keys, kmer_values in records_to_kmer.items():
        all_values = list(keys)
        all_values.extend(str(kmer_values.get(x, 0)) for x in all_kmers)
        print("\t".join(all_values), file=output_handle)
def tbl_format(bed4_rrna, bed4_cds, bed4_trna):
    """
    Render BED4-like annotations as tbl-format entries.

    tbl format:
    ---
    >refname # once
    ---
    for each term, a 2-line annotation:
    start\tend\ttype\n\t\t\tkey\tvalue\n
    ---
    tRNA and rRNA are written once, a CDS is written as a gene entry
    followed by a CDS entry (with transl_table 5).

    :param bed4_rrna: list of (chrom, start, end, name) rRNA records
    :param bed4_cds: list of (chrom, start, end, name) CDS records
    :param bed4_trna: list of (chrom, start, end, name) tRNA records
    :return: list of formatted entries in sorted record order, or an error
             string when the first records of the three lists do not share
             the same reference name
    """
    # sanity check: all three inputs must refer to the same reference
    references = (bed4_rrna[0][0], bed4_cds[0][0], bed4_trna[0][0])
    if not (references[0] == references[1] == references[2]):
        return "Error, annotations not from the same reference!"

    # Later assignments win, so a name present in several inputs ends up
    # with the type of the last list that contains it (rRNA < tRNA < CDS).
    type_dict = {}
    for kind, records in (("rRNA", bed4_rrna), ("tRNA", bed4_trna),
                          ("CDS", bed4_cds)):
        for record in records:
            type_dict[record[3]] = kind

    single_entry = "{start}\t{end}\t{type}\n\t\t\t{key}\t{value}\n"
    double_entry = "{start}\t{end}\t{type}\n\t\t\t{key1}\t{value1}\n\t\t\t{key2}\t{value2}\n"

    out_l = []
    for chro, start, end, anno in sorted(bed4_rrna + bed4_cds + bed4_trna):
        kind = type_dict[anno]
        if kind == "tRNA":
            product = "tRNA-" + str(SeqUtils.seq3(anno))
            entry = single_entry.format(
                start=start, end=end, type="tRNA", key="product", value=product)
        elif kind == "rRNA":
            entry = single_entry.format(
                start=start, end=end, type="rRNA", key="product", value=anno)
        elif kind == "CDS":
            gene_part = single_entry.format(
                start=start, end=end, type="gene", key="gene", value=anno)
            cds_part = double_entry.format(
                start=start, end=end, type="CDS",
                key1="product", value1=anno,
                key2="transl_table", value2=5)
            entry = "".join([gene_part, cds_part])
        out_l.append(entry)
    return out_l
def get_sequences(pdb_id, chain=None):
    '''Gets the sequences in a PDB file.

    For every chain of ``get_structure(pdb_id)`` (or only the chain whose id
    equals ``chain`` when it is given), the residue names of all residues
    that have a 'CA' child are joined and converted with ``SeqUtils.seq1``.

    Returns a list with one converted string per selected chain.
    '''
    sequences = []
    for chn in get_structure(pdb_id).get_chains():
        if chain is None or chain == chn.get_id():
            resnames = [
                residue.get_resname()
                for residue in chn
                if 'CA' in residue.child_dict
            ]
            sequences.append(SeqUtils.seq1(''.join(resnames)))
    return sequences
def plot(unmappeddict, unmap_stats, out):
    """
    Generates boxplot, distribution plots and join plots from the
    missing regions summary statistics. They are saved in the output
    directory as jpg images.

    Four files are written into ``out``: 'gc_length_joint_missing.jpg',
    'gc_content_missing.jpg', 'length_missing.jpg' and
    'codons_missing.jpg'.

    ``sns.jointplot`` is called with ``x=``/``y=`` keywords: these names are
    accepted by every seaborn release, whereas newer releases reject
    positional data arguments.

    Parameters
    ----------
    unmappeddict: dict
        Dictionary of the coordinates and sequences of the unmapped regions
        (only the values are used here).
    unmap_stats: dataframe
        Table containing the unmapped regions summary statistics; columns
        3 to 23 (``iloc[:, 3:24]``) are drawn in the boxplot.
    out: str
        Output directory
    """
    sequences = list(unmappeddict.values())
    gc_content = [SeqUtils.GC(values) for values in sequences]
    regions_length = [len(values) for values in sequences]

    # Joint distribution of region length and GC content.
    plt.figure(figsize=(10, 10))
    sns.set(style='white', font_scale=2)
    fig_joint = sns.jointplot(x=regions_length, y=gc_content, kind='hex',
                              height=7)
    fig_joint.set_axis_labels(xlabel='Length', ylabel='GC Content')
    fig_joint.savefig(os.path.join(out, 'gc_length_joint_missing.jpg'))
    plt.clf()

    # Distribution of GC content.
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.3)
    fig_gc = sns.distplot(gc_content, hist=True, rug=False, color='red')
    fig_gc.set(xlabel='GC Content')
    fig_gc.set_title('Distribution of GC Content')
    sns.despine()
    save = fig_gc.get_figure()
    save.savefig(os.path.join(out, 'gc_content_missing.jpg'))
    plt.clf()

    # Distribution of region length.
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.3)
    fig_length = sns.distplot(regions_length, hist=True, rug=False,
                              color='green')
    fig_length.set(xlabel='Length')
    fig_length.set_title('Distribution of Length')
    sns.despine()
    save = fig_length.get_figure()
    save.savefig(os.path.join(out, 'length_missing.jpg'))
    plt.clf()

    # Boxplot of the per-codon statistics columns.
    plt.figure(figsize=(15, 10))
    sns.set(style='white', font_scale=1.2)
    ax = sns.boxplot(data=unmap_stats.iloc[:, 3:24], palette='Spectral')
    ax.set_xlabel('Translated Codons')
    ax.set_ylabel('Mean Percentage per Frame (%)')
    sns.despine()
    save = ax.get_figure()
    save.savefig(os.path.join(out, 'codons_missing.jpg'))
    plt.clf()
def get_Seq_ORF_features(file_path, input_file, model):
    """Collect sequence and ORF features for every record of a FASTA file.

    Per record (keyed by the lower-cased record id) the features are: the
    sequence length, ``SeqUtils.GC``, the values returned by ``PP.param``
    and ``leng.len_cov``, and the 30 values returned by ``CTD``.  Then
    ``feamodule/cpat.py`` below ``file_path`` is run through the shell on
    ``input_file`` and the columns 2, 3 and 4 of 'temp_cpat.txt.dat' are
    added as 'ORF', 'fickett' and 'hexamer'.  The temporary file is removed
    afterwards.

    Args:
        file_path: directory that contains 'feamodule/cpat.py'.
        input_file: FASTA file to analyse.
        model: key into the module-level ``model_reference`` table; element
            1 of the entry is passed to cpat.py with ``-x``.

    Returns:
        ``(features_dict, seq_id, transcript_sequences)``: the feature dict
        of dicts, the lower-cased ids in file order and the sequences in
        file order.
    """
    seq_id = []
    transcript_sequences = []
    features_dict = {}

    for record in SeqIO.parse(input_file, "fasta"):
        name = record.id.lower()
        seq = record.seq
        seq_id.append(name)
        transcript_sequences.append(seq)

        features = {}
        features_dict[name] = features
        features["length"] = len(record.seq)
        features["G+C"] = SeqUtils.GC(record.seq)

        insta_fe, PI_fe, gra_fe = PP.param(seq)
        Len, Cov, inte_fe = leng.len_cov(seq)
        features.update({
            "ORF-integrity": inte_fe,
            "ORF-coverage": Cov,
            "Instability": insta_fe,
            "PI": PI_fe,
            "Gravy": gra_fe,
        })

        (A, T, G, C, AT, AG, AC, TG, TC, GC,
         A0, A1, A2, A3, A4, T0, T1, T2, T3, T4,
         G0, G1, G2, G3, G4, C0, C1, C2, C3, C4) = CTD(seq)
        features.update({
            'A': A, 'T': T, 'G': G, 'C': C,
            'AT': AT, 'AG': AG, 'AC': AC, 'TG': TG, 'TC': TC, 'GC': GC,
            'A0': A0, 'A1': A1, 'A2': A2, 'A3': A3, 'A4': A4,
            'T0': T0, 'T1': T1, 'T2': T2, 'T3': T3, 'T4': T4,
            'G0': G0, 'G1': G1, 'G2': G2, 'G3': G3, 'G4': G4,
            'C0': C0, 'C1': C1, 'C2': C2, 'C3': C3, 'C4': C4,
        })

    # Use cpat to get fickett, hexamer and ORF values.
    cpat_command = ("python3 " + file_path + "/feamodule/cpat.py -g " +
                    input_file + " -o temp_cpat.txt -x " +
                    model_reference[model][1])
    os.system(cpat_command)

    with open("temp_cpat.txt.dat", "r") as tabular:
        for row in csv.reader(tabular, delimiter=("\t")):
            entry = features_dict[row[0].lower()]
            entry["ORF"] = float(row[2])
            entry["fickett"] = float(row[3])
            entry["hexamer"] = float(row[4])

    os.system("rm temp_cpat.txt.dat")
    return features_dict, seq_id, transcript_sequences
def main():
    """Flag contigs whose GC content deviates from the mean.

    Reads the contigs of ``args["fna"]``, computes each contig's GC content
    (rounded to 2 decimals) and its absolute difference from the mean GC,
    and writes the ids of the contigs whose difference exceeds
    ``args["cutoff"]`` to ``<tmp_dir>/flagged_contigs``, one per line.
    """
    args = fetch_args()
    utility.add_tmp_dir(args)
    utility.check_input(args)

    print("\n## Computing mean contig GC content")
    contigs = {}
    for contig_id, seq in utility.parse_fasta(args["fna"]):
        entry = Contig()
        entry.id = contig_id
        entry.seq = str(seq)
        entry.gc = round(SeqUtils.GC(seq), 2)
        contigs[contig_id] = entry
    mean = np.mean([entry.gc for entry in contigs.values()])

    print("\n## Computing per-contig deviation from mean")
    for entry in contigs.values():
        entry.values = {"delta": abs(entry.gc - mean)}

    print("\n## Identifying outlier contigs")
    cutoff_exceeded = [
        entry.id for entry in contigs.values()
        if entry.values["delta"] > args["cutoff"]
    ]
    out = f"{args['tmp_dir']}/flagged_contigs"
    print(f" {len(cutoff_exceeded)} flagged contigs: {out}")
    with open(out, "w") as f:
        for flagged_id in cutoff_exceeded:
            f.write(flagged_id + "\n")
def get_stats_from_contigs(contigs_fasta):
    """
    Use BioPython parser and GC calculator to get contig lengths and
    GCs from contigs fasta

    Returns a DataFrame indexed by 'contig' (the first whitespace-separated
    token of each FASTA title) with the columns 'length' and 'GC'.
    """
    table = {'contig': [], 'length': [], 'GC': []}

    # SimpleFastaParser yields plain (title, sequence) string pairs
    # (this is 2-3 times faster than SeqIO.parse, and only marginally
    # slower than a custom built parser).
    with open(contigs_fasta, 'r') as CF:
        for title, sequence in SeqIO.FastaIO.SimpleFastaParser(CF):
            # The contig name is everything before the first whitespace.
            table['contig'].append(title.split(None, 1)[0])
            table['length'].append(len(sequence))
            table['GC'].append(SeqUtils.GC(sequence))

    return pandas.DataFrame(table).set_index('contig')
def parseSeqRecordForOligo(record, oligo):
    '''Parse SeqRecord for oligo and return True if found and False if not.'''
    # nt_search returns the pattern followed by the match positions, so a
    # result with more than one item means at least one match was found.
    hits = SeqUtils.nt_search(str(record.seq), oligo)
    return len(hits) > 1
def gene_feature(Y, X, learn_options):
    '''
    Build gene-level features for every row of ``Y``.

    For each distinct name in ``Y['Target gene']`` the gene sequence is
    fetched with ``util.get_gene_sequence`` and its length, ``SeqUtil.GC``,
    ``Tm.Tm_staluc(seq, rna=False)`` and ``SeqUtil.molecular_weight`` (as
    'DNA') are written to all rows that target that gene.

    ``X`` and ``learn_options`` are not used by this function.

    Returns a DataFrame indexed like ``Y['Target gene']`` with the columns
    'gene length', 'gene GC content', 'gene temperature' and
    'gene molecular weight'.
    '''
    gene_names = Y['Target gene']
    column_shape = (gene_names.shape[0], 1)

    lengths = np.zeros(column_shape)
    gc_values = np.zeros(column_shape)
    temperatures = np.zeros(column_shape)
    weights = np.zeros(column_shape)

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Rows of Y that target this gene.
        selected = gene_names.values == gene
        lengths[selected] = len(seq)
        gc_values[selected] = SeqUtil.GC(seq)
        temperatures[selected] = Tm.Tm_staluc(seq, rna=False)
        weights[selected] = SeqUtil.molecular_weight(seq, 'DNA')

    stacked = np.concatenate(
        (lengths, gc_values, temperatures, weights), axis=1)
    return pandas.DataFrame(
        data=stacked,
        index=gene_names.index,
        columns=['gene length',
                 'gene GC content',
                 'gene temperature',
                 'gene molecular weight'])
def candidates_for_seq(seq, descriptor, GC_requirement=(0, 100)):
    """Collect the spacer candidates that follow each PAM in ``seq``.

    ``seq`` is scanned left to right for the module-level ``PAM_SEQ``.  For
    every occurrence, the ``SPACER_LENGTH`` bases directly after the PAM are
    the candidate target.  Scanning stops at the first PAM whose target
    would run past the end of ``seq``.  Candidates whose ``SeqUtils.GC``
    lies outside ``GC_requirement`` are skipped.

    The search now resumes from an offset (``seq.find(PAM_SEQ, start)``)
    instead of re-slicing ``seq`` on every step, and the default
    ``GC_requirement`` is an immutable tuple.

    Args:
        seq: sequence to scan; must support ``find`` with a start offset,
            slicing and ``len``.
        descriptor: prefix of the candidate names.
        GC_requirement: ``(minimum, maximum)`` GC bounds, both inclusive.

    Returns:
        A list of dicts with the keys 'name' (descriptor + target start),
        'seqrec' (a ``SeqRecord`` of the target) and 'location' (target
        start in ``seq``).
    """
    gc_min = GC_requirement[0]
    gc_max = GC_requirement[1]
    pam_len = len(PAM_SEQ)
    seq_len = len(seq)

    candidates = []
    search_from = 0
    while search_from < seq_len:
        pam_pos = seq.find(PAM_SEQ, search_from)
        if pam_pos == -1:
            break
        target_start = pam_pos + pam_len
        target_end = target_start + SPACER_LENGTH
        if target_end > seq_len:
            # Every later PAM would overrun the sequence as well.
            break
        # The next search starts one base after this PAM.
        search_from = pam_pos + 1

        targetSeq = seq[target_start:target_end]
        GC_content = SeqUtils.GC(targetSeq)
        if GC_content < gc_min or GC_content > gc_max:
            continue

        name = descriptor + str(target_start)
        target = SeqRecord(targetSeq, id=name, name=name, description=name)
        candidates.append({
            'name': target.id,
            'seqrec': target,
            'location': target_start,
        })
    return candidates
def __init__(self, file, fastaRecord):
    """Store the statistics of one FASTA record.

    Sets ``file``, ``length`` (length of the record's sequence),
    ``description``, ``gc`` (``SeqUtils.GC``) and ``crc32``
    (``CheckSum.crc32``) on the instance.

    Args:
        file: stored unchanged as ``self.file``.
        fastaRecord: record exposing ``.seq`` and ``.description``.
    """
    super(SequenceStat, self).__init__()
    self.file = file
    self.length = len(fastaRecord.seq)
    self.description = fastaRecord.description
    # Both derived values are computed from the record's sequence.
    for attribute, compute in (("gc", SeqUtils.GC), ("crc32", CheckSum.crc32)):
        setattr(self, attribute, compute(fastaRecord.seq))
def generate_long_table(all_fastas):
    """Write one row per (sequence, k-mer) pair with its count.

    Reads the module-level ``args`` (``kmer_length``, ``append``, ``star``)
    and writes tab-separated text to the module-level ``output_handle``.
    Unless ``args.append`` is set, a header row is written first.  Each data
    row holds the file path, the record description, the sequence length,
    ``SeqUtils.GC`` of the record, the k-mer and its count.  With
    ``args.star`` every third position of a k-mer (indices 2, 5, 8, ...) is
    replaced by "*" before counting.

    Fixes over the previous version:
      * Python 3 ``print(..., file=...)`` instead of ``print >>``;
      * the window loop now includes the last k-mer of every sequence
        (``len(seq) - k + 1`` windows instead of ``len(seq) - k``);
      * the masking loop no longer reuses the window index name.

    Args:
        all_fastas: iterable of FASTA file paths.
    """
    global args, output_handle
    k = args.kmer_length

    if not args.append:
        print("\t".join([
            "path", "sequence_description", "sequence_length", "GC", "kmer",
            "count"
        ]), file=output_handle)

    for f in all_fastas:
        logger.debug("Processing file %s", f)
        for record in SeqIO.parse(f, "fasta", generic_dna):
            kmer_count = collections.defaultdict(int)
            logger.debug("Processing sequence %s", record.description)
            seq = str(record.seq)
            # Path, description, length plus additional sequence features.
            fasta_keys = [
                f,
                record.description,
                str(len(seq)),
                str(SeqUtils.GC(record.seq)),
            ]
            for start in range(len(seq) - k + 1):
                kmer = seq[start:start + k]
                if args.star:
                    masked = list(kmer)
                    for pos in range(2, k, 3):
                        masked[pos] = "*"
                    kmer = "".join(masked)
                kmer_count[kmer] += 1
            for kmer, count in kmer_count.items():
                print("\t".join(fasta_keys + [kmer, str(count)]),
                      file=output_handle)
def extractPDBdata(structure, adjustChains, substitutionData, verbose):
    """Map residue sequence numbers to substitution values per chain.

    For every chain whose id is in ``adjustChains`` the residues are walked
    in order while a cursor (``residueID``) moves through
    ``substitutionData[chainID]``.  Each entry of that list is indexed with
    ``[0]`` (compared with the residue's ``SeqUtils.seq1`` name) and ``[1]``
    (the value).  A residue is consumed only when its converted name equals
    the name at the cursor and its hetero flag is ' '; the value is stored
    under the residue's sequence number unless it is "-".

    Args:
        structure: iterable of models, each an iterable of chains.
        adjustChains: container of chain ids to process.
        substitutionData: mapping chain id -> sequence of (name, value)
            entries -- presumably in chain order; verify against caller.
        verbose: when true, print a line for stored values.

    Returns:
        ``pdbData``: dict chain id -> {sequence number: value}.
    """
    print('Extracting atoms details from PDB...')
    pdbData = {}
    for model in structure:
        for chain in model:
            chainID = chain.get_id()
            if chainID in adjustChains:
                # Re-created for every model, so the data of the last model
                # containing this chain is what is returned.
                pdbData[chainID] = {}
                residueID = 0  # cursor into substitutionData[chainID]
                for residue in chain:
                    residueName = SeqUtils.seq1(residue.get_resname())
                    # Residues that do not match the expected name are
                    # skipped without advancing the cursor.
                    if residueName != substitutionData[chainID][residueID][0]:
                        continue
                    (heteroFlag, sequenceID, insertionCode) = residue.get_id()
                    if heteroFlag != ' ':
                        continue
                    value = substitutionData[chainID][residueID][1]
                    if value != "-":
                        pdbData[chainID][sequenceID] = value
                        # NOTE(review): the source arrived with its
                        # indentation lost; the verbose print is assumed to
                        # belong to this branch -- confirm against history.
                        if verbose:
                            print("Chain: " + chainID + "\t residue: " +
                                  residueName + " " + str(sequenceID) +
                                  "\t value: " + value)
                    residueID += 1
                    # Stop once every substitution entry has been consumed.
                    if (residueID >= len(substitutionData[chainID])):
                        break
    print('OK')
    return pdbData
def GC_content(fasta_file):
    """Print the mean GC value of a FASTA file and its first entry's name.

    The file is read with ``SeqUtils.quick_FASTA_reader``; ``SeqUtils.GC``
    is computed from element 1 of every entry.  The mean and element 0 of
    the first entry are printed separated by a tab.

    Uses the Python 3 ``print`` function (the previous print statement is a
    syntax error there).

    Raises:
        ValueError: if the file contains no sequences (this used to surface
            as a ZeroDivisionError).
    """
    sequences = SeqUtils.quick_FASTA_reader(fasta_file)
    if not sequences:
        raise ValueError("no sequences found in %s" % fasta_file)
    GCs = [SeqUtils.GC(k[1]) for k in sequences]
    print(str(float(sum(GCs)) / len(GCs)) + '\t' + sequences[0][0])
def get_distances(res_pairs, get_coords):
    '''
    Get distances for all pairs of residues between two chains

    res_pairs: iterable over tuples ((res_a, res_b), ...)
    get_coords: function to get residue coordinates, forwarded to
                distances.calc_residue_distance

    Returns a list of 5-tuples, in this order:
    [(resn_a, resn_b, dist, aa_a, aa_b), ...]
    where resn_* is res.id[1] and aa_* is SeqUtils.seq1(res.resname).
    '''
    def describe(res_a, res_b):
        # One result row for a single residue pair.
        return (
            res_a.id[1],
            res_b.id[1],
            distances.calc_residue_distance(res_a, res_b, get_coords),
            SeqUtils.seq1(res_a.resname),
            SeqUtils.seq1(res_b.resname),
        )

    results = []
    for pair in res_pairs:
        first, second = pair
        results.append(describe(first, second))
    return results
def target_genes_stats(genes=("HPRT1", "TADA1", "NF2", "TADA2B", "NF1", "CUL3", "MED12", "CCDC101")):
    """Print length, GC content, melting temperature and weight per gene.

    For each name in ``genes`` the sequence is looked up with
    ``get_gene_sequence``; genes for which ``None`` is returned are skipped.
    One tab-separated line is printed per remaining gene.

    Args:
        genes: iterable of gene names.  The default is an immutable tuple so
            it cannot be modified between calls.
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        if seq is None:
            continue
        print("%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f" % (
            gene,
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, "DNA"),
        ))
def get_dihedral(residue_list):
    '''
    returns phi and psi angles of a residue and the amino acid sidechain present

    residue_list - []Bio.PDB.Residue - list of 3 *hopefully* continuous residues

    Returns ( [phi, psi], res_name ) where res_name is the seq1-converted
    name of the middle residue.

    Raises BackboneError when consecutive residue numbers do not differ by
    exactly 1, when the middle residue is not accepted by is_aa, or when a
    required backbone atom is missing.
    '''
    # Consecutive residues must be numbered consecutively.
    for previous, current in zip(residue_list[:-1], residue_list[1:]):
        current_number = current.get_id()[1]
        if (current_number - previous.get_id()[1]) != 1:
            raise BackboneError("Discontinuous residues", current.get_id()[1])

    # Backbone atoms needed from the previous, central and next residue;
    # a value stays False until the atom has been seen.
    atoms = (
        {"C": False},
        {"N": False, "CA": False, "C": False},
        {"N": False},
    )

    for index, residue in enumerate(residue_list):
        if index == 1:
            res_name = SeqUtils.seq1(residue.get_resname())
            if not is_aa(res_name):
                raise BackboneError("Not a valid amino acid",
                                    residue.get_id()[1])
        for atom in residue.get_unpacked_list():
            if atom.name in atoms[index]:
                atoms[index][atom.name] = atom.get_vector()

    if False in (check_dict(needed) for needed in atoms):
        raise BackboneError("Missing backbone atoms", residue.get_id()[1])

    previous_c = atoms[0]["C"]
    central = atoms[1]
    next_n = atoms[2]["N"]
    phi = PDB.calc_dihedral(previous_c, central["N"], central["CA"],
                            central["C"])
    psi = PDB.calc_dihedral(central["N"], central["CA"], central["C"],
                            next_n)
    return ([phi, psi], res_name)
def main():
    """Main application body.

    Loads the genome and annotations named on the command line, reads the
    motif list from 'najafabadi_table_s1_2013.csv', and writes one CSV row
    per gene to ``args.output``: the gene id followed by every motif that
    ``SeqUtils.nt_search`` finds in the sequence returned by
    ``get_3utr_seq``.
    """
    # Parse command-line arguments
    args = parse_args()

    # Genome sequence and annotations
    genome = load_file(args.input_genome)
    annotations = load_file(args.input_annotations)

    # 3'UTR motifs from the supplementary table in Najafabadi et al. (2013)
    motifs = load_motifs('najafabadi_table_s1_2013.csv')

    chromosomes = load_fasta(genome)
    genes = get_utr_coords(annotations, utr_length=args.utr_length)

    rows = []
    num_genes = len(genes)
    for position, gene in enumerate(genes, start=1):
        utr_seq = get_3utr_seq(chromosomes, gene)
        print('Processing gene %d/%d' % (position, num_genes))

        # nt_search returns the motif followed by its hit positions, so a
        # result longer than one element means the motif is present.
        found = [
            motif for motif in motifs
            if len(SeqUtils.nt_search(utr_seq, motif)) > 1
        ]
        rows.append([gene['id']] + found)

    # output results
    with open(args.output, 'w') as output_file:
        csv.writer(output_file).writerows(rows)
def SeqUtilFeatures(data):
    '''
    Compute the molecular weight of every sequence in data['30mer'].

    data - object whose '30mer' entry exposes `.values` (an array of
           sequences); every sequence must be exactly 30 long.

    Returns a pandas.DataFrame with one row per sequence and a single
    column (labelled 0) holding SeqUtil.molecular_weight of that sequence.
    The frame uses a default integer index, not the index of `data`.

    Raises AssertionError if a sequence is not 30 long.
    '''
    sequence = data['30mer'].values
    num_features = 1
    featarray = np.ones((sequence.shape[0], num_features))
    for i, seq in enumerate(sequence):
        assert len(seq) == 30, "seems to assume 30mer"
        featarray[i, 0] = SeqUtil.molecular_weight(str(seq))
    # A single DataFrame wrap is enough; wrapping twice only made a copy.
    return pandas.DataFrame(featarray)
def annotate_primer(primer_name, primer_seq, primer_direction, genome):
    """
    Append a 'misc_feature' marking the first occurrence of a primer to
    genome.features.

    primer_name      - text appended to PRIMER_ANNOTATION_PREFIX to form the label
    primer_seq       - sequence, or a SeqRecord whose .seq is used
    primer_direction - strand of the feature; -1 means the reverse complement
                       of primer_seq is what is searched for in the genome
    genome           - record with .seq and a mutable .features list

    Raises IndexError if the primer does not occur in the genome.
    """
    # isinstance also accepts SeqRecord subclasses, which `type(x) ==` missed.
    if isinstance(primer_seq, SeqRecord):
        primer_seq = primer_seq.seq
    if primer_direction == -1:
        primer_seq = primer_seq.reverse_complement()
    primer_label = PRIMER_ANNOTATION_PREFIX + primer_name

    # First element of the nt_search result is the expanded pattern; match
    # positions follow it.
    search_result = SeqUtils.nt_search(str(genome.seq), str(primer_seq))
    if len(search_result) < 2:
        # Same exception type as the old bare index failure, but explained.
        raise IndexError(
            'primer %r was not found in the genome sequence' % (primer_name,))
    primer_genome_loc_start = search_result[1]

    primer_genome_loc = FeatureLocation(
        primer_genome_loc_start,
        primer_genome_loc_start + len(primer_seq))
    primer_feature = SeqFeature(
        location=primer_genome_loc,
        type='misc_feature',
        strand=primer_direction,
        qualifiers={'label': [primer_label]})
    genome.features.append(primer_feature)
def digest(enzyme, sequence, outfile, count):
    """
    Write two tab-separated cut-site lines (one per strand) for every
    occurrence of an enzyme's recognition site in a sequence.

    enzyme   - indexable: [0] name, [1] recognition sequence, [2] cut offset
               within the site (anything int() accepts)
    sequence - record with .id (str) and .seq
    outfile  - object with a write() method
    count    - running cut number, used in the "cut-<count>" label

    Each line is: id, position, position, enzyme name, cut label, strand.
    Returns count advanced by one for every match written.
    """
    # First item returned by nt_search is the expanded pattern, not a match.
    positions = SeqUtils.nt_search(str(sequence.seq).upper(), enzyme[1])[1:]
    if not positions:
        return count

    site_length = len(enzyme[1])
    sense_offset = int(enzyme[2])
    antisense_offset = int(site_length - sense_offset)
    # If the cut site is past the halfway point of the site, the antisense
    # cut has the smaller coordinate and goes first to keep the output sorted.
    # Floor division keeps the integer comparison identical on Python 2 and 3.
    antisense_first = site_length // 2 < sense_offset

    for match in positions:
        # str() instead of backtick-repr: valid on Python 3 and never adds an
        # 'L' suffix for Python 2 longs.
        sense_pos = str(int(match) + sense_offset)
        antisense_pos = str(int(match) + antisense_offset)
        label = "cut-" + str(count)
        line1 = "\t".join(
            [sequence.id, sense_pos, sense_pos, enzyme[0], label, "+"]) + "\n"
        line2 = "\t".join(
            [sequence.id, antisense_pos, antisense_pos, enzyme[0], label, "-"]) + "\n"
        if antisense_first:
            outfile.write(line2 + line1)
        else:
            outfile.write(line1 + line2)
        count += 1
    return count
def Chain_to_SeqRecord(chain):
    '''
    Build a SeqRecord from a Chain entity.

    chain: a Bio.PDB.Chain object

    Only the residues yielded by get_nonhet_residues(chain) are used.

    Returns a SeqRecord whose sequence is the one-letter residue names, whose
    id is chain.id and whose letter_annotations['resnum'] lists the residue
    numbers (res.id[1]) in the same order.
    '''
    residues = list(get_nonhet_residues(chain))
    one_letter_codes = ''.join(
        SeqUtils.seq1(residue.get_resname()) for residue in residues)
    residue_numbers = [residue.id[1] for residue in residues]
    return SeqRecord.SeqRecord(
        Seq.Seq(one_letter_codes),
        id = chain.id,
        letter_annotations = {"resnum": residue_numbers})
def createdb():
    """
    Fetch three nucleotide records from Entrez, derive per-record fields and
    store them in a new SQLite table.

    For each record the first 20 bases are kept, reverse-complemented, and
    both strands are searched for 'GG'; the stringified nt_search results are
    stored alongside the accession label and a short description.

    Relies on module-level names: sqlite_file, table_name2, new_field,
    field_type, the *_column names and column_type2..column_type7. Rows are
    inserted into the table literally named Gather_data.
    """
    gis = [100753385, 100689306, 100751648]
    accession = []
    description = []
    sequence = []

    request = Entrez.epost("nucleotide", id=",".join(map(str, gis)))
    result = Entrez.read(request)
    webEnv = result["WebEnv"]
    queryKey = result["QueryKey"]
    handle = Entrez.efetch(db="nucleotide", retmode="xml",
                           webenv=webEnv, query_key=queryKey)

    for r in Entrez.parse(handle):
        # Grab the GI number; a record without a usable "gi|..." id gets None
        # instead of aborting the whole run.
        try:
            gi = int([x for x in r['GBSeq_other-seqids'] if "gi" in x][0].split("|")[1])
        except (ValueError, IndexError):
            gi = None
        summary = (" " + r["GBSeq_primary-accession"] + " " +
                   r["GBSeq_definition"] + "\n" + r["GBSeq_sequence"][0:20])
        tokens = summary.split()
        accession.append(">GI" + str(gi))
        # accession plus the first two words of the definition
        description.append(' '.join(tokens[0:3]))
        # last whitespace-separated token is the 20-base sequence prefix
        sequence.append(tokens[-1].upper())

    alt_map = {'ins': '0'}
    complement = {'A': 'T', 'G': 'C', 'T': 'A', 'C': 'G'}

    def reverse_complement(seq):
        """Reverse-complement seq, shielding 'ins' markers from reversal."""
        for k, v in alt_map.items():
            seq = seq.replace(k, v)
        bases = ''.join(reversed([complement.get(base, base) for base in seq]))
        for k, v in alt_map.items():
            bases = bases.replace(v, k)
        return bases

    complementary_sequence = [reverse_complement(seq) for seq in sequence]

    # Positions of 'GG' on each strand, stored as the printed nt_search result
    pattern = 'GG'
    exon = [str(SeqUtils.nt_search(seq, pattern)) for seq in sequence]
    comp_exon = [str(SeqUtils.nt_search(seq, pattern))
                 for seq in complementary_sequence]

    extra_columns = (
        (id_column, column_type2),
        (description_column, column_type3),
        (seq_column, column_type4),
        (comp_seq_column, column_type5),
        (PAM_column1, column_type6),
        (PAM_column2, column_type7),
    )
    # One row per fetched record. Building rows by position fixes the earlier
    # copy-paste slip where rows 2 and 3 reused complementary_sequence[0] and
    # row 3 reused description[1].
    rows = [
        (number,) + fields
        for number, fields in enumerate(
            zip(accession, description, sequence,
                complementary_sequence, exon, comp_exon),
            start=1)
    ]

    conn = sqlite3.connect(sqlite_file)
    try:
        c = conn.cursor()
        # Identifiers cannot be bound as SQL parameters; they come from
        # module-level constants, not from user input.
        c.execute('CREATE TABLE {tn} ({nf} {ft} PRIMARY KEY)'
                  .format(tn=table_name2, nf=new_field, ft=field_type))
        for column_name, column_type in extra_columns:
            c.execute("ALTER TABLE {tn} ADD COLUMN '{cn}' {ct}"
                      .format(tn=table_name2, cn=column_name, ct=column_type))
        c.executemany(
            '''INSERT INTO Gather_data(No, gi_accession, description, sequence,Complementary_sequence,PAMsites_exons,PAMsites_complementary)
                  VALUES(?,?,?,?,?,?,?)''', rows)
        conn.commit()
    finally:
        # Always release the database handle, even when a statement fails.
        conn.close()
def FindSpacersInterval(chromPos, chromStartRG, chromEndRG, seqStr, cutoff_spacing, referenceGenomeForDAS,spacerLength, distanceToCutSiteFromPAM_bp):
    """
    Find spacers next to a PAM on both strands of a genomic interval.

    The interval sequence comes from coords2fa(chromPos, chromStartRG,
    chromEndRG, referenceGenomeForDAS) and is upper-cased. The PAM given by
    seqStr is searched on the plus strand; its reverse complement is searched
    as well unless the PAM equals its own reverse complement. On each strand
    a hit is kept only if its start lies more than cutoff_spacing after the
    start of the previously kept hit on that strand.

    Reads the module-level name PAMside: 3 selects the branch commented as
    Cas9-like, any other value the branch commented as Cpf1-like.

    Returns (listSpacer, spacerNumTotal, meanDist):
      listSpacer     - one list per kept spacer: [number within strand,
                       strand, start in sequence, end in sequence, chromPos,
                       start in genome, end in genome, cut site in genome,
                       distance from previous kept spacer, spacer string,
                       exact PAM, GC content]
      spacerNumTotal - number of kept spacers over both strands
      meanDist       - mean of the recorded distances, NaN if none were kept
    """
    from Bio import SeqFeature

    if PAMside == 3:
        # For SpCas9 (as an example): spacerLength = 20bp ;
        # distanceToCutSiteFromPAM_bp = 3bp; distanceToCutSiteFrom5pEnd = 17bp
        distanceToCutSiteFrom5pEnd = spacerLength - distanceToCutSiteFromPAM_bp
    else:
        # For AsCpf1 (as an example): spacerLength = 20bp ;
        # distanceToCutSiteFromPAM_bp = 19bp; distanceToCutSiteFrom5pEnd = 18bp
        distanceToCutSiteFrom5pEnd = distanceToCutSiteFromPAM_bp - 1

    s = coords2fa(chromPos, chromStartRG, chromEndRG, referenceGenomeForDAS)
    s = s.upper()
    PAM = Seq(seqStr, IUPAC.ambiguous_dna)
    PAM_length = len(seqStr)

    # A PAM equal to its own reverse complement needs only one search.
    DoRevComp = seqStr != str(PAM.reverse_complement())

    listSpacer = []
    listDistBetweenSpacers = []

    # ---- plus strand ----
    spacerNum = 0
    prevStartLocInRefSeq = -9999
    if PAMside == 3:
        gbStringForSearch = s[spacerLength:]   # Cas9
    else:
        gbStringForSearch = s[:-spacerLength]  # Cpf1, get all but last ~20 bases of sequence
    spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM))
    if len(spacerInds) > 1:  # matches found
        del spacerInds[0]  # first result from nt_search is regexp expansion
        print("Plus strand sgRNAs found: {num}".format(num=len(spacerInds)))
        for item in spacerInds:
            startPos = SeqFeature.ExactPosition(item)  # start and end pos of PAM
            endPos = SeqFeature.ExactPosition(item + PAM_length)
            if PAMside == 3:  # Cas9-like
                startLocInRefSeq = startPos + 1
                endLocInRefSeq = startLocInRefSeq + spacerLength - 1
            else:  # Cpf1-like
                startLocInRefSeq = endPos  # Starts immediately after PAM
                endLocInRefSeq = startLocInRefSeq + spacerLength
            startLocInRefGenome = chromStartRG + startLocInRefSeq
            endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
            cutSiteInRefGenome = startLocInRefGenome + distanceToCutSiteFrom5pEnd

            # Only add the spacer if it is a certain distance from the previous spacer
            if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                spacerNum += 1
                strand = "+"
                if spacerNum > 1:
                    distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                else:
                    distFromPrevSpacer = 0
                # Python slices: second index is first char you *DON'T* want
                if PAMside == 3:
                    spacerAsStr = str(s[startLocInRefSeq-1:endLocInRefSeq])
                    exactPAM = s[endLocInRefSeq:endLocInRefSeq+PAM_length]
                else:
                    spacerAsStr = str(s[startLocInRefSeq:endLocInRefSeq])
                    exactPAM = s[startLocInRefSeq-PAM_length:startLocInRefSeq]
                GCcontent = SeqUtils.GC(spacerAsStr)
                listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                            chromPos, startLocInRefGenome, endLocInRefGenome,
                            cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                            exactPAM, GCcontent]
                listSpacer.append(listItem)
                listDistBetweenSpacers.append(float(distFromPrevSpacer))
                # NOTE(review): spacing is measured from the last *kept*
                # spacer, not from the last PAM hit -- confirm this is intended.
                prevStartLocInRefSeq = startLocInRefSeq
        print("Plus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing, num=spacerNum))
    spacerNumTotal = spacerNum

    # ---- minus strand: search the reverse complement of the PAM ----
    prevStartLocInRefSeq = -9999
    spacerNum = 0
    if DoRevComp:
        if PAMside == 3:
            gbStringForSearch = s[:-spacerLength]  # get all but last ~20 bases of sequence
        else:
            gbStringForSearch = s[spacerLength:]
        spacerInds = SeqUtils.nt_search(gbStringForSearch, str(PAM.reverse_complement()))
        if len(spacerInds) > 1:  # matches found
            del spacerInds[0]  # first result from nt_search is regexp expansion
            print("Minus strand sgRNAs found: {num}".format(num=len(spacerInds)))
            for item in spacerInds:
                startPos = SeqFeature.ExactPosition(item)
                endPos = SeqFeature.ExactPosition(item + PAM_length)
                # Start and end locations are flipped here due to reverse strand
                if PAMside == 3:
                    endLocInRefSeq = endPos + 1
                    startLocInRefSeq = endLocInRefSeq + spacerLength - 1
                else:
                    # startLocInRefSeq is 5' end of spacer on PAM-containing strand
                    # endLocInRefSeq is 3' end of spacer on PAM-containing strand
                    # Hence endLocInRefSeq < startLocInRefSeq since this is reverse strand
                    # spacerLength is the offset between gbStringForSearch and s
                    startLocInRefSeq = startPos + spacerLength
                    endLocInRefSeq = startLocInRefSeq - spacerLength + 1
                startLocInRefGenome = chromStartRG + startLocInRefSeq - 1
                endLocInRefGenome = chromStartRG + endLocInRefSeq - 1
                cutSiteInRefGenome = startLocInRefGenome - distanceToCutSiteFrom5pEnd

                # Only add the spacer if it is a certain distance from the previous spacer
                if (startLocInRefSeq - prevStartLocInRefSeq) > cutoff_spacing:
                    spacerNum += 1
                    strand = "-"
                    if spacerNum > 1:
                        distFromPrevSpacer = startLocInRefSeq - prevStartLocInRefSeq
                    else:
                        distFromPrevSpacer = 0
                    # The spacer is extracted the same way for both PAM sides;
                    # only the PAM location differs.
                    spacerRC = Seq(str(s[endLocInRefSeq-1:startLocInRefSeq]), IUPAC.ambiguous_dna)
                    spacerAsStr = str(spacerRC.reverse_complement())
                    if PAMside == 3:  # Cas9-like
                        pamOnPlus = s[endLocInRefSeq-(PAM_length+1):endLocInRefSeq-1]
                    else:  # Cpf1-like
                        pamOnPlus = s[startLocInRefSeq:startLocInRefSeq+PAM_length]
                    exactPAM = str(Seq(str(pamOnPlus), IUPAC.ambiguous_dna).reverse_complement())
                    GCcontent = SeqUtils.GC(spacerAsStr)
                    listItem = [spacerNum, strand, startLocInRefSeq, endLocInRefSeq,
                                chromPos, startLocInRefGenome, endLocInRefGenome,
                                cutSiteInRefGenome, distFromPrevSpacer, spacerAsStr,
                                exactPAM, GCcontent]
                    listSpacer.append(listItem)
                    listDistBetweenSpacers.append(float(distFromPrevSpacer))
                    prevStartLocInRefSeq = startLocInRefSeq
            print("Minus strand sgRNAs included after minimum spacing (> {limit}bp between sgRNAs): {num}".format(limit=cutoff_spacing, num=spacerNum))
    spacerNumTotal = spacerNumTotal + spacerNum

    if listDistBetweenSpacers:
        meanDist = np.mean(np.asarray(listDistBetweenSpacers))
    else:
        # np.mean of an empty array is NaN plus a RuntimeWarning; return the
        # same value without the warning.
        meanDist = np.nan
    return (listSpacer, spacerNumTotal, meanDist)
def target_genes_stats(genes=('HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12', 'CCDC101')):
    """
    Print length, GC content, melting temperature and DNA molecular weight
    for each named gene.

    genes - iterable of gene names passed to get_gene_sequence; genes for
            which it returns None are skipped silently.
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        if seq is None:
            continue
        # Single-argument print call: valid on both Python 2 and Python 3.
        print('%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (
            gene,
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, 'DNA'),
        ))
# Copy one randomly chosen record, annotate it via the writeFeature /
# writeSTF / writePBS helpers, and write it out as GenBank.
for randomRec in range(1,2):
    # randrange(len(...)) yields a valid 0-based index. The previous
    # randint(1, len(records)) was inclusive at both ends, so it could index
    # one past the end and could never pick the first record.
    # NOTE(review): assumes `records` is a 0-indexed sequence -- confirm.
    record = records[random.randrange(len(records))]
    newRecord = SeqRecord(record.seq)

    # writing Header
    newRecord.seq.alphabet = generic_dna
    newRecord.id = record.id
    newRecord.name = record.name
    newRecord.description = record.description

    recordSeq = str(record.seq)
    for feature in featureStatistic_container:
        if feature not in ["PBS", "STF"]:
            for variation in featureStatistic_container[feature]:
                # The helpers take only `strand`; presumably they read
                # `occurrence` and the other names set here from the
                # enclosing scope -- verify against their definitions.
                featureSeq = str(variation.seq)
                occurrence = SeqUtils.nt_search(recordSeq, featureSeq)
                writeFeature(strand=1)

                # NOTE(review): this searches the complement, not the
                # reverse complement, for strand -1 -- confirm intended.
                featureSeqComplement = str(variation.seq.complement())
                occurrence = SeqUtils.nt_search(recordSeq, featureSeqComplement)
                writeFeature(strand=-1)
        else:
            if(feature == "STF"):
                writeSTF()
            if(feature == "PBS"):
                writePBS()
    SeqIO.write(newRecord, output_handle, "genbank")
def clean(self):
    '''
    Clean data, adding an unsaved InchwormAssembly model ('assembly') and a
    list of stages ('stages') to self.cleaned_data

    Along the way this saves one annealable-sequence Gene per stage, the
    genome-context Gene and every Stage. Raises forms.ValidationError when a
    stage lacks an RBS or CDS feature, when the pathway features are not
    contiguous, or when the flanking genome context is shorter than the
    homology arm lengths.
    '''
    cleaned_data = super(PathwayForm, self).clean()

    # Don't do anything if some fields are missing
    required_fields = ('file', 'rbs_annotation_type', 'cds_annotation_type')
    if any(field not in cleaned_data for field in required_fields):
        return cleaned_data

    def validate_contiguity(features):
        # Each feature must start exactly where its predecessor ends.
        for left, right in zip(features, features[1:]):
            if left.location.end != right.location.start:
                raise forms.ValidationError(
                    'Features {} (of type {}) and {} (of type {}) must be contiguous.'.format(
                        left.qualifiers['label'][0],
                        left.type,
                        right.qualifiers['label'][0],
                        right.type,
                    ))

    record = SeqIO.read(cleaned_data['file'], 'genbank')

    # Index features by (label, type); a later duplicate replaces an earlier one.
    feature_dict = {}
    for feature in record.features:
        feature_dict[(feature.qualifiers['label'][0], feature.type)] = feature

    # Make sure all the required features are present; the resulting list
    # alternates RBS, CDS, RBS, CDS, ... in stage order.
    pathway_features = []
    for stage_name in self.stage_names:
        rbs_key = (stage_name, cleaned_data['rbs_annotation_type'])
        cds_key = (stage_name, cleaned_data['cds_annotation_type'])
        try:
            pathway_features.append(feature_dict[rbs_key])
            pathway_features.append(feature_dict[cds_key])
        except KeyError as e:
            raise forms.ValidationError(
                'Stage {} has no feature of type {}.'.format(*e.args[0]))

    # Make sure all the features are contiguous
    validate_contiguity(pathway_features)

    # Save all the annealable sequences
    annealable_seqs = []
    for i, stage_name in enumerate(self.stage_names):
        cds_feature = pathway_features[2*i + 1]
        annealable_seq = None

        for sequence_context in self.sequence_contexts:
            annealable_seq_name = '{} from {}'.format(
                stage_name, sequence_context['name'])
            sequence_context['file'].seek(0)
            context_record = SeqIO.read(sequence_context['file'], 'genbank')

            # Forward strand first; the reverse strand is only searched when
            # the forward search finds nothing.
            for strand in (1, -1):
                query = cds_feature.extract(record).seq
                if strand == -1:
                    query = query.reverse_complement()
                hits = SeqUtils.nt_search(str(context_record.seq), str(query))
                if len(hits) > 1:
                    annealable_seq = Gene(
                        file=sequence_context['file'],
                        start=hits[1] + 1,
                        end=hits[1] + len(cds_feature),
                        strand=strand,
                        name=annealable_seq_name,
                        type=Gene.ANNEALABLE_SEQ,
                    )
                    annealable_seq.save()
                    break
            if annealable_seq is not None:
                break

        if annealable_seq is None:
            # No sequence context matched, so do non-nested PCR directly off
            # the coding sequence
            seq_file = ContentFile('')
            annealable_seq = Gene(
                file=seq_file,
                start=1,
                end=len(cds_feature),
                strand=1,
                name=stage_name,
                type=Gene.ANNEALABLE_SEQ,
            )
            annealable_seq.save()
            seq_record = cds_feature.extract(record)
            seq_record.id = ''
            seq_record.name = ''
            SeqIO.write(seq_record, seq_file, 'genbank')
            annealable_seq.file.save(stage_name, seq_file)

        annealable_seqs.append(annealable_seq)

    # Save the genome: everything before the first pathway feature joined to
    # everything after the last one.
    pathway_start = pathway_features[0].location.start
    pathway_end = pathway_features[-1].location.end
    upstream = record[:pathway_start]
    if len(upstream) < self.fwd_ha_len:
        raise forms.ValidationError(
            '5’ genome context must be at least {} bp long.'.format(
                self.fwd_ha_len))
    downstream = record[pathway_end:]
    if len(downstream) < self.rev_ha_len:
        raise forms.ValidationError(
            '3’ genome context must be at least {} bp long.'.format(
                self.rev_ha_len))

    genome_record = upstream + downstream
    genome_record.name = 'genome'
    genome_file = ContentFile('')
    genome = Gene(
        file=genome_file,
        start=pathway_start + 1,
        end=pathway_start,
        strand=1,
        name='Genome context',
    )
    genome.save()
    SeqIO.write(genome_record, genome_file, 'genbank')
    genome.file.save('genome', genome_file)

    # Save the stages
    cleaned_data['stages'] = []
    for i, stage_name in enumerate(self.stage_names):
        rbs_feature = pathway_features[2*i]
        stage = Stage(
            degeneracy=str(rbs_feature.extract(record).seq),
            annealable_seq = annealable_seqs[i],
            selection_cassette=self.selection_cassettes[i],
            name=stage_name,
        )
        cleaned_data['stages'].append(stage)
        stage.save()

    # Build (but do not save) the InchwormAssembly object
    assembly = InchwormAssembly(
        genome=genome,
        enzyme=self.enzyme,
        library_size=self.library_size,
        dna_required=self.dna_required,
        fwd_ha_len=self.fwd_ha_len,
        rev_ha_len=self.rev_ha_len,
    )
    cleaned_data['assembly'] = assembly

    return cleaned_data
def get_context_data(self, **kwargs):
    """
    Build the template context for the assembly output page.

    Annotates every entry of self.object.output in place (primer names,
    phenotype and, when library sizes are available, DNA required) and adds
    to the context: 'output', 'primers', 'gg_pcr_details' (unique Golden
    Gate PCRs), 'gg_details' (fragments per Golden Gate reaction),
    'round2_pcr_details' and 'transformation_details'.

    Raises AssertionError if a Golden Gate primer's annealing sequence is
    not found in its template.
    """
    output = self.object.output
    primers = self.object.primers
    library_sizes = self.get_library_sizes()

    primer_names_by_sequence = {
        sequence: name for name, sequence in primers}

    def primer_name(primer):
        return primer_names_by_sequence[str(primer.full_seq().seq)]

    for i, stage_output in enumerate(output):
        stage_output['gg_primer_names'] = [
            (primer_name(primer1), primer_name(primer2))
            for primer1, primer2 in stage_output['gg'].primers
        ]
        stage_output['integration_primer_names'] = [
            primer_name(primer)
            for primer in stage_output['insert'].generate_primers()
        ]
        stage_output['phenotype'] = \
            self.object.stages.order_by('pk')[i].selection_cassette.phenotype
        if library_sizes:
            stage_output['dna_required'] = \
                library_sizes[i] * self.object.dna_required

    # Compile unique Golden Gate PCR reactions for the tabular view
    gg_pcrs_by_primers_and_template = dict()
    gg_pcr_details = []
    for i, stage_output in enumerate(output):
        for j in range(3):
            search_template = str(
                stage_output['gg'].genes[j].subrecord().seq.upper())
            # A real list, not map(): on Python 3 a map object cannot be
            # concatenated with a list.
            primer_names = [
                primer_name(primer)
                for primer in stage_output['gg'].primers[j]
            ]
            primer_names_and_template = tuple(
                primer_names + [search_template])
            if primer_names_and_template in gg_pcrs_by_primers_and_template:
                # Same primers on the same template: already listed.
                continue

            # Get length of PCR product
            primer1 = stage_output['gg'].primers[j][0]
            primer2 = stage_output['gg'].primers[j][1]
            forward_search_result = SeqUtils.nt_search(
                search_template,
                primer1.anneal_seq().upper(),
            )
            reverse_search_result = SeqUtils.nt_search(
                search_template,
                primer2.anneal_seq().reverse_complement().upper(),
            )
            assert len(forward_search_result) > 1 and \
                len(reverse_search_result) > 1

            # Get name of template
            stage = self.get_object().stages.order_by('pk')[i]
            if j == 0:
                template_name = stage.annealable_seq.name
            elif j == 1:
                template_name = stage.selection_cassette.name
            else:
                template_name = 'Genome'

            # Get primer Tm
            forward_tm = recombineering.utils.Tm(
                str(primer1.anneal_seq().seq))
            reverse_tm = recombineering.utils.Tm(
                str(primer2.anneal_seq().seq))

            details = {
                'product': 'gg{}-{}'.format(i+1, j+1),
                # forward overhang + distance between the two annealing
                # sites + the whole reverse primer
                'size': len(primer1.overhang) +
                    (reverse_search_result[1] - forward_search_result[1]) +
                    len(primer2.full_seq()),
                'primer_names': primer_names_and_template,
                'template': template_name,
                'forward_tm': forward_tm,
                'reverse_tm': reverse_tm,
            }
            gg_pcrs_by_primers_and_template[
                primer_names_and_template] = details
            gg_pcr_details.append(details)

    # Compile information about second-round PCRs
    round2_pcr_details = []
    for i, stage_output in enumerate(output):
        insert = stage_output['insert']
        insert_len = sum([
            insert.fwd_ha_len,
            len(insert.degeneracy),
            len(insert.sequence),
            insert.rev_ha_len,
        ])
        details = {
            'product': 'stage{}'.format(i+1),
            'size': insert_len,
            # List rather than map() so the value can be iterated more than
            # once on Python 3.
            'primer_names': [
                primer_name(primer)
                for primer in insert.generate_primers()
            ],
            'template': 'gg{}'.format(i+1),
            'forward_tm': recombineering.utils.Tm(
                str(insert.generate_primers()[0].anneal_seq().seq)),
            'reverse_tm': recombineering.utils.Tm(
                str(insert.generate_primers()[1].anneal_seq().seq)),
        }
        if library_sizes:
            details['dna_required'] = \
                library_sizes[i] * self.object.dna_required
        round2_pcr_details.append(details)

    # Determine what goes into which Golden Gate reaction
    gg_details = []
    for i, stage_output in enumerate(output):
        fragments = []
        for j, (primer1, primer2) in enumerate(stage_output['gg'].primers):
            template = str(stage_output['gg'].genes[j].subrecord().seq.upper())
            primer_names_and_template = (
                primer_name(primer1),
                primer_name(primer2),
                template,
            )
            fragments.append(
                gg_pcrs_by_primers_and_template[primer_names_and_template]['product'])
        gg_details.append({
            'product': 'gg{}'.format(i+1),
            'size': len(stage_output['gg'].product),
            'fragments': fragments,
        })

    # Transformation details
    transformation_details = []
    for i in range(len(output)):
        stage = self.get_object().stages.order_by('pk')[i]
        transformation_details.append({
            'insert_name': round2_pcr_details[i]['product'],
            'phenotype': stage.selection_cassette.phenotype,
        })

    context = super(OutputView, self).get_context_data(**kwargs)
    context['output'] = output
    context['primers'] = primers
    context['gg_pcr_details'] = gg_pcr_details
    context['gg_details'] = gg_details
    context['round2_pcr_details'] = round2_pcr_details
    context['transformation_details'] = transformation_details
    return context
# Pick the output file names (the 3' test input gets its own pair), then
# write every read in which the adapter is found, optionally trimmed.
if fastafile=="test3prime.fasta":
    output_fh_name="output2.fasta"
output_fh = open(output_fh_name, mode='w+')

output_text_name = "output2.txt" if fastafile=="test3prime.fasta" else "output.txt"
output_text_fh = open(output_text_name, mode='w+')

for record in parsed:
    try:
        sequence = str(record.seq)
        search = SeqUtils.nt_search(sequence, adapter)
        # search[0] is the expanded pattern; search[1] is the first match
        # position. With no match this raises IndexError, handled below.
        index = int(search[1])
        adapter_start = index
        adapter_end = index+len_adapter
        count_adapter_found +=1
        total_seq_count+=1

        # Trim only when requested; any other setting leaves the read as is.
        if removeadapters == "True" and end_defn=="5":
            # 5' adapter: keep what follows the adapter
            record = record[adapter_end:]
        elif removeadapters == "True" and end_defn=="3":
            # 3' adapter: keep what precedes the adapter
            record = record[:adapter_start]

        # Written whether or not the read was trimmed.
        SeqIO.write(record, output_fh, format="fasta")
    except IndexError:
        # NOTE(review): reads without the adapter are neither written nor
        # counted in total_seq_count -- confirm this is intended.
        count_adapter_not_found+=1
def molecular_weight(self):
    """Return SeqUtils.molecular_weight of self.sequence, treated as 'protein'."""
    weight = SeqUtils.molecular_weight(self.sequence, 'protein')
    return weight