def gene_feature(Y, X, learn_options):
    '''
    Build per-row features that describe the target gene of each row.

    For every distinct name in Y['Target gene'] the gene sequence is fetched
    once through util.get_gene_sequence, and four values derived from it are
    written to every row that names that gene:

      - 'gene length':           len(seq)
      - 'gene GC content':       SeqUtil.GC(seq)
      - 'gene temperature':      Tm.Tm_staluc(seq, rna=False)
      - 'gene molecular weight': SeqUtil.molecular_weight(seq, 'DNA')

    X and learn_options are accepted but not used here.

    Returns a pandas.DataFrame with those four columns, indexed like
    Y['Target gene'].
    '''
    column_names = ['gene length', 'gene GC content', 'gene temperature', 'gene molecular weight']
    target_genes = Y['Target gene']
    per_row_gene = target_genes.values

    # One row per entry of Y['Target gene'], one column per feature.
    features = np.zeros((target_genes.shape[0], len(column_names)))

    for gene in target_genes.unique():
        seq = util.get_gene_sequence(gene)
        # Boolean mask selecting every row that refers to this gene.
        rows = per_row_gene == gene
        # The four values are broadcast across all selected rows.
        features[rows] = [
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, 'DNA'),
        ]

    return pandas.DataFrame(data=features, index=target_genes.index, columns=column_names)
def compute_stats(seq):
    """Record length, GC value and molecular weight of *seq* on SeqStats.

    The weight is stored as None when SeqUtils.molecular_weight raises
    ValueError for the sequence.

    Returns the object the attributes were written to.
    """
    # NOTE(review): SeqStats is bound here without being called, so the
    # attributes below are set on that object itself. If SeqStats is a class,
    # every call writes to and returns the same shared object -- confirm
    # whether an instance (SeqStats()) was intended.
    result = SeqStats
    result.length = len(seq)
    result.gc = SeqUtils.GC(seq)
    try:
        weight = SeqUtils.molecular_weight(seq)
    except ValueError:
        weight = None
    result.weight = weight
    return result
def target_genes_stats(genes=("HPRT1", "TADA1", "NF2", "TADA2B", "NF1", "CUL3", "MED12", "CCDC101")):
    """Print a one-line sequence summary for each gene in *genes*.

    For every gene whose sequence is available, the line shows the gene name,
    the sequence length, SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False) and
    SeqUtil.molecular_weight(seq, "DNA").

    Args:
        genes: iterable of gene names. The default is an immutable tuple so
            the default value cannot be altered between calls.

    Returns:
        None. Genes for which get_gene_sequence returns None are skipped.
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        # Identity test: "!= None" would go through the sequence object's own
        # comparison operator instead of simply checking for a missing value.
        if seq is None:
            continue
        # A parenthesised single-argument print gives the same output whether
        # it is parsed as a Python 2 statement or a Python 3 function call.
        print(
            "%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f"
            % (
                gene,
                len(seq),
                SeqUtil.GC(seq),
                Tm.Tm_staluc(seq, rna=False),
                SeqUtil.molecular_weight(seq, "DNA"),
            )
        )
def SeqUtilFeatures(data):
    '''
    Return the molecular weight of every 30-mer as a one-column DataFrame.

    data['30mer'] must hold sequences of exactly 30 characters; any other
    length trips the assertion below. Each sequence is converted with str()
    and passed to SeqUtil.molecular_weight.

    The returned frame has a single column with pandas' default integer
    labels for both rows and columns (it does not reuse the index of data).
    '''
    thirty_mers = data['30mer'].values
    weights = np.ones((thirty_mers.shape[0], 1))

    for row, thirty_mer in enumerate(thirty_mers):
        assert len(thirty_mer) == 30, "seems to assume 30mer"
        weights[row, 0] = SeqUtil.molecular_weight(str(thirty_mer))

    return pandas.DataFrame(weights)
def target_genes_stats(genes=('HPRT1', 'TADA1', 'NF2', 'TADA2B', 'NF1', 'CUL3', 'MED12', 'CCDC101')):
    """Print a one-line sequence summary for each gene in *genes*.

    For every gene whose sequence is available, the line shows the gene name,
    the sequence length, SeqUtil.GC(seq), Tm.Tm_staluc(seq, rna=False) and
    SeqUtil.molecular_weight(seq, 'DNA').

    Args:
        genes: iterable of gene names. The default is an immutable tuple so
            the default value cannot be altered between calls.

    Returns:
        None. Genes for which get_gene_sequence returns None are skipped.
    """
    for gene in genes:
        seq = get_gene_sequence(gene)
        # Identity test: "!= None" would go through the sequence object's own
        # comparison operator instead of simply checking for a missing value.
        if seq is None:
            continue
        summary = '%s \t\t\t\t len: %d \t GCcont: %.3f \t Temp: %.4f \t molweight: %.4f' % (
            gene,
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, 'DNA'),
        )
        # A parenthesised single-argument print gives the same output whether
        # it is parsed as a Python 2 statement or a Python 3 function call.
        print(summary)
def molecular_weight(self):
    """Return SeqUtils.molecular_weight of self.sequence, treated as 'protein'."""
    sequence = self.sequence
    return SeqUtils.molecular_weight(sequence, 'protein')
def average_molecule_weight():
    """Return the mean 'protein' molecular weight over residues_list.

    Each entry of the module-level residues_list is weighed with
    bsu.molecular_weight(entry, 'protein'). An empty residues_list leads to a
    ZeroDivisionError from the final division.
    """
    protein_weights = []
    for residue in residues_list:
        protein_weights.append(bsu.molecular_weight(residue, 'protein'))
    total = sum(protein_weights)
    return total / len(protein_weights)