def gene_feature(Y, X, learn_options):
    """
    Build whole-gene features for every row of ``Y``.

    For each distinct value in ``Y['Target gene']`` the gene sequence is
    fetched with ``util.get_gene_sequence`` and four numbers are derived
    from it: ``len(seq)``, ``SeqUtil.GC(seq)``,
    ``Tm.Tm_staluc(seq, rna=False)`` and
    ``SeqUtil.molecular_weight(seq, 'DNA')``.  Every row that targets the
    same gene receives the same four values.

    Args:
        Y: table providing a 'Target gene' column.
        X: not used by this function.
        learn_options: not used by this function.

    Returns:
        pandas.DataFrame indexed like ``Y['Target gene']`` with the columns
        'gene length', 'gene GC content', 'gene temperature' and
        'gene molecular weight'.
    """
    gene_names = Y['Target gene']
    n_rows = gene_names.shape[0]

    gene_length = np.zeros((n_rows, 1))
    gc_content = np.zeros((n_rows, 1))
    temperature = np.zeros((n_rows, 1))
    molecular_weight = np.zeros((n_rows, 1))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Row mask for this gene, computed once and reused for all features.
        rows = gene_names.values == gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        temperature[rows] = Tm.Tm_staluc(seq, rna=False)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    # Named `features` rather than `all` so the builtin is not shadowed.
    features = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    df = pandas.DataFrame(
        data=features,
        index=gene_names.index,
        columns=[
            'gene length',
            'gene GC content',
            'gene temperature',
            'gene molecular weight',
        ])
    return df
def gene_feature(Y, X, learn_options):
    """
    Compute per-gene sequence statistics and broadcast them to each row.

    The gene named in ``Y['Target gene']`` is looked up through
    ``util.get_gene_sequence``; its length, ``SeqUtil.GC`` value,
    ``Tm.Tm_staluc(..., rna=False)`` value and
    ``SeqUtil.molecular_weight(..., 'DNA')`` value are written to every row
    that names that gene.

    Args:
        Y: table providing a 'Target gene' column.
        X: accepted but not read here.
        learn_options: accepted but not read here.

    Returns:
        pandas.DataFrame sharing the index of ``Y['Target gene']`` with
        columns 'gene length', 'gene GC content', 'gene temperature',
        'gene molecular weight'.
    """
    gene_names = Y['Target gene']
    column_shape = (gene_names.shape[0], 1)

    gene_length = np.zeros(column_shape)
    gc_content = np.zeros(column_shape)
    temperature = np.zeros(column_shape)
    molecular_weight = np.zeros(column_shape)

    names = gene_names.values
    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # One boolean mask per gene instead of one per assignment.
        is_gene = names == gene
        gene_length[is_gene] = len(seq)
        gc_content[is_gene] = SeqUtil.GC(seq)
        temperature[is_gene] = Tm.Tm_staluc(seq, rna=False)
        molecular_weight[is_gene] = SeqUtil.molecular_weight(seq, 'DNA')

    # The original bound this to `all`, hiding the builtin of that name.
    stacked = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    return pandas.DataFrame(
        data=stacked,
        index=gene_names.index,
        columns=['gene length',
                 'gene GC content',
                 'gene temperature',
                 'gene molecular weight'])
def gene_feature(Y):
    """
    Derive gene-level features (length, GC content, Tm, molecular weight)
    for each row of ``Y``, keyed by its "Target gene" entry.

    Returns a DataFrame indexed like ``Y["Target gene"]`` with one column
    per feature; rows naming the same gene carry identical values.
    """
    column_names = [
        "gene length",
        "gene GC content",
        "gene temperature",
        "gene molecular weight",
    ]
    targets = Y["Target gene"]
    target_values = targets.values

    # One float row per entry, one column per feature, filled gene by gene.
    table = np.zeros((target_values.shape[0], len(column_names)))
    for name in targets.unique():
        sequence = util.get_gene_sequence(name)
        selected = target_values == name
        table[selected, 0] = len(sequence)
        table[selected, 1] = SeqUtil.GC(sequence)
        table[selected, 2] = Tm.Tm_staluc(sequence, rna=False)
        table[selected, 3] = SeqUtil.molecular_weight(sequence, "DNA")

    return pd.DataFrame(data=table, index=targets.index, columns=column_names)
def local_gene_seq_features(gene_names, learn_options, X):
    """
    Featurize the gene sequence directly flanking each guide.

    For every guide the reverse complement of its gene sequence is searched
    for the guide (the guide itself is reverse-complemented first when its
    strand is "sense").  The ``k_mer_length`` bases on each side of the match
    are stored as strings and then handed to ``get_all_order_nuc_features``.

    Args:
        gene_names: pandas Series of gene names; assumed to be aligned
            row-for-row with ``X`` (TODO confirm against callers).
        learn_options: mapping read for "include_gene_guide_feature" (the
            flank width) and "order"; also forwarded unchanged to
            ``get_all_order_nuc_features``.
        X: DataFrame with "30mer" and "Strand" columns.

    Returns:
        dict: the ``feature_sets`` mapping after both calls to
        ``get_all_order_nuc_features`` (prefixes "gene_left_win" and
        "gene_right_win").

    Raises:
        AssertionError: if a guide is not found in its gene, or the flanking
            windows are empty or of unequal length.
    """
    print("building local gene sequence features")
    feat = pd.DataFrame(index=X.index)
    feat["gene_left_win"] = ""
    feat["gene_right_win"] = ""
    left_col = feat.columns.get_loc("gene_left_win")
    right_col = feat.columns.get_loc("gene_right_win")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options["include_gene_guide_feature"]

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        # `ps` values are row *positions* (from np.where), so every access
        # below is positional (.iloc) rather than the removed `.ix` or the
        # label-or-position ambiguity of Series[int].
        for ps in np.where(gene_names.values == gene)[0]:
            guide_seq = Seq.Seq(X["30mer"].iloc[ps])
            strand = X["Strand"].iloc[ps]
            if strand == "sense":
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                raise AssertionError("could not find guide in gene")
            guide_end = ind + len(guide_seq)
            if gene_seq[ind:guide_end] != guide_seq:
                raise AssertionError("match not right")

            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[guide_end:(guide_end + k_mer_length)]

            if strand == "antisense":
                # it's arbitrary which of sense and anti-sense we flip, we
                # just want to keep them in the same relative
                # alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            # str(seq) replaces the deprecated Seq.tostring().
            left_text = str(left_win)
            right_text = str(right_win)
            if left_text == "":
                raise AssertionError(
                    f"k_mer_context, {k_mer_length}, is too large")
            if len(left_text) != len(right_text):
                raise AssertionError(
                    f"k_mer_context, {k_mer_length}, is too large")

            feat.iloc[ps, left_col] = left_text
            feat.iloc[ps, right_col] = right_text

        print(f"featurizing local context of {gene}")

    feature_sets = {}
    get_all_order_nuc_features(
        feat["gene_left_win"],
        feature_sets,
        learn_options,
        learn_options["order"],
        max_index_to_use=maxsize,
        prefix="gene_left_win",
    )
    get_all_order_nuc_features(
        feat["gene_right_win"],
        feature_sets,
        learn_options,
        learn_options["order"],
        max_index_to_use=maxsize,
        prefix="gene_right_win",
    )
    return feature_sets
def local_gene_seq_features(gene_names, learn_options, X):
    """
    Featurize the gene sequence directly flanking each guide.

    Each guide is located inside the reverse complement of its gene
    sequence (the guide is reverse-complemented first when its strand is
    'sense').  The ``k_mer_length`` bases on either side of the match are
    recorded and passed to ``get_all_order_nuc_features``.

    Args:
        gene_names: pandas Series of gene names; assumed to line up
            row-for-row with ``X`` (TODO confirm against callers).
        learn_options: mapping read for 'include_gene_guide_feature' (the
            flank width) and "order"; also forwarded to
            ``get_all_order_nuc_features``.
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        dict: ``feature_sets`` as populated by the two
        ``get_all_order_nuc_features`` calls.

    Raises:
        AssertionError: if a guide cannot be found in its gene, or a flanking
            window is empty or the two windows differ in length.  Raised
            explicitly so the checks survive ``python -O``.
    """
    # Single-argument print(...) behaves the same as a Python 2 statement
    # and as a Python 3 function call.
    print("building local gene sequence features")
    feat = pandas.DataFrame(index=X.index)
    feat["gene_left_win"] = ""
    feat["gene_right_win"] = ""
    left_col = feat.columns.get_loc("gene_left_win")
    right_col = feat.columns.get_loc("gene_right_win")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options['include_gene_guide_feature']
    too_large = "k_mer_context, %s, is too large" % k_mer_length

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        # np.where yields row positions, hence positional .iloc access
        # instead of the removed DataFrame.ix.
        for ps in np.where(gene_names.values == gene)[0]:
            guide_seq = Seq.Seq(X['30mer'].iloc[ps])
            strand = X['Strand'].iloc[ps]
            if strand == 'sense':
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                raise AssertionError("could not find guide in gene")
            guide_end = ind + len(guide_seq)
            if gene_seq[ind:guide_end] != guide_seq:
                raise AssertionError("match not right")

            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[guide_end:(guide_end + k_mer_length)]

            if strand == 'antisense':
                # it's arbitrary which of sense and anti-sense we flip, we
                # just want to keep them in the same relative
                # alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            # str(seq) replaces the deprecated Seq.tostring().
            left_text = str(left_win)
            right_text = str(right_win)
            if left_text == "":
                raise AssertionError(too_large)
            if len(left_text) != len(right_text):
                raise AssertionError(too_large)

            feat.iloc[ps, left_col] = left_text
            feat.iloc[ps, right_col] = right_text

        print("featurizing local context of %s" % (gene))

    feature_sets = {}
    # sys.maxsize exists on Python 2.6+ and 3.x; sys.maxint is Python 2 only.
    get_all_order_nuc_features(feat["gene_left_win"], feature_sets,
                               learn_options, learn_options["order"],
                               max_index_to_use=sys.maxsize,
                               prefix="gene_left_win")
    get_all_order_nuc_features(feat["gene_right_win"], feature_sets,
                               learn_options, learn_options["order"],
                               max_index_to_use=sys.maxsize,
                               prefix="gene_right_win")
    return feature_sets
def get_micro_homology_features(gene_names, X):
    """
    Compute microhomology and out-of-frame scores for every guide.

    Each guide is searched for in its gene sequence, trying one orientation
    of the gene and then its reverse complement.  When found, 9 bases to the
    left and 21 bases to the right of the match are joined with the guide
    into a 60-character string that is passed to ``compute_score``.  Guides
    that are not found in either orientation get a score of 0 for both
    outputs.

    Args:
        gene_names: pandas Series of gene names; assumed to be aligned
            row-for-row with ``X`` (TODO confirm against callers).
        X: DataFrame with "30mer" and "Strand" columns.

    Returns:
        pandas.DataFrame indexed like ``X`` with float columns "mh_score"
        and "oof_score".

    Raises:
        AssertionError: if the match or either flanking window does not have
            the expected content/length (e.g. the guide sits too close to an
            end of the gene sequence).
    """
    # originally was flipping the guide itself as necessary, but now flipping
    # the gene instead
    print("building microhomology features")
    feat = pd.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""
    mh_col = feat.columns.get_loc("mh_score")
    oof_col = feat.columns.get_loc("oof_score")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        guide_inds = np.where(gene_names.values == gene)[0]
        print(
            f"getting microhomology for all {len(guide_inds)} guides in gene {gene}"
        )
        # guide_inds are row positions, so rows are addressed with .iloc
        # (DataFrame.ix has been removed from pandas).
        for ps in guide_inds:
            guide_seq = Seq.Seq(X["30mer"].iloc[ps])
            strand = X["Strand"].iloc[ps]

            # NOTE: gene_seq is rebound here, so whichever orientation it is
            # left in carries over to the next guide of the same gene.  Both
            # orientations are always tried; only the order varies.
            if strand == "sense":
                gene_seq = gene_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = gene_seq.reverse_complement()
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                # guide absent from both orientations: fall back to zeros
                mh_score = 0
                oof_score = 0
            else:
                guide_end = ind + len(guide_seq)
                if gene_seq[ind:guide_end] != guide_seq:
                    raise AssertionError("match not right")

                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[guide_end:(guide_end + k_mer_length_right)]
                # str(seq) replaces the deprecated Seq.tostring().
                left_text = str(left_win)
                right_text = str(right_win)
                if len(left_text) != k_mer_length_left:
                    raise AssertionError()
                if len(right_text) != k_mer_length_right:
                    raise AssertionError()

                sixtymer = left_text + str(guide_seq) + right_text
                if len(sixtymer) != 60:
                    raise AssertionError("should be of length 60")
                mh_score, oof_score = compute_score(sixtymer)

            feat.iloc[ps, mh_col] = mh_score
            feat.iloc[ps, oof_col] = oof_score
        print(f"computed microhomology of {str(gene)}")

    return pd.DataFrame(feat, dtype="float")
def local_gene_seq_features(gene_names, learn_options, X):
    """
    Build nucleotide features from the gene context around each guide.

    The gene sequence is reverse-complemented once per gene.  Each guide
    ('sense' guides are reverse-complemented first) is located in it and the
    ``learn_options['include_gene_guide_feature']`` bases immediately before
    and after the match are stored, then featurized through
    ``get_all_order_nuc_features``.

    Args:
        gene_names: pandas Series of gene names; presumably aligned
            row-for-row with ``X`` (verify against caller).
        learn_options: mapping providing 'include_gene_guide_feature' and
            "order"; passed through to ``get_all_order_nuc_features``.
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        dict: the ``feature_sets`` mapping filled in by
        ``get_all_order_nuc_features`` for the left and right windows.

    Raises:
        AssertionError: guide not found in the gene, or the flanking windows
            are empty / differ in length.  These are explicit raises rather
            than ``assert`` statements so they are not stripped by ``-O``.
    """
    # print(<one string>) is valid and equivalent on Python 2 and Python 3.
    print("building local gene sequence features")
    feat = pandas.DataFrame(index=X.index)
    feat["gene_left_win"] = ""
    feat["gene_right_win"] = ""
    col_left = feat.columns.get_loc("gene_left_win")
    col_right = feat.columns.get_loc("gene_right_win")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options['include_gene_guide_feature']

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        row_positions = np.where(gene_names.values == gene)[0]
        for ps in row_positions:
            # `ps` is a position, so read and write positionally; the
            # removed `.ix` accessor mixed label and position semantics.
            guide_seq = Seq.Seq(X['30mer'].iloc[ps])
            strand = X['Strand'].iloc[ps]
            if strand == 'sense':
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                raise AssertionError("could not find guide in gene")
            stop = ind + len(guide_seq)
            if gene_seq[ind:stop] != guide_seq:
                raise AssertionError("match not right")

            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[stop:(stop + k_mer_length)]

            if strand == 'antisense':
                # it's arbitrary which of sense and anti-sense we flip, we
                # just want to keep them in the same relative
                # alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            # The original asserted the emptiness condition twice; once is
            # enough.  str(seq) stands in for the deprecated tostring().
            if str(left_win) == "":
                raise AssertionError(
                    "k_mer_context, %s, is too large" % k_mer_length)
            if len(left_win) != len(right_win):
                raise AssertionError(
                    "k_mer_context, %s, is too large" % k_mer_length)

            feat.iloc[ps, col_left] = str(left_win)
            feat.iloc[ps, col_right] = str(right_win)

        print("featurizing local context of %s" % (gene))

    feature_sets = {}
    # sys.maxsize is available on both Python 2.6+ and Python 3, unlike
    # sys.maxint.
    for prefix in ("gene_left_win", "gene_right_win"):
        get_all_order_nuc_features(feat[prefix], feature_sets,
                                   learn_options, learn_options["order"],
                                   max_index_to_use=sys.maxsize,
                                   prefix=prefix)
    return feature_sets
def get_micro_homology_features(gene_names, learn_options, X):
    """
    Compute microhomology and out-of-frame scores for every guide.

    Each guide is searched for in its gene sequence, in one orientation and
    then in the reverse complement.  On a hit, the 9 bases before and the 21
    bases after the match are joined with the guide into a 60-character
    string that is scored by ``microhomology.compute_score``.  Guides found
    in neither orientation receive 0 for both scores.

    Args:
        gene_names: pandas Series of gene names; assumed to be aligned
            row-for-row with ``X`` (TODO confirm against callers).
        learn_options: not read by this function.
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        pandas.DataFrame indexed like ``X`` with float columns "mh_score"
        and "oof_score".

    Raises:
        AssertionError: if the located match or a flanking window does not
            have the expected content/length.  Raised explicitly so the
            checks are not removed under ``python -O``.
    """
    # originally was flipping the guide itself as necessary, but now flipping
    # the gene instead
    # Single-argument print(...) works identically on Python 2 and 3.
    print("building microhomology features")
    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""
    mh_col = feat.columns.get_loc("mh_score")
    oof_col = feat.columns.get_loc("oof_score")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        guide_inds = np.where(gene_names.values == gene)[0]
        print("getting microhomology for all %d guides in gene %s"
              % (len(guide_inds), gene))

        # guide_inds holds row positions, so use positional .iloc access
        # (DataFrame.ix is deprecated and removed in current pandas).
        for ps in guide_inds:
            guide_seq = Seq.Seq(X['30mer'].iloc[ps])
            strand = X['Strand'].iloc[ps]

            # NOTE: gene_seq is rebound, so its orientation persists into
            # the next guide of this gene; both orientations are still
            # tried for every guide.
            if strand == 'sense':
                gene_seq = gene_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = gene_seq.reverse_complement()
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                # not found in either orientation
                mh_score = 0
                oof_score = 0
            else:
                guide_end = ind + len(guide_seq)
                if gene_seq[ind:guide_end] != guide_seq:
                    raise AssertionError("match not right")

                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[guide_end:(guide_end + k_mer_length_right)]
                # str(seq) replaces the deprecated Seq.tostring().
                left_text = str(left_win)
                right_text = str(right_win)
                if len(left_text) != k_mer_length_left:
                    raise AssertionError()
                if len(right_text) != k_mer_length_right:
                    raise AssertionError()

                sixtymer = left_text + str(guide_seq) + right_text
                if len(sixtymer) != 60:
                    raise AssertionError("should be of length 60")
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            feat.iloc[ps, mh_col] = mh_score
            feat.iloc[ps, oof_col] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    return pandas.DataFrame(feat, dtype='float')
def get_micro_homology_features(gene_names, learn_options, X):
    """
    Score each guide's surrounding gene context for microhomology.

    For every guide the gene sequence is searched (first in its current
    orientation, then reverse-complemented).  A hit yields a 60-character
    window (9 bases upstream of the match, the guide, and 21 bases
    downstream) which ``microhomology.compute_score`` turns into an
    ``(mh_score, oof_score)`` pair.  A miss in both orientations yields
    ``(0, 0)``.

    Args:
        gene_names: pandas Series of gene names; presumably aligned
            row-for-row with ``X`` (verify against caller).
        learn_options: accepted for signature compatibility; not read here.
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        pandas.DataFrame with the index of ``X`` and float columns
        "mh_score" and "oof_score".

    Raises:
        AssertionError: when the matched slice differs from the guide, or a
            flanking window / the assembled window has the wrong length.
            Explicit raises keep these checks active under ``python -O``.
    """
    # originally was flipping the guide itself as necessary, but now flipping
    # the gene instead
    # print(<one string>) is valid and equivalent on Python 2 and Python 3.
    print("building microhomology features")
    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""
    col_mh = feat.columns.get_loc("mh_score")
    col_oof = feat.columns.get_loc("oof_score")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(
            util.get_gene_sequence(gene)).reverse_complement()
        guide_inds = np.where(gene_names.values == gene)[0]
        print("getting microhomology for all %d guides in gene %s" % (
            len(guide_inds), gene))

        for ps in guide_inds:
            # `ps` is a row position from np.where; read and write with
            # .iloc instead of Series[int] / the removed DataFrame.ix.
            guide_seq = Seq.Seq(X['30mer'].iloc[ps])
            strand = X['Strand'].iloc[ps]

            # NOTE: the flipped gene_seq is kept for subsequent guides of
            # this gene; the fallback below still tries the other strand.
            if strand == 'sense':
                gene_seq = gene_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = gene_seq.reverse_complement()
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                # guide could not be located on either strand
                mh_score, oof_score = 0, 0
            else:
                stop = ind + len(guide_seq)
                if gene_seq[ind:stop] != guide_seq:
                    raise AssertionError("match not right")

                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[stop:(stop + k_mer_length_right)]
                # len(str(seq)) replaces len(seq.tostring()); tostring() is
                # deprecated in Biopython.
                if len(str(left_win)) != k_mer_length_left:
                    raise AssertionError()
                if len(str(right_win)) != k_mer_length_right:
                    raise AssertionError()

                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                if len(sixtymer) != 60:
                    raise AssertionError("should be of length 60")
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            feat.iloc[ps, col_mh] = mh_score
            feat.iloc[ps, col_oof] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    return pandas.DataFrame(feat, dtype='float')