Beispiel #1
0
def gene_feature(Y, X, learn_options):
    '''
    Build whole-gene features (length, GC content, Tm, molecular weight).

    For every distinct name in ``Y['Target gene']`` the gene sequence is
    fetched once through ``util.get_gene_sequence`` and four scalars are
    derived from it.  Every row that targets that gene receives the same
    four values; rows whose gene name matches none of the unique values
    keep the initial 0.

    Parameters
    ----------
    Y : mapping/DataFrame
        Must provide a pandas Series under the key ``'Target gene'``.
    X : unused
        Kept so the signature matches the other feature builders.
    learn_options : unused
        Kept so the signature matches the other feature builders.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``Y['Target gene']`` with the columns 'gene length',
        'gene GC content', 'gene temperature' and 'gene molecular weight'.
    '''
    gene_names = Y['Target gene']
    names = gene_names.values
    n_rows = names.shape[0]

    gene_length = np.zeros((n_rows, 1))
    gc_content = np.zeros((n_rows, 1))
    temperature = np.zeros((n_rows, 1))
    molecular_weight = np.zeros((n_rows, 1))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Compute the row mask once per gene instead of once per feature.
        rows = names == gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        # NOTE(review): if `Tm` is Biopython's Bio.SeqUtils.MeltingTemp,
        # Tm_NN may not accept an `rna` keyword -- confirm against the
        # module that is actually imported.
        temperature[rows] = Tm.Tm_NN(seq, rna=False)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    # Named `features` rather than `all` so the builtin is not shadowed.
    features = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    return pandas.DataFrame(data=features,
                            index=gene_names.index,
                            columns=[
                                'gene length', 'gene GC content',
                                'gene temperature', 'gene molecular weight'
                            ])
def gene_feature(Y, X, learn_options):
    '''
    Build whole-gene features (length, GC content, Tm, molecular weight).

    Each distinct gene in ``Y['Target gene']`` has its sequence looked up
    once with ``util.get_gene_sequence``; the four derived scalars are then
    broadcast to every row that targets that gene.  Rows that match none of
    the unique gene names keep the initial value 0.

    Parameters
    ----------
    Y : mapping/DataFrame
        Must provide a pandas Series under the key ``'Target gene'``.
    X : unused
        Present only for a uniform feature-builder signature.
    learn_options : unused
        Present only for a uniform feature-builder signature.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``Y['Target gene']`` with the columns 'gene length',
        'gene GC content', 'gene temperature' and 'gene molecular weight'.
    '''
    gene_names = Y['Target gene']
    names = gene_names.values
    n_rows = names.shape[0]

    gene_length = np.zeros((n_rows, 1))
    gc_content = np.zeros((n_rows, 1))
    temperature = np.zeros((n_rows, 1))
    molecular_weight = np.zeros((n_rows, 1))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # One boolean mask per gene, shared by all four assignments.
        rows = names == gene
        gene_length[rows] = len(seq)
        gc_content[rows] = SeqUtil.GC(seq)
        # NOTE(review): if `Tm` is Biopython's MeltingTemp module, Tm_staluc
        # is a legacy entry point in recent releases -- confirm it is still
        # available in the installed version.
        temperature[rows] = Tm.Tm_staluc(seq, rna=False)
        molecular_weight[rows] = SeqUtil.molecular_weight(seq, 'DNA')

    # Avoid the name `all`, which would shadow the builtin.
    features = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    return pandas.DataFrame(data=features, index=gene_names.index,
                            columns=['gene length',
                                     'gene GC content',
                                     'gene temperature',
                                     'gene molecular weight'])
Beispiel #3
0
def local_gene_seq_features(gene_names, learn_options, X):
    '''
    Build nucleotide features from the gene sequence flanking each guide.

    For every row, the guide (``X['30mer']``) is located inside the reverse
    complement of its gene sequence (the guide itself is reverse
    complemented first when ``X['Strand']`` is 'sense').  The
    ``k_mer_length`` nucleotides on each side of the match are stored as the
    left/right windows and then handed to ``get_all_order_nuc_features``.

    Parameters
    ----------
    gene_names : pandas.Series
        Gene name per row; rows are matched to ``X`` by position.
    learn_options : dict
        Reads 'include_gene_guide_feature' (window size) and 'order';
        the whole dict is also forwarded to ``get_all_order_nuc_features``.
    X : pandas.DataFrame
        Must contain the columns '30mer' and 'Strand'.

    Returns
    -------
    dict
        The ``feature_sets`` dict after both calls to
        ``get_all_order_nuc_features`` (prefixes 'gene_left_win' and
        'gene_right_win').

    Raises
    ------
    AssertionError
        If a guide cannot be found in its gene, or if a full window of
        ``k_mer_length`` nucleotides is not available on both sides.
    '''
    print("building local gene sequence features")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options['include_gene_guide_feature']
    too_large_msg = "k_mer_context, %s, is too large" % k_mer_length

    # Windows are collected by row position and attached to the frame once
    # at the end; rows never visited keep the original "" placeholder.
    n_rows = len(X.index)
    left_windows = [""] * n_rows
    right_windows = [""] * n_rows

    guides = X['30mer']
    strands = X['Strand']
    names = gene_names.values

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        # `ps` is a row *position*, so all row access below is positional
        # (.iloc / list index) rather than label based.
        for ps in np.where(names == gene)[0]:
            guide_seq = Seq.Seq(guides.iloc[ps])
            strand = strands.iloc[ps]
            if strand == 'sense':
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            assert ind != -1, "could not find guide in gene"
            guide_end = ind + len(guide_seq)
            assert gene_seq[ind:guide_end] == guide_seq, "match not right"

            # A negative slice start would silently wrap around to the end
            # of the sequence, so reject it explicitly.
            assert ind >= k_mer_length, too_large_msg
            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[guide_end:(guide_end + k_mer_length)]

            if strand == 'antisense':
                # it's arbitrary which of sense and anti-sense we flip, we
                # just want to keep them in the same relative
                # alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            left_str = str(left_win)
            right_str = str(right_win)
            assert left_str != "", too_large_msg
            assert right_str != "", too_large_msg
            assert len(left_str) == len(right_str), too_large_msg
            left_windows[ps] = left_str
            right_windows[ps] = right_str
        print("featurizing local context of %s" % (gene))

    feat = pandas.DataFrame(index=X.index)
    feat["gene_left_win"] = left_windows
    feat["gene_right_win"] = right_windows

    # sys.maxsize serves as "no limit"; unlike sys.maxint it exists on both
    # Python 2.6+ and Python 3.
    feature_sets = {}
    get_all_order_nuc_features(feat["gene_left_win"],
                               feature_sets,
                               learn_options,
                               learn_options["order"],
                               max_index_to_use=sys.maxsize,
                               prefix="gene_left_win")
    get_all_order_nuc_features(feat["gene_right_win"],
                               feature_sets,
                               learn_options,
                               learn_options["order"],
                               max_index_to_use=sys.maxsize,
                               prefix="gene_right_win")
    return feature_sets
def local_gene_seq_features(gene_names, learn_options, X):
    '''
    Build nucleotide features from the gene sequence flanking each guide.

    Each guide in ``X['30mer']`` is searched for in the reverse complement
    of its gene sequence; guides whose ``X['Strand']`` is 'sense' are
    reverse complemented before the search.  The ``k_mer_length``
    nucleotides on either side of the hit become the left/right windows,
    which are featurized by ``get_all_order_nuc_features``.

    Parameters
    ----------
    gene_names : pandas.Series
        Gene name per row; rows are matched to ``X`` by position.
    learn_options : dict
        Reads 'include_gene_guide_feature' (window size) and 'order';
        the whole dict is also forwarded to ``get_all_order_nuc_features``.
    X : pandas.DataFrame
        Must contain the columns '30mer' and 'Strand'.

    Returns
    -------
    dict
        The ``feature_sets`` dict after both calls to
        ``get_all_order_nuc_features`` (prefixes 'gene_left_win' and
        'gene_right_win').

    Raises
    ------
    AssertionError
        If a guide cannot be found in its gene, or if a full window of
        ``k_mer_length`` nucleotides is not available on both sides.
    '''
    print("building local gene sequence features")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options['include_gene_guide_feature']
    too_large_msg = "k_mer_context, %s, is too large" % k_mer_length

    # Collect windows by row position; unvisited rows keep the "" default.
    n_rows = len(X.index)
    left_windows = [""] * n_rows
    right_windows = [""] * n_rows

    guides = X['30mer']
    strands = X['Strand']
    names = gene_names.values

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        # np.where yields row positions, so use positional access below.
        for ps in np.where(names == gene)[0]:
            guide_seq = Seq.Seq(guides.iloc[ps])
            strand = strands.iloc[ps]
            if strand == 'sense':
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide,
            # in the gene
            ind = gene_seq.find(guide_seq)
            assert ind != -1, "could not find guide in gene"
            guide_end = ind + len(guide_seq)
            assert gene_seq[ind:guide_end] == guide_seq, "match not right"

            # Guard against a negative slice start, which Python would
            # interpret as counting from the end of the sequence.
            assert ind >= k_mer_length, too_large_msg
            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[guide_end:(guide_end + k_mer_length)]

            if strand == 'antisense':
                # it's arbitrary which of sense and anti-sense we flip, we
                # just want to keep them in the same relative
                # alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            left_str = str(left_win)
            right_str = str(right_win)
            assert left_str != "", too_large_msg
            assert right_str != "", too_large_msg
            assert len(left_str) == len(right_str), too_large_msg
            left_windows[ps] = left_str
            right_windows[ps] = right_str
        print("featurizing local context of %s" % (gene))

    feat = pandas.DataFrame(index=X.index)
    feat["gene_left_win"] = left_windows
    feat["gene_right_win"] = right_windows

    # sys.maxsize means "no limit" here and is available on Python 2.6+
    # as well as Python 3 (sys.maxint is Python 2 only).
    feature_sets = {}
    get_all_order_nuc_features(feat["gene_left_win"], feature_sets,
                               learn_options, learn_options["order"],
                               max_index_to_use=sys.maxsize,
                               prefix="gene_left_win")
    get_all_order_nuc_features(feat["gene_right_win"], feature_sets,
                               learn_options, learn_options["order"],
                               max_index_to_use=sys.maxsize,
                               prefix="gene_right_win")
    return feature_sets
Beispiel #5
0
def guide_positional_features(guide_seq, gene, strand):
    '''
    Locate a guide inside its gene (incomplete: position features are not
    computed yet).

    Intended result, per the original notes: given a guide sequence, a gene
    name, and strand (e.g. "sense"), return the (absolute) nucleotide cut
    position, and the percent amino acid.
    From John's email:
    the cut site is always 3nts upstream of the NGG PAM:
    5' - 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 <cut> 18 19 20 N G G - 3'
    To calculate percent protein, we determined what amino acid number was
    being cut and just divided by the total number of amino acids. In the
    case where the cutsite was between two amino acid codons, I believe we
    rounded down.

    Current behaviour: the guide (reverse complemented when ``strand`` is
    'sense') is searched for in the reverse complement of the gene sequence
    obtained from ``util.get_gene_sequence``.

    Returns
    -------
    str
        The empty string when the guide is not found in the gene.

    Raises
    ------
    NotImplementedError
        Always, when the guide is found, because the positional computation
        has not been written.
    '''
    guide_seq = Seq.Seq(guide_seq)
    gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
    if strand == 'sense':
        guide_seq = guide_seq.reverse_complement()

    ind = gene_seq.find(guide_seq)
    if ind == -1:
        # The message now matches what is actually returned (an empty
        # string, not None).
        print("returning empty string, could not find guide %s in gene %s"
              % (guide_seq, gene))
        return ""
    assert gene_seq[ind:(ind + len(guide_seq))] == guide_seq, "match not right"

    # TODO: derive the cut position and percent-peptide from `ind`.
    # The interactive debugger breakpoint that used to sit here was removed:
    # it would block any non-interactive caller before the error below.
    raise NotImplementedError("incomplete implentation for now")
Beispiel #6
0
def get_micro_homology_features(gene_names, learn_options, X):
    '''
    Compute microhomology ('mh_score') and out-of-frame ('oof_score')
    features for every guide.

    For each row the guide (``X['30mer']``) is searched for in its gene
    sequence.  Guides whose ``X['Strand']`` is 'sense' are looked up in the
    sequence as returned by ``util.get_gene_sequence`` first, all others in
    its reverse complement first; the opposite orientation is used as a
    fallback.  When found, 9 nucleotides to the left and 21 to the right of
    the guide are joined with it into a 60-mer that is scored by
    ``microhomology.compute_score``.  Guides found in neither orientation
    get a score of 0 for both features.

    Parameters
    ----------
    gene_names : pandas.Series
        Gene name per row; rows are matched to ``X`` by position.
    learn_options : unused
    X : pandas.DataFrame
        Must contain the columns '30mer' and 'Strand'.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed like ``X`` with columns 'mh_score' and
        'oof_score'.

    Raises
    ------
    AssertionError
        If the gene does not extend 9 nt left / 21 nt right of the guide.
    '''
    print("building microhomology features")
    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""
    mh_col = feat.columns.get_loc("mh_score")
    oof_col = feat.columns.get_loc("oof_score")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    guides = X['30mer']
    strands = X['Strand']
    names = gene_names.values

    for gene in gene_names.unique():
        # Build both orientations once per gene.  Previously the gene was
        # flipped in place inside the guide loop, so the orientation tried
        # first depended on the guides processed before.
        forward_seq = Seq.Seq(util.get_gene_sequence(gene))
        reverse_seq = forward_seq.reverse_complement()

        guide_inds = np.where(names == gene)[0]
        print("getting microhomology for all %d guides in gene %s" % (
            len(guide_inds), gene))

        # `ps` is a row position, hence the positional (.iloc) access.
        for ps in guide_inds:
            guide_seq = Seq.Seq(guides.iloc[ps])
            if strands.iloc[ps] == 'sense':
                candidates = (forward_seq, reverse_seq)
            else:
                candidates = (reverse_seq, forward_seq)

            # figure out the sequence to the left and right of this guide,
            # in the gene
            gene_seq = None
            ind = -1
            for candidate in candidates:
                ind = candidate.find(guide_seq)
                if ind != -1:
                    gene_seq = candidate
                    break

            if ind == -1:
                # guide not present in either orientation
                mh_score = 0
                oof_score = 0
            else:
                guide_end = ind + len(guide_seq)
                assert gene_seq[ind:guide_end] == guide_seq, "match not right"
                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[guide_end:(guide_end +
                                                k_mer_length_right)]
                assert len(left_win) == k_mer_length_left
                assert len(right_win) == k_mer_length_right

                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                assert len(sixtymer) == 60, "should be of length 60"
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            feat.iloc[ps, mh_col] = mh_score
            feat.iloc[ps, oof_col] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    return pandas.DataFrame(feat, dtype='float')
def get_micro_homology_features(gene_names, learn_options, X):
    '''
    Compute microhomology ('mh_score') and out-of-frame ('oof_score')
    features for every guide.

    Each guide in ``X['30mer']`` is located in its gene sequence: 'sense'
    guides (per ``X['Strand']``) are searched in the sequence returned by
    ``util.get_gene_sequence`` first, every other guide in its reverse
    complement first, with the opposite orientation as fallback.  On a hit,
    the 9 nucleotides left and 21 nucleotides right of the guide are
    concatenated with it into a 60-mer and scored with
    ``microhomology.compute_score``.  Guides found in neither orientation
    receive 0 for both scores.

    Parameters
    ----------
    gene_names : pandas.Series
        Gene name per row; rows are matched to ``X`` by position.
    learn_options : unused
    X : pandas.DataFrame
        Must contain the columns '30mer' and 'Strand'.

    Returns
    -------
    pandas.DataFrame
        Float frame indexed like ``X`` with columns 'mh_score' and
        'oof_score'.

    Raises
    ------
    AssertionError
        If the gene does not extend 9 nt left / 21 nt right of the guide.
    '''
    print("building microhomology features")
    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = ""
    feat["oof_score"] = ""
    mh_col = feat.columns.get_loc("mh_score")
    oof_col = feat.columns.get_loc("oof_score")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    guides = X['30mer']
    strands = X['Strand']
    names = gene_names.values

    for gene in gene_names.unique():
        # Both orientations are computed once per gene; flipping the gene
        # in place per guide made the search order depend on earlier guides
        # and repeated the whole-gene reverse complement needlessly.
        forward_seq = Seq.Seq(util.get_gene_sequence(gene))
        reverse_seq = forward_seq.reverse_complement()

        guide_inds = np.where(names == gene)[0]
        print("getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene))

        # Positions from np.where are used with positional (.iloc) access.
        for ps in guide_inds:
            guide_seq = Seq.Seq(guides.iloc[ps])
            if strands.iloc[ps] == 'sense':
                candidates = (forward_seq, reverse_seq)
            else:
                candidates = (reverse_seq, forward_seq)

            # figure out the sequence to the left and right of this guide,
            # in the gene
            gene_seq = None
            ind = -1
            for candidate in candidates:
                ind = candidate.find(guide_seq)
                if ind != -1:
                    gene_seq = candidate
                    break

            if ind == -1:
                # guide not present in either orientation
                mh_score = 0
                oof_score = 0
            else:
                guide_end = ind + len(guide_seq)
                assert gene_seq[ind:guide_end] == guide_seq, "match not right"
                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[guide_end:(guide_end + k_mer_length_right)]
                assert len(left_win) == k_mer_length_left
                assert len(right_win) == k_mer_length_right

                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                assert len(sixtymer) == 60, "should be of length 60"
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            feat.iloc[ps, mh_col] = mh_score
            feat.iloc[ps, oof_col] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    return pandas.DataFrame(feat, dtype='float')