def gene_feature(Y, X, learn_options):
    '''
    Build per-row features describing the gene named in ``Y['Target gene']``.

    For every distinct gene name the sequence is fetched once with
    ``util.get_gene_sequence`` and four values are derived from it: its
    length, ``SeqUtil.GC(seq)``, ``Tm.Tm_staluc(seq, rna=False)`` and
    ``SeqUtil.molecular_weight(seq, 'DNA')``.  Every row whose gene name
    compares equal to that gene receives the same four values.

    Parameters:
        Y: object indexable with ``'Target gene'`` that yields a pandas
            Series of gene names.
        X: not used by this function.
        learn_options: not used by this function.

    Returns:
        pandas.DataFrame indexed like ``Y['Target gene']`` with the float
        columns 'gene length', 'gene GC content', 'gene temperature' and
        'gene molecular weight'.  Rows that match no entry of
        ``gene_names.unique()`` keep the initial value 0.
    '''

    gene_names = Y['Target gene']
    names = gene_names.values

    columns = [
        'gene length', 'gene GC content',
        'gene temperature', 'gene molecular weight'
    ]
    # One row per guide, one column per feature.
    features = np.zeros((names.shape[0], len(columns)))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Compute the row mask once per gene instead of once per feature.
        rows = names == gene
        # The 4-tuple is broadcast across every selected row.
        features[rows] = (
            len(seq),
            SeqUtil.GC(seq),
            Tm.Tm_staluc(seq, rna=False),
            SeqUtil.molecular_weight(seq, 'DNA'),
        )

    return pandas.DataFrame(data=features,
                            index=gene_names.index,
                            columns=columns)
Example #2
0
def gene_feature(Y, X, learn_options):
    '''
    Compute gene-level features for each row of ``Y['Target gene']``.

    The sequence of each distinct gene is obtained from
    ``util.get_gene_sequence`` and summarised by its length, GC content
    (``SeqUtil.GC``), melting temperature (``Tm.Tm_staluc`` with
    ``rna=False``) and molecular weight (``SeqUtil.molecular_weight`` with
    ``'DNA'``).  All rows that name the same gene share those values.

    Parameters:
        Y: object indexable with ``'Target gene'`` that yields a pandas
            Series of gene names.
        X: not used by this function.
        learn_options: not used by this function.

    Returns:
        pandas.DataFrame with the index of ``Y['Target gene']`` and the
        columns 'gene length', 'gene GC content', 'gene temperature' and
        'gene molecular weight'; rows never matched by a gene stay at 0.
    '''

    gene_names = Y['Target gene']
    names = gene_names.values
    n_rows = names.shape[0]

    gene_length = np.zeros((n_rows, 1))
    gc_content = np.zeros((n_rows, 1))
    temperature = np.zeros((n_rows, 1))
    molecular_weight = np.zeros((n_rows, 1))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        # Rows that target this gene; computed once and reused below.
        is_gene = names == gene
        gene_length[is_gene] = len(seq)
        gc_content[is_gene] = SeqUtil.GC(seq)
        temperature[is_gene] = Tm.Tm_staluc(seq, rna=False)
        molecular_weight[is_gene] = SeqUtil.molecular_weight(seq, 'DNA')

    # Named `stacked` rather than `all` so the builtin is not shadowed.
    stacked = np.concatenate(
        (gene_length, gc_content, temperature, molecular_weight), axis=1)
    return pandas.DataFrame(data=stacked,
                            index=gene_names.index,
                            columns=['gene length',
                                     'gene GC content',
                                     'gene temperature',
                                     'gene molecular weight'])
Example #3
0
def gene_feature(Y):
    """
    Return one row of gene-derived features per entry of ``Y["Target gene"]``.

    Each distinct gene's sequence comes from ``util.get_gene_sequence``; its
    length, ``SeqUtil.GC``, ``Tm.Tm_staluc(seq, rna=False)`` and
    ``SeqUtil.molecular_weight(seq, "DNA")`` are written to every row that
    names that gene.  The result is a DataFrame sharing the index of
    ``Y["Target gene"]``; unmatched rows stay at 0.
    """

    gene_names = Y["Target gene"]
    names = gene_names.values

    labels = [
        "gene length",
        "gene GC content",
        "gene temperature",
        "gene molecular weight",
    ]
    # Column positions follow the order of `labels`.
    features = np.zeros((names.shape[0], len(labels)))

    for gene in gene_names.unique():
        seq = util.get_gene_sequence(gene)
        hits = names == gene
        features[hits, 0] = len(seq)
        features[hits, 1] = SeqUtil.GC(seq)
        features[hits, 2] = Tm.Tm_staluc(seq, rna=False)
        features[hits, 3] = SeqUtil.molecular_weight(seq, "DNA")

    return pd.DataFrame(data=features, index=gene_names.index, columns=labels)
Example #4
0
def local_gene_seq_features(gene_names, learn_options, X):
    """Build nucleotide features from the gene sequence flanking each guide.

    For every guide (``X["30mer"]``) the reverse complement of its gene's
    sequence is searched for the guide (sense guides are reverse-complemented
    first).  The ``k_mer_length`` bases on either side of the match are kept
    as a left and a right window, and both window columns are handed to
    ``get_all_order_nuc_features``.

    Rows are addressed by position throughout, because the row numbers come
    from ``np.where`` over ``gene_names.values``.
    NOTE(review): assumes ``gene_names`` and ``X`` have the same length and
    row order -- confirm against the caller.

    Parameters:
        gene_names: pandas Series of gene names, one per row of ``X``.
        learn_options: mapping providing ``"include_gene_guide_feature"``
            (window size in nucleotides) and ``"order"``.
        X: DataFrame with ``"30mer"`` and ``"Strand"`` columns.

    Returns:
        The ``feature_sets`` dict passed to the two
        ``get_all_order_nuc_features`` calls (prefixes ``gene_left_win`` and
        ``gene_right_win``).

    Raises:
        AssertionError: if a guide is not found in its gene, the match does
            not equal the guide, the left window is empty, or the two windows
            differ in length.
    """
    print("building local gene sequence features")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options["include_gene_guide_feature"]

    names = gene_names.values
    thirtymers = X["30mer"].values
    strands = X["Strand"].values
    # Windows are collected by row position and attached to the frame once at
    # the end; this replaces the per-cell `.ix` writes (removed from pandas).
    left_windows = [""] * len(X.index)
    right_windows = [""] * len(X.index)

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        for ps in np.where(names == gene)[0]:
            guide_seq = Seq.Seq(thirtymers[ps])
            strand = strands[ps]
            if strand == "sense":
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide, in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                raise AssertionError("could not find guide in gene")
            guide_end = ind + len(guide_seq)
            if gene_seq[ind:guide_end] != guide_seq:
                raise AssertionError("match not right")

            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[guide_end:(guide_end + k_mer_length)]

            if strand == "antisense":
                # it's arbitrary which of sense and anti-sense we flip, we just want
                # to keep them in the same relative alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            # A start index below zero makes the left slice come back empty,
            # and a window running past the gene end comes back short.
            if str(left_win) == "":
                raise AssertionError(
                    f"k_mer_context, {k_mer_length}, is too large")
            if len(left_win) != len(right_win):
                raise AssertionError(
                    f"k_mer_context, {k_mer_length}, is too large")

            left_windows[ps] = str(left_win)
            right_windows[ps] = str(right_win)
        print(f"featurizing local context of {gene}")

    feat = pd.DataFrame(index=X.index)
    feat["gene_left_win"] = left_windows
    feat["gene_right_win"] = right_windows

    feature_sets = {}
    for prefix in ("gene_left_win", "gene_right_win"):
        get_all_order_nuc_features(
            feat[prefix],
            feature_sets,
            learn_options,
            learn_options["order"],
            max_index_to_use=maxsize,
            prefix=prefix,
        )
    return feature_sets
def local_gene_seq_features(gene_names, learn_options, X):
    '''
    Build nucleotide features from the gene sequence flanking each guide.

    Each guide in ``X['30mer']`` is located in the reverse complement of its
    gene's sequence (sense guides are reverse-complemented before the search).
    The ``k_mer_length`` bases immediately left and right of the match become
    the ``gene_left_win`` / ``gene_right_win`` strings, which are then passed
    to ``get_all_order_nuc_features``.

    Rows are addressed by position, matching the positional row numbers that
    ``np.where`` yields for ``gene_names.values``.
    NOTE(review): assumes ``gene_names`` and ``X`` have the same length and
    row order -- confirm against the caller.

    Parameters:
        gene_names: pandas Series of gene names, one per row of ``X``.
        learn_options: mapping providing 'include_gene_guide_feature' (window
            size in nucleotides) and "order".
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        The ``feature_sets`` dict passed to ``get_all_order_nuc_features``.

    Raises:
        AssertionError: guide not found, match differs from the guide, empty
            left window, or windows of unequal length.  Raised explicitly so
            the checks still run under ``python -O``.
    '''
    print("building local gene sequence features")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options['include_gene_guide_feature']
    too_large = "k_mer_context, %s, is too large" % k_mer_length

    names = gene_names.values
    thirtymers = X['30mer'].values
    strands = X['Strand'].values
    # Filled by row position, then attached to the frame in one go
    # (replaces the deprecated per-cell `.ix` writes).
    left_windows = [""] * len(X.index)
    right_windows = [""] * len(X.index)

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        for ps in np.where(names == gene)[0]:
            guide_seq = Seq.Seq(thirtymers[ps])
            strand = strands[ps]
            if strand == 'sense':
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide, in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                raise AssertionError("could not find guide in gene")
            guide_end = ind + len(guide_seq)
            if gene_seq[ind:guide_end] != guide_seq:
                raise AssertionError("match not right")

            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[guide_end:(guide_end + k_mer_length)]

            if strand == 'antisense':
                # it's arbitrary which of sense and anti-sense we flip, we just want
                # to keep them in the same relative alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            if str(left_win) == "":
                raise AssertionError(too_large)
            if len(left_win) != len(right_win):
                raise AssertionError(too_large)

            left_windows[ps] = str(left_win)
            right_windows[ps] = str(right_win)
        print("featurizing local context of %s" % (gene))

    feat = pandas.DataFrame(index=X.index)
    feat["gene_left_win"] = left_windows
    feat["gene_right_win"] = right_windows

    feature_sets = {}
    get_all_order_nuc_features(feat["gene_left_win"],
                               feature_sets,
                               learn_options,
                               learn_options["order"],
                               max_index_to_use=sys.maxint,
                               prefix="gene_left_win")
    get_all_order_nuc_features(feat["gene_right_win"],
                               feature_sets,
                               learn_options,
                               learn_options["order"],
                               max_index_to_use=sys.maxint,
                               prefix="gene_right_win")
    return feature_sets
Example #6
0
def get_micro_homology_features(gene_names, X):
    """Compute microhomology and out-of-frame scores for every guide.

    Each guide (``X["30mer"]``) is searched for in its gene's sequence.  Sense
    guides are looked up on the sequence returned by
    ``util.get_gene_sequence`` first, all other guides on its reverse
    complement first; the other orientation is the fallback.  When found, the
    9 bases to the left, the guide itself and the 21 bases to the right are
    joined and passed to ``compute_score``.

    Both orientations are built once per gene.  The search order therefore
    depends only on the guide's own strand, not on which guides were
    processed before it.

    Rows are addressed by position, matching the row numbers ``np.where``
    yields for ``gene_names.values``.
    NOTE(review): assumes ``gene_names`` and ``X`` have the same length and
    row order -- confirm against the caller.

    Parameters:
        gene_names: pandas Series of gene names, one per row of ``X``.
        X: DataFrame with ``"30mer"`` and ``"Strand"`` columns.

    Returns:
        Float DataFrame indexed like ``X`` with columns ``mh_score`` and
        ``oof_score``; guides found in neither orientation get 0 for both.

    Raises:
        AssertionError: if the matched slice differs from the guide, a
            flanking window does not have the required length, or the joined
            sequence is not 60 characters long.
    """
    print("building microhomology features")

    # number of nucleotides to take to the left and right of the guide
    K_MER_LENGTH_LEFT = 9
    K_MER_LENGTH_RIGHT = 21

    names = gene_names.values
    thirtymers = X["30mer"].values
    strands = X["Strand"].values
    # Collected by row position; "" marks rows that are never visited.
    mh_scores = [""] * len(X.index)
    oof_scores = [""] * len(X.index)

    for gene in gene_names.unique():
        forward_seq = Seq.Seq(util.get_gene_sequence(gene))
        reverse_seq = forward_seq.reverse_complement()
        guide_inds = np.where(names == gene)[0]
        print(
            f"getting microhomology for all {len(guide_inds)} guides in gene {gene}"
        )
        for ps in guide_inds:
            guide_seq = Seq.Seq(thirtymers[ps])
            if strands[ps] == "sense":
                first_try, second_try = forward_seq, reverse_seq
            else:
                first_try, second_try = reverse_seq, forward_seq

            # figure out the sequence to the left and right of this guide, in the gene
            gene_seq = first_try
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = second_try
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                mh_score = 0
                oof_score = 0
            else:
                guide_end = ind + len(guide_seq)
                if gene_seq[ind:guide_end] != guide_seq:
                    raise AssertionError("match not right")

                left_win = gene_seq[(ind - K_MER_LENGTH_LEFT):ind]
                right_win = gene_seq[guide_end:(guide_end + K_MER_LENGTH_RIGHT)]
                # Windows that run off either end of the gene come back short.
                if len(str(left_win)) != K_MER_LENGTH_LEFT:
                    raise AssertionError()
                if len(str(right_win)) != K_MER_LENGTH_RIGHT:
                    raise AssertionError()
                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                if len(sixtymer) != 60:
                    raise AssertionError("should be of length 60")
                mh_score, oof_score = compute_score(sixtymer)

            mh_scores[ps] = mh_score
            oof_scores[ps] = oof_score
        print(f"computed microhomology of {str(gene)}")

    feat = pd.DataFrame(index=X.index)
    feat["mh_score"] = mh_scores
    feat["oof_score"] = oof_scores
    return pd.DataFrame(feat, dtype="float")
Example #7
0
def local_gene_seq_features(gene_names, learn_options, X):
    '''
    Extract the gene context around each guide and featurize it.

    The guide in ``X['30mer']`` (reverse-complemented when its strand is
    'sense') is searched for in the reverse complement of the gene sequence
    returned by ``util.get_gene_sequence``.  ``k_mer_length`` bases on each
    side of the hit are stored as strings and featurized with
    ``get_all_order_nuc_features``.

    Row access is positional, since the row numbers are produced by
    ``np.where`` over ``gene_names.values``.
    NOTE(review): assumes ``gene_names`` and ``X`` have the same length and
    row order -- confirm against the caller.

    Parameters:
        gene_names: pandas Series of gene names, one per row of ``X``.
        learn_options: mapping providing 'include_gene_guide_feature' and
            "order".
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        The ``feature_sets`` dict passed to ``get_all_order_nuc_features``
        for the prefixes "gene_left_win" and "gene_right_win".

    Raises:
        AssertionError: guide not found in the gene, matched slice differs
            from the guide, left window empty, or windows of unequal length.
            Raised explicitly so ``python -O`` does not strip the checks.
    '''
    print("building local gene sequence features")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length = learn_options['include_gene_guide_feature']

    names = gene_names.values
    guides = X['30mer'].values
    strands = X['Strand'].values
    n_rows = len(X.index)
    # One slot per row; written by position and attached to the frame at the
    # end instead of using the deprecated `.ix` indexer cell by cell.
    windows = {"gene_left_win": [""] * n_rows,
               "gene_right_win": [""] * n_rows}

    for gene in gene_names.unique():
        gene_seq = Seq.Seq(util.get_gene_sequence(gene)).reverse_complement()
        for ps in np.where(names == gene)[0]:
            strand = strands[ps]
            guide_seq = Seq.Seq(guides[ps])
            if strand == 'sense':
                guide_seq = guide_seq.reverse_complement()

            # figure out the sequence to the left and right of this guide, in the gene
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                raise AssertionError("could not find guide in gene")
            stop = ind + len(guide_seq)
            if gene_seq[ind:stop] != guide_seq:
                raise AssertionError("match not right")

            left_win = gene_seq[(ind - k_mer_length):ind]
            right_win = gene_seq[stop:(stop + k_mer_length)]

            if strand == 'antisense':
                # it's arbitrary which of sense and anti-sense we flip, we just want
                # to keep them in the same relative alphabet/direction
                left_win = left_win.reverse_complement()
                right_win = right_win.reverse_complement()

            if str(left_win) == "" or len(left_win) != len(right_win):
                raise AssertionError(
                    "k_mer_context, %s, is too large" % k_mer_length)

            windows["gene_left_win"][ps] = str(left_win)
            windows["gene_right_win"][ps] = str(right_win)
        print("featurizing local context of %s" % (gene))

    feat = pandas.DataFrame(index=X.index)
    feat["gene_left_win"] = windows["gene_left_win"]
    feat["gene_right_win"] = windows["gene_right_win"]

    feature_sets = {}
    get_all_order_nuc_features(feat["gene_left_win"], feature_sets, learn_options,
                               learn_options["order"], max_index_to_use=sys.maxint,
                               prefix="gene_left_win")
    get_all_order_nuc_features(feat["gene_right_win"], feature_sets, learn_options,
                               learn_options["order"], max_index_to_use=sys.maxint,
                               prefix="gene_right_win")
    return feature_sets
Example #8
0
def get_micro_homology_features(gene_names, learn_options, X):
    '''
    Compute microhomology and out-of-frame scores for every guide.

    Each guide (``X['30mer']``) is searched for in its gene's sequence.  Sense
    guides are looked up on the sequence returned by
    ``util.get_gene_sequence`` first, all other guides on its reverse
    complement first; the other orientation is the fallback.  When found, the
    9 bases to the left, the guide and the 21 bases to the right are joined
    and passed to ``microhomology.compute_score``.

    Both orientations are built once per gene, so the search order depends
    only on the guide's own strand and not on previously processed guides.

    Rows are addressed by position, matching the row numbers ``np.where``
    yields for ``gene_names.values``.
    NOTE(review): assumes ``gene_names`` and ``X`` have the same length and
    row order -- confirm against the caller.

    Parameters:
        gene_names: pandas Series of gene names, one per row of ``X``.
        learn_options: not used by this function.
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        Float DataFrame indexed like ``X`` with columns ``mh_score`` and
        ``oof_score``; guides found in neither orientation get 0 for both.

    Raises:
        AssertionError: matched slice differs from the guide, a flanking
            window has the wrong length, or the joined sequence is not 60
            characters long.  Raised explicitly so the checks survive
            ``python -O``.
    '''
    print("building microhomology features")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    names = gene_names.values
    thirtymers = X['30mer'].values
    strands = X['Strand'].values
    # Collected by row position; "" marks rows that are never visited.
    mh_scores = [""] * len(X.index)
    oof_scores = [""] * len(X.index)

    for gene in gene_names.unique():
        forward_seq = Seq.Seq(util.get_gene_sequence(gene))
        reverse_seq = forward_seq.reverse_complement()
        guide_inds = np.where(names == gene)[0]
        print("getting microhomology for all %d guides in gene %s" % (len(guide_inds), gene))
        for ps in guide_inds:
            guide_seq = Seq.Seq(thirtymers[ps])
            if strands[ps] == 'sense':
                first_try, second_try = forward_seq, reverse_seq
            else:
                first_try, second_try = reverse_seq, forward_seq

            # figure out the sequence to the left and right of this guide, in the gene
            gene_seq = first_try
            ind = gene_seq.find(guide_seq)
            if ind == -1:
                gene_seq = second_try
                ind = gene_seq.find(guide_seq)

            if ind == -1:
                # guide is in neither orientation of the gene
                mh_score = 0
                oof_score = 0
            else:
                guide_end = ind + len(guide_seq)
                if gene_seq[ind:guide_end] != guide_seq:
                    raise AssertionError("match not right")

                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[guide_end:(guide_end + k_mer_length_right)]
                # Windows that run off either end of the gene come back short.
                if len(str(left_win)) != k_mer_length_left:
                    raise AssertionError()
                if len(str(right_win)) != k_mer_length_right:
                    raise AssertionError()

                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                if len(sixtymer) != 60:
                    raise AssertionError("should be of length 60")
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            mh_scores[ps] = mh_score
            oof_scores[ps] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = mh_scores
    feat["oof_score"] = oof_scores
    return pandas.DataFrame(feat, dtype='float')
def get_micro_homology_features(gene_names, learn_options, X):
    '''
    Score each guide's surrounding 60-mer with ``microhomology.compute_score``.

    For every gene the sequence from ``util.get_gene_sequence`` and its
    reverse complement are prepared once.  A guide (``X['30mer']``) is
    searched for in both: 'sense' guides try the sequence as returned first,
    any other strand tries the reverse complement first.  On a hit, the 9
    bases before it, the guide and the 21 bases after it form the 60-mer that
    is scored.

    Earlier guides cannot influence the orientation tried first, because the
    gene sequence is never modified inside the per-guide loop.

    Row access is positional, since the row numbers are produced by
    ``np.where`` over ``gene_names.values``.
    NOTE(review): assumes ``gene_names`` and ``X`` have the same length and
    row order -- confirm against the caller.

    Parameters:
        gene_names: pandas Series of gene names, one per row of ``X``.
        learn_options: not used by this function.
        X: DataFrame with '30mer' and 'Strand' columns.

    Returns:
        Float DataFrame indexed like ``X`` with columns ``mh_score`` and
        ``oof_score``; both are 0 for guides found in neither orientation.

    Raises:
        AssertionError: matched slice differs from the guide, a flanking
            window has the wrong length, or the joined sequence is not 60
            characters long.  Raised explicitly so the checks survive
            ``python -O``.
    '''
    print("building microhomology features")

    # number of nucleotides to take to the left and right of the guide
    k_mer_length_left = 9
    k_mer_length_right = 21

    names = gene_names.values
    guides = X['30mer'].values
    strands = X['Strand'].values
    n_rows = len(X.index)
    # Collected by row position; "" marks rows that are never visited.
    scores = {"mh_score": [""] * n_rows, "oof_score": [""] * n_rows}

    for gene in gene_names.unique():
        as_given = Seq.Seq(util.get_gene_sequence(gene))
        flipped = as_given.reverse_complement()
        guide_inds = np.where(names == gene)[0]
        print("getting microhomology for all %d guides in gene %s" % (
            len(guide_inds), gene))
        for ps in guide_inds:
            guide_seq = Seq.Seq(guides[ps])
            if strands[ps] == 'sense':
                search_order = (as_given, flipped)
            else:
                search_order = (flipped, as_given)

            # figure out the sequence to the left and right of this guide, in the gene
            ind = -1
            for gene_seq in search_order:
                ind = gene_seq.find(guide_seq)
                if ind != -1:
                    break

            if ind == -1:
                # guide is in neither orientation of the gene
                mh_score = 0
                oof_score = 0
            else:
                stop = ind + len(guide_seq)
                if gene_seq[ind:stop] != guide_seq:
                    raise AssertionError("match not right")

                left_win = gene_seq[(ind - k_mer_length_left):ind]
                right_win = gene_seq[stop:(stop + k_mer_length_right)]
                # Windows that run off either end of the gene come back short.
                if len(str(left_win)) != k_mer_length_left:
                    raise AssertionError()
                if len(str(right_win)) != k_mer_length_right:
                    raise AssertionError()

                sixtymer = str(left_win) + str(guide_seq) + str(right_win)
                if len(sixtymer) != 60:
                    raise AssertionError("should be of length 60")
                mh_score, oof_score = microhomology.compute_score(sixtymer)

            scores["mh_score"][ps] = mh_score
            scores["oof_score"][ps] = oof_score
        print("computed microhomology of %s" % (str(gene)))

    feat = pandas.DataFrame(index=X.index)
    feat["mh_score"] = scores["mh_score"]
    feat["oof_score"] = scores["oof_score"]
    return pandas.DataFrame(feat, dtype='float')