def calculateScore(seq, model):
    """Return the Rule Set 2 on-target score of ``seq`` as a string.

    Args:
        seq: Target sequence. It is only scored when the two characters at
            0-based positions 25 and 26 are ``GG`` (the NGG PAM).
        model: Forwarded unchanged as the fourth positional argument of
            ``model_comparison.predict``.

    Returns:
        ``str(score)`` for an NGG sequence; ``None`` otherwise, after a
        notice has been written to stderr.
    """
    if seq[25:27] != 'GG':
        # sys.stderr.write behaves the same on Python 2 and 3, whereas the
        # old "print >> sys.stderr, ..." form raises TypeError on Python 3.
        sys.stderr.write(
            'Calculates on-target scores for sgRNAs with NGG PAM only.\n')
        return None
    # The two -1 values fill the positional slots between the sequence and
    # the model (presumably "no amino-acid cut / percent-peptide data").
    score = model_comparison.predict(seq, -1, -1, model)
    return str(score)
def main():
    """Score the bundled test data and write the predictions as one CSV row.

    Command-line arguments:
        sys.argv[1]: passed to ``model_comparison.predict`` as ``model_file``.
        sys.argv[2]: path of the CSV file to create or overwrite.
    """
    g, cp, pp = testdata.get_test_data("data/testdata.xlsx")
    results = model_comparison.predict(g, cp, pp, model_file=sys.argv[1])
    # A previous hard-coded variant used
    # model_file='saved_models/PAM_nopos.pickle' and wrote to "NGGXX.csv".
    # The file is opened only after prediction succeeded, and the context
    # manager guarantees the row is flushed and the handle closed.
    with open(sys.argv[2], "w") as out_file:
        csv.writer(out_file).writerow(results)
def calculateScore(seq, model):
    """Compute the Rule Set 2 on-target score for an NGG sgRNA target.

    Args:
        seq: Target sequence; ``seq[25:27]`` must equal ``'GG'`` for the
            sequence to be scored.
        model: Passed through to ``model_comparison.predict`` as its fourth
            positional argument.

    Returns:
        The score converted with ``str()``, or ``None`` when the sequence
        does not carry ``GG`` at positions 25-26 (a message is written to
        stderr in that case).
    """
    has_ngg_pam = seq[25:27] == 'GG'
    if not has_ngg_pam:
        # Portable replacement for the Python-2-only
        # "print >> sys.stderr, ..." statement; the output is identical.
        sys.stderr.write(
            'Calculates on-target scores for sgRNAs with NGG PAM only.\n')
        return None
    score = model_comparison.predict(seq, -1, -1, model)
    return str(score)
Exemple #4
0
def calcFusiDoench(seqs):
    """Score 30-mer target sequences with the saved V3 "nopos" model.

    Each input is a 30mer: 4bp 5' context, 20bp guide, 3bp PAM, 3bp 3'
    context. Based on source code sent by John Doench.

    Learn options recorded with the original code (abridged): V=3, order=2,
    cv='gene', training_metric='spearmanr', adaboost_loss='ls',
    adaboost_alpha=0.5, seed=1, NDGC_k=10, num_proc=8.
    Enabled: gc_features, nuc_features, include_Tm, include_pi_nuc_feat,
    include_NGGX_interaction.
    Disabled: include_strand, include_gene_position, include_gene_effect,
    include_gene_feature, include_known_pairs, include_drug,
    include_microhomology, include_sgRNAscore, normalize_features,
    flipV1target.
    Train/test genes: CD5, CD45, THY1, H2-K, CD28, CD43, CD33, CD13, CD15,
    HPRT1, CCDC101, MED12, TADA2B, TADA1, CUL3, NF1, NF2.

    Args:
        seqs: Iterable of sequence strings.

    Returns:
        A list with one int per input: ``int(round(100 * score))``, or -1
        for any sequence that contains an "N".
    """
    aa_cut = 0
    per_peptide = 0
    # Pickle data must be read in binary mode, and the context manager closes
    # the handle. If loading fails, install sklearn like this:
    #   pip install scikit-learn==0.16.1
    # NOTE: pickle.load can execute arbitrary code; only load trusted models.
    with open(join(fusiDir, 'saved_models/V3_model_nopos.pickle'), 'rb') as f:
        model = pickle.load(f)
    res = []
    for seq in seqs:
        if "N" in seq:
            res.append(-1)  # can't do Ns
            continue

        if seq[25:27] != "GG":
            # Non-NGG PAMs are not rejected; positions 25 and 26 are
            # overwritten with G so the sequence can still be scored.
            bases = list(seq)
            bases[25] = "G"
            bases[26] = "G"
            seq = "".join(bases)
        score = model_comparison.predict(seq, aa_cut, per_peptide, model=model)
        res.append(int(round(100 * score)))
    return res
Exemple #5
0
def get_rs2_score(seq, model_file):
    """Return the Rule Set 2 score of a 30-mer using a pickled model.

    Args:
        seq: Target sequence; it is upper-cased before any check.
        model_file: Path of the pickled model to load.

    Returns:
        The value returned by ``model_comparison.predict`` when the sequence
        is 30 characters long and has ``GG`` at 0-based positions 25-26.
        ``None`` when the length is not 30 (message on stdout) or when the
        PAM is not NGG (message on stderr).

    Raises:
        Exception: if the model file cannot be opened or unpickled.
    """
    seq = seq.upper()
    if len(seq) != 30:
        print("Please enter a 30mer sequence.")
        return None
    try:
        # NOTE: pickle.load can execute arbitrary code; only load trusted
        # model files.
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
    except Exception as err:
        # "except Exception" (rather than a bare except) lets
        # KeyboardInterrupt/SystemExit propagate, and the underlying error
        # is appended so the real cause is not lost.
        raise Exception(
            "could not find model stored to file %s. Perhaps the scikit-learn package is of another version. (%s)"
            % (model_file, err))
    if seq[25:27] != 'GG':
        # Portable on Python 2 and 3, unlike "print >> sys.stderr, ...".
        sys.stderr.write(
            'Calculates on-target scores for sgRNAs with NGG PAM only.\n')
        return None
    return model_comparison.predict(seq, -1, -1, model=model)
def calcFusiDoench(seqs):
    """Return 0-100 scores for 30-mer targets from the saved V3 "nopos" model.

    Input is a 30mer: 4bp 5' context, 20bp guide, 3bp PAM, 3bp 3' context.
    Based on source code sent by John Doench.

    Learn options recorded with the original code (abridged): V=3, order=2,
    cv='gene', training_metric='spearmanr', adaboost_loss='ls',
    adaboost_alpha=0.5, seed=1, NDGC_k=10, num_proc=8.
    Enabled: gc_features, nuc_features, include_Tm, include_pi_nuc_feat,
    include_NGGX_interaction.
    Disabled: include_strand, include_gene_position, include_gene_effect,
    include_gene_feature, include_known_pairs, include_drug,
    include_microhomology, include_sgRNAscore, normalize_features,
    flipV1target.
    Train/test genes: CD5, CD45, THY1, H2-K, CD28, CD43, CD33, CD13, CD15,
    HPRT1, CCDC101, MED12, TADA2B, TADA1, CUL3, NF1, NF2.

    Args:
        seqs: Iterable of sequence strings.

    Returns:
        A list with one int per input: ``int(round(100 * score))``, or -1
        when the sequence still contains an "N" after the PAM overwrite.
    """
    aa_cut = 0
    per_peptide = 0
    model_path = join(fusiDir, "saved_models/V3_model_nopos.pickle")
    # Binary mode is what pickle expects, and "with" closes the handle.
    # If loading fails, install sklearn like this:
    #   pip install scikit-learn==0.16.1
    # NOTE: pickle.load can execute arbitrary code; only load trusted models.
    with open(model_path, "rb") as f:
        model = pickle.load(f)
    res = []
    for seq in seqs:
        if seq[25:27] != "GG":
            # Non-NGG PAMs are coerced rather than rejected: positions 25
            # and 26 are overwritten with G.
            bases = list(seq)
            bases[25] = "G"
            bases[26] = "G"
            seq = "".join(bases)
        # The N check runs after the overwrite, so an N sitting only at
        # position 25 or 26 has already been replaced and is still scored.
        if "N" in seq:
            res.append(-1)
            continue
        score = model_comparison.predict(seq, aa_cut, per_peptide, model=model)
        res.append(int(round(100 * score)))
    return res
        help='Amino acid cut position of sgRNA')
    parser.add_argument('--per-peptide',
        type=float,
        default=None,
        help='Percentage of protein cut by sgRNA')
    return parser

if __name__ == '__main__':
    # Command-line entry point: score one 30-mer given on the command line.
    args = get_parser().parse_args()
    seq = args.seq.upper()
    if len(seq) != 30:
        print("Please enter a 30mer sequence.")
        sys.exit(1)
    aa_cut = args.aa_cut
    per_peptide = args.per_peptide
    model_file_1 = '../saved_models/V3_model_nopos.pickle'
    model_file_2 = '../saved_models/V3_model_full.pickle'
    # The "full" model is only used when both positional features were
    # supplied; otherwise fall back to the "nopos" model.
    if aa_cut is None or per_peptide is None:
        model_file = model_file_1
    else:
        model_file = model_file_2
    try:
        # NOTE: pickle.load can execute arbitrary code; only load trusted
        # model files.
        with open(model_file, 'rb') as f:
            model = pickle.load(f)
    except Exception as err:
        # Keep the original message but append the real cause, and let
        # KeyboardInterrupt/SystemExit propagate (a bare except would not).
        raise Exception(
            "could not find model stored to file %s (%s)" % (model_file, err))
    if seq[25:27] == 'GG':
        score = model_comparison.predict(seq, aa_cut, per_peptide, model=model)
        print('Rule set 2 score: %.4f' % (score))
    else:
        # Portable on Python 2 and 3, unlike "print >> sys.stderr, ...".
        sys.stderr.write(
            'Calculates on-target scores for sgRNAs with NGG PAM only.\n')
Exemple #8
0
def _rewrite_construct(construct):
    """Return characters 1-25 of ``construct``, a literal 'G', then 27-30.

    This drops character 0 and replaces character 26 with 'G'
    (presumably forcing the second PAM base to G; verify against the data).
    """
    return construct[1:26] + 'G' + construct[27:31]


def _predict_rows(start, stop):
    """Run ``predict`` on rows ``start:stop`` of ``table``."""
    rows = slice(start, stop)
    return predict(
        np.asarray(table['Construct IDs'][rows].apply(_rewrite_construct).values, dtype=str),
        np.asarray(table["Amino Acid"][rows].values, dtype=float),
        np.asarray(table["Pct Pep"][rows].values, dtype=float))


# For each gene: predict on its row range of ``table`` and report the
# Spearman correlation between the predictions and the "PctRank" column.
NF1_PREDICT = _predict_rows(3366, 4212)
NF1_SCORE = np.asarray(table["PctRank"][3366:4212])
print("NF1")
print(scipy.stats.spearmanr(NF1_PREDICT, NF1_SCORE))
NF2_PREDICT = _predict_rows(4212, 4515)
NF2_SCORE = np.asarray(table["PctRank"][4212:4515])
print("NF2")
print(scipy.stats.spearmanr(NF2_PREDICT, NF2_SCORE))
# Disabled experiment, kept as an inert string literal exactly as before.
"""
for i in xrange(7):
    NUDT5_PREDICT = predict(np.asarray(table['Construct IDs'].apply(lambda x: x[i:25+i] + 'GG' + x[27+i:30+i]).values, dtype=str),
                                                           np.asarray(table["Amino Acid"].values, dtype=float),
                                                           np.asarray(table["Pct Pep"].values, dtype=float))
    NUDT5_SCORE = np.asarray(table["LFC"].values, dtype=float)
    # print "NUDT5"
    print i
    print scipy.stats.spearmanr(NUDT5_PREDICT, NUDT5_SCORE)
"""
PELP1_PREDICT = _predict_rows(4606, 6925)
PELP1_SCORE = np.asarray(table["PctRank"][4606:6925])
print("PELP1")
print(scipy.stats.spearmanr(PELP1_PREDICT, PELP1_SCORE))
TFRC_PREDICT = _predict_rows(6925, 7465)