Example #1
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True, length_audit=True):

    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options.keys(), "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')

    Xdf, Y, gene_position, target_genes = azimuth.load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
        print "WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)"
        for i in range(Xdf.shape[0]):
            Xdf['30mer'].iloc[i] = azimuth.util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"])
        # to_keep = Xdf['30mer'].isnull() == False
        # Xdf = Xdf[to_keep]
        # gene_position = gene_position[to_keep]
        # Y = Y[to_keep]
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:]) # chop the first nucleotide

    if 'left_right_guide_ind' in learn_options and learn_options['left_right_guide_ind'] is not None:
        seq_start, seq_end, expected_length = learn_options['left_right_guide_ind']
        Xdf['30mer'] = Xdf['30mer'].apply(lambda seq: seq[seq_start:seq_end])

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit=pam_audit, length_audit=length_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
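A minimal usage sketch for this setup variant. Only the learn_options keys that the function itself touches are shown ('testing_non_binary_target_name' and 'seed'); a real call also needs whatever keys shared_setup, azimuth.load_data.from_file, and feat.featurize_data expect, and the data file path below is a placeholder.

# Hypothetical call; learn_options will generally need more keys than shown here.
learn_options = {
    "testing_non_binary_target_name": "ranks",  # must be 'ranks', 'raw', or 'thrs'
    "seed": 12345,                              # consumed by np.random.seed above
}
Y, feature_sets, target_genes, learn_options, num_proc = setup(
    test=False,
    order=2,
    learn_options=learn_options,
    data_file="path/to/training_data.csv",  # placeholder path
)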
def setup(test=False, order=1, learn_options=None, data_file=None, pam_audit=True):

    num_proc = shared_setup(learn_options, order, test)

    assert "testing_non_binary_target_name" in learn_options.keys(), "need this in order to get metrics, though used to be not needed, so you may newly see this error"
    if learn_options["testing_non_binary_target_name"] not in ['ranks', 'raw', 'thrs']:
        raise Exception('learn_options["testing_non_binary_target_name"] must be in ["ranks", "raw", "thrs"]')
    
    Xdf, Y, gene_position, target_genes = azimuth.load_data.from_file(data_file, learn_options)
    learn_options['all_genes'] = target_genes

    if test:
        learn_options["order"] = 1

    if 'convert_30mer_to_31mer' in learn_options and learn_options['convert_30mer_to_31mer'] is True:
        print "WARNING!!! converting 30 mer to 31 mer (and then cutting off first nucleotide to go back to 30mer with a right shift)"
        for i in range(Xdf.shape[0]):
            Xdf['30mer'].iloc[i] = azimuth.util.convert_to_thirty_one(Xdf.iloc[i]["30mer"], Xdf.index.values[i][1], Xdf.iloc[i]["Strand"])
        # to_keep = Xdf['30mer'].isnull() == False
        # Xdf = Xdf[to_keep]
        # gene_position = gene_position[to_keep]
        # Y = Y[to_keep]
        Xdf["30mer"] = Xdf["30mer"].apply(lambda x: x[1:]) # chop the first nucleotide

    feature_sets = feat.featurize_data(Xdf, learn_options, Y, gene_position, pam_audit)
    np.random.seed(learn_options['seed'])

    return Y, feature_sets, target_genes, learn_options, num_proc
def predict(seq,
            aa_cut=-1,
            percent_peptide=-1,
            model=None,
            model_file=None,
            pam_audit=True):
    """
    If pam_audit is False, the GG at the expected PAM position is not checked;
    this is useful when predicting on PAM mismatches, such as off-target sites.
    """
    # assert not (model is None and model_file is None), "you have to specify either a model or a model_file"
    print(aa_cut, percent_peptide)
    if model_file is None:
        azimuth_saved_model_dir = os.path.join(
            os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None
                                             and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'

        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'],
                           data=zip(seq, ['NA' for x in range(len(seq))]))

    if np.all(percent_peptide != -1) and (percent_peptide is not None
                                          and aa_cut is not None):
        gene_position = pandas.DataFrame(
            columns=[u'Percent Peptide', u'Amino Acid Cut position'],
            data=zip(percent_peptide, aa_cut))
    else:
        gene_position = pandas.DataFrame(
            columns=[u'Percent Peptide', u'Amino Acid Cut position'],
            data=zip(np.ones(seq.shape[0]) * -1,
                     np.ones(seq.shape[0]) * -1))

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(),
                                       gene_position, pam_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(
        feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)
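A minimal way to call this predict variant, assuming the azimuth package and its bundled saved_models directory are importable; with the default aa_cut=-1 and percent_peptide=-1 it falls through to the V3_model_nopos pickle. The 30-mer guides below are placeholders laid out so that the GG expected by pam_audit is in place.

import numpy as np

# Placeholder 30-mers (4 nt context + 20 nt guide + NGG PAM + 3 nt context).
seqs = np.array([
    "ACAGCTGATCTCCAGATATGACCATGGGTT",
    "CAGCTGATCTCCAGATATGACCATGGGTTT",
])
scores = predict(seqs)  # no cut-site info, so V3_model_nopos is selected
print(scores)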
def predict(seq, aa_cut=-1, percent_peptide=-1, model=None, model_file=None, pam_audit=True):
    """
    If pam_audit is False, the GG at the expected PAM position is not checked;
    this is useful when predicting on PAM mismatches, such as off-target sites.
    """
    # assert not (model is None and model_file is None), "you have to specify either a model or a model_file"
    assert isinstance(seq, (str, np.ndarray)), "Please ensure seq is a numpy array"
    if isinstance(seq, np.ndarray) and len(seq) > 0:
        assert isinstance(seq[0], str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"
    assert isinstance(aa_cut, (int, np.integer, np.ndarray)), "Please ensure aa_cut is a numpy array"
    if isinstance(aa_cut, np.ndarray) and len(aa_cut) > 0:
        assert isinstance(aa_cut[0], (int, np.integer))
    assert isinstance(percent_peptide, (int, np.integer, np.ndarray)), "Please ensure percent_peptide is a numpy array"
    if isinstance(percent_peptide, np.ndarray) and len(percent_peptide) > 0:
        assert isinstance(percent_peptide[0], (int, np.integer))

    print(aa_cut, percent_peptide)
    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'

        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=zip(seq, ['NA' for x in range(len(seq))]))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'], data=zip(percent_peptide, aa_cut))
    else:
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'], data=zip(np.ones(seq.shape[0])*-1, np.ones(seq.shape[0])*-1))

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    return model.predict(inputs)
Example #5
def extract_features(Xdf, Y, gene_position, conservation_scores, order=2):
    learn_options = {
        'nuc_features': True,
        'num_proc': 1,
        'order': order,
        'gc_features': True,
        'include_pi_nuc_feat': True,
        "include_gene_position": True,
        "include_NGGX_interaction": True,
        "include_Tm": True,
        'include_known_pairs': False,
        'include_microhomology': False,
        'ignore_gene_level_for_inner_loop': True,  # <- what?
        "include_strand": False,
        "include_gene_feature": False,
        "include_gene_guide_feature": 0,
        "include_gene_effect": False,
        "include_drug": False,
        "include_sgRNAscore": False,
        "normalize_features": False
    }

    features = featurize_data(Xdf,
                              learn_options,
                              Y,
                              gene_position,
                              pam_audit=True,
                              length_audit=True)

    conservation_scores.index = features['_nuc_pd_Order1'].index
    features['conservation_scores'] = conservation_scores
    y = Y['score_drug_gene_rank'].astype('float32').values

    # we need the genes associated to the features to do cv data selection
    genes = features['conservation_scores'].index.get_level_values(
        1).to_series().reset_index(drop=True)

    combined_features, dim, dimsum, feature_names = concatenate_feature_sets(
        features)
    combined_features = combined_features.astype('float32')

    return combined_features, y, genes, feature_names
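A sketch of how this helper might be wired up, assuming Xdf, Y, and gene_position come from azimuth.load_data.from_file as in the setup functions above. The conservation_scores argument and its column name are hypothetical; the only hard requirement visible in the code is that it has exactly one row per guide in Xdf, because its index is overwritten to match the order-1 nucleotide features.

import numpy as np
import pandas
import azimuth.load_data

# data_file and learn_options as in the setup examples above (placeholders here).
Xdf, Y, gene_position, target_genes = azimuth.load_data.from_file(data_file, learn_options)

# One conservation score per guide; the column name is illustrative only.
conservation_scores = pandas.DataFrame({"conservation": np.zeros(Xdf.shape[0])})

X, y, genes, feature_names = extract_features(Xdf, Y, gene_position, conservation_scores, order=2)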
Example #6
def predict(seq,
            aa_cut=None,
            percent_peptide=None,
            model=None,
            model_file=None,
            pam_audit=True,
            length_audit=False,
            learn_options_override=None):
    """
    Args:
        seq: numpy array of 30 nt sequences.
        aa_cut: numpy array of amino acid cut positions (optional).
        percent_peptide: numpy array of percent peptide (optional).
        model: model instance to use for prediction (optional).
        model_file: file name of pickled model to use for prediction (optional).
        pam_audit: check PAM of each sequence.
        length_audit: check length of each sequence.
        learn_options_override: a dictionary indicating which learn_options to override (optional).

    Returns: a numpy array of predictions.
    """
    # assert not (model is None and model_file is None), "you have to specify either a model or a model_file"
    assert isinstance(seq, (np.ndarray)), "Please ensure seq is a numpy array"
    assert len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(
        seq[0], str
    ), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    if aa_cut is not None:
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert isinstance(
            aa_cut, (np.ndarray)), "Please ensure aa_cut is a numpy array"
        assert np.all(np.isreal(
            aa_cut)), "amino-acid cut position needs to be a real number"

    if percent_peptide is not None:
        assert len(
            percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert isinstance(
            percent_peptide,
            (np.ndarray)), "Please ensure percent_peptide is a numpy array"
        assert np.all(np.isreal(
            percent_peptide)), "percent_peptide needs to be a real number"

    if model_file is None:
        if np.any(percent_peptide == -1) or (percent_peptide is None
                                             and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'

        model_file = os.path.join('saved_models', model_name)
        print(model_file)
        with pkg_resources.resource_stream(__package__, model_file) as f:
            model = pickle.load(f, encoding='bytes')

    if model is None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f, encoding='bytes')
    else:
        model, learn_options = model

    learn_options["V"] = 2

    learn_options = override_learn_options(learn_options_override,
                                           learn_options)

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=['30mer', 'Strand'],
                           data=list(zip(seq,
                                         ['NA' for x in range(len(seq))])))

    if np.all(percent_peptide != -1) and (percent_peptide is not None
                                          and aa_cut is not None):
        gene_position = pandas.DataFrame(
            columns=['Percent Peptide', 'Amino Acid Cut position'],
            data=list(zip(percent_peptide, aa_cut)))
    else:
        gene_position = pandas.DataFrame(
            columns=['Percent Peptide', 'Amino Acid Cut position'],
            data=list(
                zip(np.ones(seq.shape[0]) * -1,
                    np.ones(seq.shape[0]) * -1)))

    feature_sets = feat.featurize_data(Xdf,
                                       learn_options,
                                       pandas.DataFrame(),
                                       gene_position,
                                       pam_audit=pam_audit,
                                       length_audit=length_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(
        feature_sets)

    # print "CRISPR"
    # pandas.DataFrame(inputs).to_csv("CRISPR.inputs.test.csv")
    # import ipdb; ipdb.set_trace()

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function())
    unique_preds = np.unique(preds)
    ok = False
    for pr in preds:
        if pr not in [0, 1]:
            ok = True
    assert ok, "model returned only 0s and 1s"
    return preds
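A usage sketch for this variant with per-guide positional information supplied, which routes model selection to the packaged V3_model_full pickle. The sequences, cut positions, and peptide percentages are placeholders, and learn_options_override is left at its default since no specific override keys are documented here.

import numpy as np

# Placeholder guides (30 nt, with GG in the PAM position so pam_audit passes)
# and illustrative per-guide cut-site information.
seqs = np.array([
    "ACAGCTGATCTCCAGATATGACCATGGGTT",
    "CAGCTGATCTCCAGATATGACCATGGGTTT",
])
aa_cut = np.array([2, 2])             # amino-acid cut positions (placeholder values)
percent_peptide = np.array([11, 13])  # percent peptide (placeholder values)

scores = predict(seqs, aa_cut=aa_cut, percent_peptide=percent_peptide)
print(scores)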
Example #7
def predict(seq,
            aa_cut=None,
            percent_peptide=None,
            model=None,
            model_file=None,
            pam_audit=True,
            length_audit=False,
            learn_options_override=None):
    """
    If pam_audit is False, the GG at the expected PAM position is not checked;
    this is useful when predicting on PAM mismatches, such as off-target sites.
    """
    # assert not (model is None and model_file is None), "you have to specify either a model or a model_file"
    assert isinstance(seq, (np.ndarray)), "Please ensure seq is a numpy array"
    assert len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(
        seq[0], str
    ), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    if aa_cut is not None:
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert isinstance(
            aa_cut, (np.ndarray)), "Please ensure aa_cut is a numpy array"
        assert np.all(np.isreal(
            aa_cut)), "amino-acid cut position needs to be a real number"

    if percent_peptide is not None:
        assert len(
            percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert isinstance(
            percent_peptide,
            (np.ndarray)), "Please ensure percent_peptide is a numpy array"
        assert np.all(np.isreal(
            percent_peptide)), "percent_peptide needs to be a real number"

    if model_file is None:
        azimuth_saved_model_dir = os.path.join(
            os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None
                                             and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'

        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model

    learn_options["V"] = 2

    learn_options = override_learn_options(learn_options_override,
                                           learn_options)

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'],
                           data=zip(seq, ['NA' for x in range(len(seq))]))

    if np.all(percent_peptide != -1) and (percent_peptide is not None
                                          and aa_cut is not None):
        gene_position = pandas.DataFrame(
            columns=[u'Percent Peptide', u'Amino Acid Cut position'],
            data=zip(percent_peptide, aa_cut))
    else:
        gene_position = pandas.DataFrame(
            columns=[u'Percent Peptide', u'Amino Acid Cut position'],
            data=zip(np.ones(seq.shape[0]) * -1,
                     np.ones(seq.shape[0]) * -1))

    feature_sets = feat.featurize_data(Xdf,
                                       learn_options,
                                       pandas.DataFrame(),
                                       gene_position,
                                       pam_audit=pam_audit,
                                       length_audit=length_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(
        feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function())
    unique_preds = np.unique(preds)
    ok = False
    for pr in preds:
        if pr not in [0, 1]:
            ok = True
    assert ok, "model returned only 0s and 1s"
    return preds
def predict(seq, aa_cut=None, percent_peptide=None, model=None, model_file=None, pam_audit=True, length_audit=False, learn_options_override=None):
    """
    If pam_audit is False, the GG at the expected PAM position is not checked;
    this is useful when predicting on PAM mismatches, such as off-target sites.
    """
    # assert not (model is None and model_file is None), "you have to specify either a model or a model_file"
    assert isinstance(seq, (np.ndarray)), "Please ensure seq is a numpy array"
    assert len(seq[0]) > 0, "Make sure that seq is not empty"
    assert isinstance(seq[0], str), "Please ensure input sequences are in string format, i.e. 'AGAG' rather than ['A' 'G' 'A' 'G'] or alternate representations"

    if aa_cut is not None:
        assert len(aa_cut) > 0, "Make sure that aa_cut is not empty"
        assert isinstance(aa_cut, (np.ndarray)), "Please ensure aa_cut is a numpy array"
        assert np.all(np.isreal(aa_cut)), "amino-acid cut position needs to be a real number"

    if percent_peptide is not None:
        assert len(percent_peptide) > 0, "Make sure that percent_peptide is not empty"
        assert isinstance(percent_peptide, (np.ndarray)), "Please ensure percent_peptide is a numpy array"
        assert np.all(np.isreal(percent_peptide)), "percent_peptide needs to be a real number"


    if model_file is None:
        azimuth_saved_model_dir = os.path.join(os.path.dirname(azimuth.__file__), 'saved_models')
        if np.any(percent_peptide == -1) or (percent_peptide is None and aa_cut is None):
            print("No model file specified, using V3_model_nopos")
            model_name = 'V3_model_nopos.pickle'
        else:
            print("No model file specified, using V3_model_full")
            model_name = 'V3_model_full.pickle'

        model_file = os.path.join(azimuth_saved_model_dir, model_name)

    if model is None:
        with open(model_file, 'rb') as f:
            model, learn_options = pickle.load(f)
    else:
        model, learn_options = model
        
    learn_options["V"] = 2

    learn_options = override_learn_options(learn_options_override, learn_options)

    # Y, feature_sets, target_genes, learn_options, num_proc = setup(test=False, order=2, learn_options=learn_options, data_file=test_filename)
    # inputs, dim, dimsum, feature_names = pd.concatenate_feature_sets(feature_sets)

    Xdf = pandas.DataFrame(columns=[u'30mer', u'Strand'], data=zip(seq, ['NA' for x in range(len(seq))]))

    if np.all(percent_peptide != -1) and (percent_peptide is not None and aa_cut is not None):
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'], data=zip(percent_peptide, aa_cut))
    else:
        gene_position = pandas.DataFrame(columns=[u'Percent Peptide', u'Amino Acid Cut position'], data=zip(np.ones(seq.shape[0])*-1, np.ones(seq.shape[0])*-1))

    feature_sets = feat.featurize_data(Xdf, learn_options, pandas.DataFrame(), gene_position, pam_audit=pam_audit, length_audit=length_audit)
    inputs, dim, dimsum, feature_names = azimuth.util.concatenate_feature_sets(feature_sets)

    # call to scikit-learn, returns a vector of predicted values
    preds = model.predict(inputs)

    # also check that predictions are not 0/1 from a classifier.predict() (instead of predict_proba() or decision_function())
    unique_preds = np.unique(preds)
    ok = False
    for pr in preds:
        if pr not in [0,1]:
            ok = True
    assert ok, "model returned only 0s and 1s"
    return preds
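One more sketch, this time pre-loading the pickled model and passing it in directly: the model argument is expected to be the (estimator, learn_options) pair stored in the saved-model pickle, since the function unpacks it with "model, learn_options = model". The path layout mirrors the saved_models lookup used earlier in this section, and the guide sequence is a placeholder; with no cut-site information supplied, the position-free V3_model_nopos pickle is the appropriate one to reuse.

import os
import pickle
import numpy as np
import azimuth

# Load the (estimator, learn_options) pair once and reuse it across calls.
model_path = os.path.join(os.path.dirname(azimuth.__file__),
                          'saved_models', 'V3_model_nopos.pickle')
with open(model_path, 'rb') as f:
    model_and_options = pickle.load(f)

seqs = np.array(["ACAGCTGATCTCCAGATATGACCATGGGTT"])  # placeholder guide
scores = predict(seqs, model=model_and_options)
print(scores)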