Example #1
def train_model():

    TIL_n     = feat.count_TIL_corpus()
    decoy_n   = TIL_n*_DECOY_PROPORTION
    FP_n      = feat.count_TIL_false_pos()

    wiki_n    = feat.count_WIKI_corpus()
    skip_wiki_n = wiki_n // decoy_n

    # Keep the number of false positives within about the same order of magnitude as the TIL corpus
    skip_FP = FP_n // TIL_n
    print "Keeping every {}th value in FP".format(skip_FP)

    if FLAG_BUILD_DECOY_LIST:
        build_skip_query(skip_wiki_n)

    print "Loading features"
    features = Word2Vec.load(feat.f_features)
    dimension = 100  # default dimension

    ITR_decoy = query_skip_decoys()

    print "Building training set"
    ITR_train = list(feat.TIL_full_corpus_iter())

    print "Building the false positive set"
    ITR_FP    = list(feat.TIL_false_pos_iter(skip_FP))

    print "Building corpus iter"
    ITR = feat.chainer(ITR_train, ITR_FP, ITR_decoy)
    ITR = list(ITR)
    
    Y = np.zeros(len(ITR))
    Y[:TIL_n] = 1.0

    TTS = train_test_split
    x_train, x_test, y_train, y_test = TTS(ITR, Y, test_size=0.2)

    print "Proportion of answers {}/{}".format(y_train.sum(),
                                               y_test.sum())

    print "Calculating the wordVecs for train"
    vec_train = np.concatenate([getWordVecs(text,weight,
                                            features,dimension)
                                for text,weight in x_train])
        
    print "Building the scalar"
    scaler = preprocessing.StandardScaler().fit(vec_train)

    print "Saving the scaler"
    joblib.dump(scaler, f_norm_scale)

    print "Scaling train vectors"
    vec_train = scaler.transform(vec_train)

    print "Calculating the wordVecs for test"
    vec_test = np.concatenate([getWordVecs(text,weight,features,dimension)
                               for text,weight in x_test])

    print "Scaling test vectors"
    vec_test = scaler.transform(vec_test)

    print "Train size/TP in sample", vec_train.shape, (y_train==1).sum()
    print "Test  size/TP in sample", vec_test.shape, (y_test==1).sum()
    print "Training classifer"

    #from sklearn.linear_model import SGDClassifier as Classifier
    #from sklearn.linear_model import LogisticRegression as Classifier
    #from sklearn.linear_model import BayesianRidge as Classifier
    #from sklearn.naive_bayes import BernoulliNB as Classifier
    #from sklearn.naive_bayes import GaussianNB as Classifier
    #from sklearn.ensemble import RandomForestClassifier as Classifier
    from sklearn.ensemble import ExtraTreesClassifier as Classifier
    
    # This seems to be the best... but high FP rate
    #from sklearn.naive_bayes import BernoulliNB as Classifier    
 
    #clf = Classifier(loss='log', penalty='l1',verbose=2) # SGD
    #clf = Classifier(C=2500, verbose=2) # LogisticRegression
    #clf = Classifier() # Naive Bayes
    clf = Classifier(n_estimators=200,n_jobs=8) # ExtraTrees
    
    clf.fit(vec_train, y_train)  

    print 'Test Accuracy: %.3f'%clf.score(vec_test, y_test)

    idx_TP = np.array(y_test) > 0
    vec_TP = np.array(vec_test)[idx_TP]
    y_TP   = np.array(y_test)[idx_TP]
    print 'Test Accuracy on TP: %.3f'%clf.score(vec_TP, y_TP)

    vec_FP = np.array(vec_test)[~idx_TP]
    y_FP   = np.array(y_test)[~idx_TP]
    print 'Test Accuracy on FP: %.3f'%clf.score(vec_FP, y_FP)

    print "Saving the classifer"
    joblib.dump(clf, f_clf)

    #Create ROC curve
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt

    pred_probas = clf.predict_proba(vec_test)[:,1]
    fpr,tpr,_ = roc_curve(y_test, pred_probas)
    roc_auc = auc(fpr,tpr)
    plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.show()
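Example #1 persists the fitted StandardScaler and the trained classifier with joblib, but the load side is not shown in these listings. Below is a minimal sketch of how the two artifacts might be reloaded to score a new piece of text, reusing f_norm_scale, f_clf, getWordVecs, features, and dimension from the surrounding module (the helper name score_text is hypothetical):

from sklearn.externals import joblib

def score_text(text, weight, features, dimension=100):
    # Mirror the training pipeline: word vector -> StandardScaler -> classifier.
    scaler = joblib.load(f_norm_scale)
    clf = joblib.load(f_clf)
    vec = getWordVecs(text, weight, features, dimension)
    vec = scaler.transform(vec)
    # Probability of the positive (TIL) class.
    return clf.predict_proba(vec)[0, 1]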
Example #2
# Create the score table
cmd_template = '''
DROP TABLE IF EXISTS score;
CREATE TABLE IF NOT EXISTS score (
    idx INTEGER PRIMARY KEY NOT NULL,
    score FLOAT DEFAULT NULL
); 

INSERT INTO score (idx)
SELECT decoy_idx FROM decoy;
'''
#print "Templating"
#conn_decoy.executescript(cmd_template)

total_samples = feat.count_WIKI_corpus()

cmd_check_remaining = '''SELECT COUNT(*) FROM score WHERE score IS NULL'''
score_remaining = conn_decoy.execute(cmd_check_remaining).next()[0]
print "There are {} left to score".format(score_remaining)


def getWordVecs(text, weight, model, dimension):

    # The vector is the entropy-weighted average of each word
    vec = np.zeros(dimension).reshape((1, dimension))

    tokens = text.split()
    count = 0.0

    for single_entropy, word in zip(weight, tokens):
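The loop body of getWordVecs is cut off in this listing. A minimal completion consistent with the comment above (an entropy-weighted average of the word vectors), assuming words missing from the Word2Vec vocabulary are skipped and the sum is normalised by the total weight:

def getWordVecs(text, weight, model, dimension):
    # Entropy-weighted average of the word vectors of one text.
    vec = np.zeros(dimension).reshape((1, dimension))
    count = 0.0
    for single_entropy, word in zip(weight, text.split()):
        try:
            vec += model[word].reshape((1, dimension)) * single_entropy
            count += single_entropy
        except KeyError:
            # Word not in the Word2Vec vocabulary; skip it.
            continue
    if count:
        vec /= count
    return vec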
Example #3
# NOTE: np, Word2Vec, joblib, and train_test_split are used in these snippets
# but their import lines are not shown; the ones below are assumed.
import numpy as np
from gensim.models import Word2Vec
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.cross_validation import train_test_split

import build_features as feat

#FLAG_BUILD_DECOY_LIST = True
FLAG_BUILD_DECOY_LIST = False

# When training, how big of a decoy set to use
_DECOY_PROPORTION = 5

conn_decoy = feat.conn_decoy
conn_train = feat.conn_train

f_clf   = "db/clf.joblib"
f_norm_scale = "db/scale.joblib"

total_samples = feat.count_WIKI_corpus()

def query_skip_decoys():

    cmd_select = '''SELECT tokens,weights FROM decoy AS A
    JOIN skip_decoy_query AS B ON A.decoy_idx=B.decoy_idx
    '''
    cursor = conn_decoy.execute(cmd_select)
    for text,weight_str in cursor:
        # Load the entropy weight
        w = np.fromstring(weight_str[1:-1],sep=',')
        yield text, w
            

def build_skip_query(skip_n):
    cmd_template = '''
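The body of build_skip_query is also cut off here. Judging from query_skip_decoys above, it (re)builds the skip_decoy_query table with a thinned set of decoy_idx values so that roughly one decoy in skip_n is used for training; a sketch under that assumption:

def build_skip_query(skip_n):
    # Assumed behaviour: keep every skip_n-th decoy index; query_skip_decoys
    # joins the decoy table against this table to iterate the reduced set.
    cmd = '''
    DROP TABLE IF EXISTS skip_decoy_query;
    CREATE TABLE skip_decoy_query AS
    SELECT decoy_idx FROM decoy WHERE decoy_idx % {skip_n} = 0;
    '''.format(skip_n=skip_n)
    conn_decoy.executescript(cmd)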