Beispiel #1
0
    "k11": 1,
    "k12": 1,
    "k13": 1,
    "k14": 1,
    "k15": 1,
}

# Build a Kaggle-style submission: score every target variable's hosted
# classifier against the test tweets and write the predictions out.
# quotechar='"' because tweet text can contain embedded commas.
# Pass file paths directly so pandas manages (and closes) the handles.
test_data = pd.read_csv("data/test.csv", quotechar='"')

sub_data = pd.read_csv("data/sampleSubmission.csv", quotechar='"')

# The submission rows must line up one-to-one with the test rows.
# np.all replaces the deprecated np.alltrue alias (removed in NumPy 2.0).
if not np.all(test_data["id"] == sub_data["id"]):
    raise Exception("IDs do not match")

yh = Yhat(username, apikey)

# Every column after 'id' is a target variable with its own remote model.
# BUG FIX: was assigned to the misspelled name 'variabless', so the loop
# below crashed with NameError on 'variables'.
variables = sub_data.columns[1:]
raw_tweets = test_data["tweet"].tolist()

for variable in variables:
    # Use the best-known version of this variable's classifier on the server.
    model_version = best_model[variable]
    model_name = "TweetClassifier_%s" % (variable,)
    results_from_server = yh.raw_predict(model_name, model_version, raw_tweets)
    pred = results_from_server["prediction"]["scores"]
    sub_data[variable] = pred

try:
    # Let pandas open/close the output file itself (avoids a leaked handle
    # when to_csv raises mid-write).
    sub_data.to_csv(sub_file, index=False)
except IOError:
    sys.stderr.write("IO error: could not write data to file")
Beispiel #2
0
    # Upload the trained tweet classifier to the yhat hosting service and
    # remember the server-assigned version so predictions hit this exact build.
    print "Uploading to yhat"
    upload_status = yh.upload(model_name,tweet_clf)
    model_version = upload_status['version'] 

    print "'%s':'%s' uploaded to yhat" % (model_name,model_version)

    # Sanity check uploaded classifier by comparing remote against local scores

    print "Preforming sanity check"
    print "Predicting local scores"
    # Score the sanity set locally with the same transform+predict pipeline
    # the server is expected to run.
    local_sanity = tweet_clf.predict(tweet_clf.transform(sanity_raw))['scores']
    local_sanity = np.array(local_sanity)

    print "Getting scores from server"
    results_from_server = yh.raw_predict(model_name,model_version,sanity_raw)
    try:
        server_sanity = results_from_server['prediction']['scores']
    except:
        # NOTE(review): bare except — any malformed server response lands here;
        # dump the raw payload for debugging and abort with a distinct code.
        print results_from_server
        sys.exit(3)
    server_sanity = np.array(server_sanity)

    # Because of float point scores compare difference of scores to some level
    # of tolerance rather than checking equality
    score_diff = np.abs(local_sanity - server_sanity)

    sanity_tolerance = 1e-3
    sanity_status = np.alltrue(score_diff < sanity_tolerance)

    # Fragment is cut off here: the failure branch of the sanity check
    # continues beyond this excerpt.
    if not sanity_status:
Beispiel #3
0
    # Upload the trained tweet classifier to the yhat hosting service and
    # remember the server-assigned version so predictions hit this exact build.
    print "Uploading to yhat"
    upload_status = yh.upload(model_name, tweet_clf)
    model_version = upload_status['version']

    print "'%s':'%s' uploaded to yhat" % (model_name, model_version)

    # Sanity check uploaded classifier by comparing remote against local scores

    print "Preforming sanity check"
    print "Predicting local scores"
    # Score the sanity set locally with the same transform+predict pipeline
    # the server is expected to run.
    local_sanity = tweet_clf.predict(tweet_clf.transform(sanity_raw))['scores']
    local_sanity = np.array(local_sanity)

    print "Getting scores from server"
    results_from_server = yh.raw_predict(model_name, model_version, sanity_raw)
    try:
        server_sanity = results_from_server['prediction']['scores']
    except:
        # NOTE(review): bare except — any malformed server response lands here;
        # dump the raw payload for debugging and abort with a distinct code.
        print results_from_server
        sys.exit(3)
    server_sanity = np.array(server_sanity)

    # Because of float point scores compare difference of scores to some level
    # of tolerance rather than checking equality
    score_diff = np.abs(local_sanity - server_sanity)

    sanity_tolerance = 1e-3
    sanity_status = np.alltrue(score_diff < sanity_tolerance)

    # Fragment is cut off here: the failure branch of the sanity check
    # continues beyond this excerpt.
    if not sanity_status:
Beispiel #4
0
    's1':1, 's2':1, 's3':1, 's4':1, 's5':1,
    'w1':1, 'w2':1, 'w3':1, 'w4':1,
    'k1':1, 'k2':1, 'k3':1, 'k4':1, 'k5':1,
    'k6':1, 'k7':1, 'k8':1, 'k9':1, 'k10':1,
    'k11':1, 'k12':1, 'k13':1, 'k14':1, 'k15':1
}

# Build a Kaggle-style submission: score every target variable's hosted
# classifier against the test tweets and write the predictions out.
# quotechar='"' because tweet text can contain embedded commas.
# Pass file paths directly so pandas manages (and closes) the handles.
test_data = pd.read_csv('data/test.csv', quotechar='"')

sub_data = pd.read_csv('data/sampleSubmission.csv', quotechar='"')

# The submission rows must line up one-to-one with the test rows.
# np.all replaces the deprecated np.alltrue alias (removed in NumPy 2.0).
if not np.all(test_data['id'] == sub_data['id']):
    raise Exception("IDs do not match")

yh = Yhat(username, apikey)

# Every column after 'id' is a target variable with its own remote model.
# BUG FIX: was assigned to the misspelled name 'variabless', so the loop
# below crashed with NameError on 'variables'.
variables = sub_data.columns[1:]
raw_tweets = test_data['tweet'].tolist()

for variable in variables:
    # Use the best-known version of this variable's classifier on the server.
    model_version = best_model[variable]
    model_name = "TweetClassifier_%s" % (variable, )
    results_from_server = yh.raw_predict(model_name, model_version, raw_tweets)
    pred = results_from_server['prediction']['scores']
    sub_data[variable] = pred

try:
    # Let pandas open/close the output file itself (avoids a leaked handle
    # when to_csv raises mid-write).
    sub_data.to_csv(sub_file, index=False)
except IOError:
    sys.stderr.write("IO error: could not write data to file")
Beispiel #5
0
# <codecell>

# Connect to the yhat hosting service; placeholders must be replaced with
# real credentials before running.
yh = Yhat("YOUR USERNAME", "YOUR API KEY")

# <codecell>

# Upload the fitted named-entity classifier and echo the server's response.
print yh.upload("NamedEntityFindr", clf)

# <codecell>

# Confirm the upload landed by listing the server's models with this name.
[model for model in yh.show_models()['models'] if model['name'] == "NamedEntityFindr"]

# <codecell>

# Score `data` remotely against version 1 of the uploaded model.
results_from_server = yh.raw_predict("NamedEntityFindr", 1, data)
results_from_server

# <codecell>

# Compare the server's entities against the locally computed `results`
# (defined in an earlier cell outside this excerpt).
print 'sanity check.'
print 'results all match => %s' \
    % np.all(np.array(results['entities']) == np.array(results_from_server['prediction']['entities']))

# <markdowncell>

# <h2>Final Thoughts</h2>
# <ul>
#     <li><a href="http://nltk.googlecode.com/svn/trunk/doc/book/ch05.html" title="Categorizing and Tagging Words - NLTK docs" target="_blank">Categorizing and Tagging Words with NLTK</a> (NLTK docs)</li>
#     <li><a href="http://pixelmonkey.org/pub/nlp-training/" title="Just Enough NLP with Python" target="_blank">Just Enough NLP with Python</a> (slides)</li>
#     <li><a href="http://cdn.preterhuman.net/texts/science_and_technology/artificial_intelligence/Foundations%20of%20Statistical%20Natural%20Language%20Processing%20-%20Christopher%20D.%20Manning.pdf" title="Foundations of Statistical Natural Language Processing by Christopher Manning &amp; Hinrich Schiitze" target="_blank">Foundations of Statistical Natural Language Processing</a> by Christopher Manning &amp; Hinrich Schiitze (PDF)</li>