Ejemplo n.º 1
0
def get_estimates():
    """
    Load previously saved (submission, estimate) pairs.

    :return: a list of (submission, estimate) tuples; empty list when no
             estimates file exists yet.
    """
    estimate_file = data_file('Estimates', 'estimates.csv')
    if os.path.exists(estimate_file):
        v = pandas.read_table(estimate_file, sep=',')
        # Bug fix: zip() is a one-shot iterator in Python 3, but callers
        # (e.g. predict()) append to the result — materialise it as a list.
        return list(zip(v.submission, v.estimate))
    else:
        return []
Ejemplo n.º 2
0
 def _rate_comments(cls, comments):
     """
     Score one or more comments with the trained classifier.

     :param comments: a single comment string or a list of comment strings
     :return: the classifier predictions for *comments* only
     """
     if not isinstance(comments, list):
         comments = [comments]
     training = pd.read_table(data_file('Inputs', "final.csv"), sep=',')
     # Series.append was removed in pandas 2.0; pd.concat is the
     # supported equivalent (same values, same order).
     predictions = cls.clf.predict(
         pd.concat([training.Comment, pd.Series(comments)]))
     # Hack to get around scale_predictions(): only the trailing entries
     # correspond to the comments passed in.
     return predictions[-len(comments):]
Ejemplo n.º 3
0
def predict(folds, arguments):
    """
    Train on the training file, predict on the test file.

    :param folds: cross-validation folds used to choose the iteration count
    :param arguments: parsed command-line options; ``arguments.predictions``
                      is an output path, or None to auto-number a submission
    :return: the fitted classifier pipeline
    """
    logging.info("Starting predictions")
    print(arguments)
    clf = make_pipeline(arguments)
    # Work out how long to train for in the final pipeline step.
    clf.steps[-1][-1].max_iter, estimated_score = choose_n_iterations(folds)
    clf.steps[-1][-1].reset_args()
    clf.fit(train.Comment, train.Insult)  # train the classifier
    ypred = clf.predict(test_examples.Comment)  # classify the held-out comments

    submission = pandas.DataFrame(
        dict(Insult=ypred, Comment=test_examples.Comment, Date=test_examples.Date),
        columns=('Insult', 'Date', 'Comment'))

    if arguments.predictions is None:  # was `== None`
        # list(...) guards against get_estimates() returning an iterator;
        # we append to this below.
        estimates = list(get_estimates())
        for x in itertools.count(1):
            filename = data_file("Submissions", "submission%d.csv" % x)
            if os.path.exists(filename):
                # Bug fix: the original bare `next` statement was a no-op
                # expression, not a loop continue.
                continue
            submission.to_csv(filename, index=False)
            estimates.append((filename, estimated_score))
            save_estimates(estimates)
            logging.info('Saved %s' % filename)
            break
    else:
        submission.to_csv(arguments.predictions, index=False)
        logging.info('Saved %s' % arguments.predictions)

    try:
        save_model(clf)  # best-effort: persist the classifier
    except Exception:
        # Keep the original best-effort semantics, but don't swallow
        # the failure silently.
        logging.exception("Could not save model")
    logging.info("Finished predictions")  # was unreachable (after return)
    return clf
Ejemplo n.º 4
0

sys.path



import pandas as pd
import numpy as np

from insults.util import data_file
import matplotlib.pyplot as plt
from sklearn import linear_model, metrics



df = pd.read_table(data_file('Inputs','train.csv'),sep=',')



df.columns


# ### Feature Engineering

# This is going to be a really basic model to just have as a baseline to compare against more sophisticated models that SHOULD perform much better. 
# 
# This model will proceed from basic hypothesis: 
# 
# H: Comments which contains a relatively high number of curse words along with use of the word "you"/"u" are more likely to be insulting.

Ejemplo n.º 5
0
def load_insults_data(train, test):
    """
    Load and combine the insults train and test datasets.

    :param train: path to the training CSV file
    :param test: path to the test CSV file
    :return: one DataFrame containing the rows of both files
    """
    # Bug fix: the `train`/`test` parameters were previously ignored in
    # favour of hard-coded paths; the caller already passes exactly those
    # paths, so honouring the arguments is backward-compatible.
    df1 = pd.read_table(train, sep=',')
    df2 = pd.read_table(test, sep=',')
    df = pd.concat([df1, df2])

    return df
Ejemplo n.º 6
0
from keras.layers import LSTM, Lambda
from keras.layers import TimeDistributed, Bidirectional
import numpy as np
import keras.callbacks
import sys
import os

from insults.util import data_file
from insults.nn_model.util import setup_logging, LossHistory, binarize, binarize_outshape
from insults.nn_model.plumbing import load_data, load_insults_data, build_examples_with_their_targets
from insults.nn_model.plumbing import sentence_count_per_doc, charset, chars_to_indices_vec
from insults.nn_model.plumbing import shuffle_dataset, dataset_split, strip_quotes

# Module-level logger for this script.
logger = setup_logging(__name__)

# Paths to the bundled train/test CSV files.
INSULTS_TRAIN_DATA_FILE = data_file('Inputs','train.csv')
INSULTS_TEST_DATA_FILE = data_file('Inputs','test_with_solutions.csv')
# Script name without its extension (taken from how it was invoked).
THIS_FILE = os.path.basename(sys.argv[0]).split('.')[0]

total = len(sys.argv)
cmdargs = str(sys.argv)

logger.info("Script name: %s" % str(sys.argv[0]))

# Load and combine both datasets, then split out examples and targets.
insults_data = load_insults_data(INSULTS_TRAIN_DATA_FILE, INSULTS_TEST_DATA_FILE)

comments, targets = build_examples_with_their_targets(insults_data.Comment, insults_data.Insult)
comments = strip_quotes(comments)

# Per-document sentence counts and the character vocabulary of the corpus.
num_sent = sentence_count_per_doc(comments)
chars = charset(comments)
Ejemplo n.º 7
0
def make_full_training():
    """Combine the train and test sets into a single 'fulltrain.csv' file."""
    frames = [
        pandas.read_table(data_file('Inputs', 'train.csv'), sep=','),
        pandas.read_table(data_file('Inputs', 'test_with_solutions.csv'), sep=','),
    ]
    combined = pandas.concat(frames)
    combined.to_csv(data_file('Inputs', 'fulltrain.csv'), index=False)
Ejemplo n.º 8
0
def save_estimates(se):
    """
    Persist (submission, estimate) pairs to the estimates CSV file.

    :param se: iterable of (submission, estimate) tuples; may be empty
    """
    estimate_file = data_file('Estimates', 'estimates.csv')
    se = list(se)
    # Robustness: zip(*[]) raises ValueError on unpacking — write an
    # empty table instead of crashing when there are no estimates yet.
    if se:
        submissions, estimates = zip(*se)
    else:
        submissions, estimates = (), ()
    pandas.DataFrame(dict(submission=submissions, estimate=estimates)).to_csv(estimate_file, index=False)
Ejemplo n.º 9
0
def test_data_file():
    """data_file joins the category — and optional name — under insults/Data."""
    category_only = util.data_file("test_category")
    with_name = util.data_file("test_category", name="test_name")

    assert category_only == 'insults/Data/test_category'
    assert with_name == 'insults/Data/test_category/test_name'
Ejemplo n.º 10
0
def test_data_directory():
    """A category alone resolves to its directory under insults/Data."""
    result = util.data_file("test_category")
    assert result == 'insults/Data/test_category'