def get_estimates():
    """Load previously saved (submission, estimate) pairs.

    Returns:
        A list of ``(submission, estimate)`` tuples, or an empty list
        when no estimates file has been written yet.
    """
    estimate_file = data_file('Estimates', 'estimates.csv')
    if not os.path.exists(estimate_file):
        return []
    v = pandas.read_csv(estimate_file)
    # Materialise as a list: callers (see predict()) append to the
    # result, and a bare zip object is a one-shot iterator in Python 3.
    return list(zip(v.submission, v.estimate))
def _rate_comments(cls, comments):
    """Predict insult ratings for one or more comments.

    Args:
        comments: a single comment string or a list of comment strings.

    Returns:
        Predictions for *comments* only (the tail of the padded batch).
    """
    if not isinstance(comments, list):
        comments = [comments]
    # Pad the input with a known corpus and keep only the tail of the
    # predictions — per the original author, a hack to get around
    # scale_predictions() misbehaving on small batches.
    stuff = pd.read_table(data_file('Inputs', "final.csv"), sep=',')
    # pd.Series.append was removed in pandas 2.0; concat is equivalent.
    combined = pd.concat([stuff.Comment, pd.Series(comments)])
    predictions = cls.clf.predict(combined)
    return predictions[-len(comments):]
def predict(folds, arguments):
    """Train on the training file, predict on the test file.

    Args:
        folds: cross-validation folds used by choose_n_iterations() to
            pick how long to train the final pipeline step.
        arguments: parsed command-line arguments; ``arguments.predictions``
            is an optional explicit output path for the submission CSV.

    Returns:
        The fitted classifier pipeline.
    """
    logging.info("Starting predictions")
    print(arguments)
    clf = make_pipeline(arguments)
    # Work out how long to train for the final step.
    clf.steps[-1][-1].max_iter, estimated_score = choose_n_iterations(folds)
    clf.steps[-1][-1].reset_args()
    clf.fit(train.Comment, train.Insult)  # train the classifier
    # Use the trained classifier to classify comments.
    ypred = clf.predict(test_examples.Comment)
    submission = pandas.DataFrame(
        dict(Insult=ypred,
             Comment=test_examples.Comment,
             Date=test_examples.Date),
        columns=('Insult', 'Date', 'Comment'))
    if arguments.predictions is None:
        # No explicit output path: write to the first free
        # submissionN.csv slot and record its estimated score.
        # list() guards against get_estimates() returning an iterator.
        estimates = list(get_estimates())
        for x in itertools.count(1):
            filename = data_file("Submissions", "submission%d.csv" % x)
            if os.path.exists(filename):
                # Original code used a bare `next` here, which is a
                # no-op expression; `continue` is what was intended.
                continue
            submission.to_csv(filename, index=False)
            estimates.append((filename, estimated_score))
            save_estimates(estimates)
            logging.info('Saved %s' % filename)
            break
    else:
        submission.to_csv(arguments.predictions, index=False)
        logging.info('Saved %s' % arguments.predictions)
    try:
        save_model(clf)  # Save the classifier
    except Exception:
        # Saving is best-effort; log the failure instead of silently
        # swallowing it (the original bare except hid all errors and
        # made the final log line unreachable).
        logging.exception("Could not save model")
    logging.info("Finished predictions")
    return clf
# NOTE(review): the stray bare `sys.path` expression that headed this
# cell was a no-op (or a NameError where `sys` is not imported) — removed.
import pandas as pd
import numpy as np
from insults.util import data_file
import matplotlib.pyplot as plt
from sklearn import linear_model, metrics

# Load the training data for the baseline model.
df = pd.read_table(data_file('Inputs','train.csv'), sep=',')
df.columns

# ### Feature Engineering
# This is going to be a really basic model to just have as a baseline to
# compare against more sophisticated models that SHOULD perform much better.
#
# This model will proceed from basic hypothesis:
#
# H: Comments which contains a relatively high number of curse words along
# with use of the word "you"/"u" are more likely to be insulting.
def load_insults_data(train, test):
    """Load and concatenate the train and test CSV files.

    Args:
        train: path to the training CSV.
        test: path to the test CSV (with solutions).

    Returns:
        A single DataFrame holding the rows of both files.
    """
    # Bug fix: the original ignored its parameters and re-read
    # hard-coded data_file() paths; honor the paths callers pass in.
    df1 = pd.read_csv(train)
    df2 = pd.read_csv(test)
    return pd.concat([df1, df2])
# Script prelude: load the insults dataset and derive the features the
# character-level NN model needs (sentence counts, character set).
from keras.layers import LSTM, Lambda
from keras.layers import TimeDistributed, Bidirectional
import numpy as np
import keras.callbacks
import sys
import os
from insults.util import data_file
from insults.nn_model.util import setup_logging, LossHistory, binarize, binarize_outshape
from insults.nn_model.plumbing import load_data, load_insults_data, build_examples_with_their_targets
from insults.nn_model.plumbing import sentence_count_per_doc, charset, chars_to_indices_vec
from insults.nn_model.plumbing import shuffle_dataset, dataset_split, strip_quotes

logger = setup_logging(__name__)

# Input CSVs shipped with the project.
INSULTS_TRAIN_DATA_FILE = data_file('Inputs','train.csv')
INSULTS_TEST_DATA_FILE = data_file('Inputs','test_with_solutions.csv')
# Script name without its extension, for logging/bookkeeping.
THIS_FILE = os.path.basename(sys.argv[0]).split('.')[0]
total = len(sys.argv)  # argument count (including script name)
cmdargs = str(sys.argv)
logger.info("Script name: %s" % str(sys.argv[0]))

insults_data = load_insults_data(INSULTS_TRAIN_DATA_FILE, INSULTS_TEST_DATA_FILE)
# Pair each comment with its insult label, then normalise the text.
comments, targets = build_examples_with_their_targets(insults_data.Comment, insults_data.Insult)
comments = strip_quotes(comments)
# Per-document sentence counts and the character vocabulary drive the
# shape of the character-level model's input.
num_sent = sentence_count_per_doc(comments)
chars = charset(comments)
def make_full_training():
    """Concatenate train and test data into Inputs/fulltrain.csv.

    Reads the training file and the solved test file, stacks them, and
    writes the combined set for training a final model on all data.
    """
    df1 = pandas.read_csv(data_file('Inputs', 'train.csv'))
    df2 = pandas.read_csv(data_file('Inputs', 'test_with_solutions.csv'))
    df = pandas.concat([df1, df2])
    df.to_csv(data_file('Inputs', 'fulltrain.csv'), index=False)
def save_estimates(se):
    """Persist (submission, estimate) pairs to the estimates CSV.

    Args:
        se: iterable of ``(submission_filename, estimated_score)`` pairs.
    """
    estimate_file = data_file('Estimates', 'estimates.csv')
    se = list(se)
    # zip(*[]) yields nothing to unpack (ValueError); write an empty
    # frame instead when there are no estimates yet.
    if se:
        submissions, estimates = zip(*se)
    else:
        submissions, estimates = (), ()
    pandas.DataFrame(dict(submission=submissions,
                          estimate=estimates)).to_csv(estimate_file,
                                                      index=False)
def test_data_file():
    """data_file() joins the category (and optional name) under insults/Data."""
    category_only = util.data_file("test_category")
    with_name = util.data_file("test_category", name="test_name")
    assert category_only == 'insults/Data/test_category'
    assert with_name == 'insults/Data/test_category/test_name'
def test_data_directory():
    """A category alone maps to its directory under insults/Data."""
    assert util.data_file("test_category") == 'insults/Data/test_category'