def get_data(split_at=1489):
    """Load the clean Emergent train/test CSVs and build train/validation/test splits.

    The training CSV is split at row index *split_at* into a training portion
    and a validation portion. The default of 1489 preserves the original
    hard-coded split, so existing callers are unaffected.

    Args:
        split_at: row index at which the training data is divided into
            train (``[:split_at]``) and validation (``[split_at:]``) parts.

    Returns:
        tuple: ``(train_data1, X1, train_data2, X2, test_data, X_test)`` —
        the train rows and features, the validation rows and features, and
        the held-out test rows and features.
    """
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    # Fit the feature pipeline on the full training set *before* splitting,
    # so train and validation share the exact same fitted transform.
    X = p.pipeline.fit_transform(X)
    train_data1 = train_data[:split_at]
    train_data2 = train_data[split_at:]
    X1 = X[:split_at]
    X2 = X[split_at:]
    test_data = get_dataset('url-versions-2015-06-14-clean-test.csv')
    X_test, y_test = split_data(test_data)
    # transform (not fit_transform): the test set must not influence the pipeline.
    X_test = p.pipeline.transform(X_test)
    # return train / validation / test
    return (train_data1, X1, train_data2, X2, test_data, X_test)
import os
from model.utils import get_dataset, split_data
from model.cross_validation import ClaimKFold

if __name__ == '__main__':
    # Split the clean training set into claim-disjoint folds and write each
    # fold's train/test rows (with the stance label re-attached) to CSV.
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    claim_folds = ClaimKFold(X)
    for fold, (train_index, test_index) in enumerate(claim_folds, start=1):
        # Held-out portion of this fold.
        test_rows = X.iloc[test_index, :].copy()
        test_rows['articleHeadlineStance'] = y.iloc[test_index]
        test_rows.to_csv(os.path.join('..', 'data', 'emergent', 'url-versions-2015-06-14-clean-test-fold-{0:d}.csv'.format(fold)))
        # Training portion of this fold.
        train_rows = X.iloc[train_index, :].copy()
        train_rows['articleHeadlineStance'] = y.iloc[train_index]
        train_rows.to_csv(os.path.join('..', 'data', 'emergent', 'url-versions-2015-06-14-clean-train-fold-{0:d}.csv'.format(fold)))
import sys
import os

sys.path.append(os.path.join('..', 'src'))

from model.utils import get_dataset, split_data, run_test
from model.baseline.baseline_predictors import ProbabilityPredictor, ChancePredictor, \
    MajorityPredictor, WordOverlapBaselinePredictor

if __name__ == '__main__':
    # Run every baseline predictor against the same train/test split and
    # print its evaluation result under a labelled banner.
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    test_data = get_dataset('url-versions-2015-06-14-clean-test.csv')
    baselines = [
        ('\n>> Chance predictor <<\n', ChancePredictor),
        ('\n>> Majority predictor <<\n', MajorityPredictor),
        ('\n>> Probability predictor <<\n', ProbabilityPredictor),
        ('\n>> Word overlap predictor <<\n', WordOverlapBaselinePredictor),
    ]
    for banner, predictor_cls in baselines:
        print(banner)
        # Instantiate inside the loop so construction happens in the same
        # order, interleaved with the output, as the original script.
        print(run_test(X, y, test_data, predictor_cls()))
def get_snopes():
    """Load the cleaned Snopes claims CSV and its pipeline-transformed features.

    Returns:
        tuple: ``(test_data, X_test)`` — the raw claim rows and the feature
        matrix produced by the already-fitted pipeline's ``transform``.
    """
    snopes_rows = get_dataset("my_claims_csv_cleaned.csv")
    features, _labels = split_data(snopes_rows)
    # transform only — the pipeline is assumed to be fitted elsewhere.
    features = p.pipeline.transform(features)
    return snopes_rows, features