def __init__(self, name="train", path="fnc-1"):
    """Load the ``<name>_bodies.csv`` / ``<name>_stances.csv`` pair.

    Populates:
      - self.stances:  list of stance dicts with integer 'Body ID' keys
      - self.articles: dict mapping integer body ID -> article body text

    Args:
        name: dataset split prefix ("train", "competition_test", ...).
        path: directory holding the CSV files (read via self.read).
    """
    params = parse_params()
    self.path = path

    print("Reading dataset")
    bodies = name + "_bodies.csv"
    stances = name + "_stances.csv"

    self.stances = self.read(stances)
    articles = self.read(bodies)
    self.articles = dict()

    # Make the body ID an integer value so it can key self.articles.
    for s in self.stances:
        s['Body ID'] = int(s['Body ID'])
        # In 2-class mode collapse every non-'unrelated' label to 'related';
        # the competition test split keeps its original 4-way labels.
        if params.run_2_class and s['Stance'] != 'unrelated' and name != 'competition_test':
            s['Stance'] = 'related'

    # Copy all bodies into a dictionary keyed by integer body ID.
    for article in articles:
        self.articles[int(article['Body ID'])] = article['articleBody']

    print("Total stances: " + str(len(self.stances)))
    print("Total bodies: " + str(len(self.articles)))
import os

import numpy as np  # BUG FIX: np.c_ is used below but numpy was never imported
# from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import torch

from feature_engineering import refuting_features, polarity_features, hand_features, gen_or_load_feats, word_tfidf_features, word_overlap_split_bodies_features
from feature_engineering import word_overlap_features, word_overlap_pos_features, word_overlap_quotes_features, word_tfidf_pos_ss_features, word_overlap_bpe_features
from test_dl_model import get_predictions_from_FNC_1_Test
from utils.dataset import DataSet
from utils.generate_test_splits import kfold_split, get_stances_for_folds
from utils.score import report_score, LABELS, LABELS_RELATED, score_submission, score_cal
from utils.system import parse_params, check_version

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

params = parse_params()


def generate_features(stances, dataset, name):
    """Build the hand-crafted feature matrix for a list of stances.

    Args:
        stances: list of stance dicts ('Headline', 'Body ID', 'Stance').
        dataset: DataSet whose .articles maps body ID -> body text.
        name:    split tag used to cache features under features/*.npy.

    Returns:
        (X, y): stacked feature matrix and integer label list. Labels are
        2-class (LABELS_RELATED) when params.run_2_class is set, except for
        the "competition" split which always keeps the 4-way LABELS.
    """
    h, b, y = [], [], []

    for stance in stances:
        if params.run_2_class:
            if name != 'competition':
                y.append(LABELS_RELATED.index(stance['Stance']))
            else:
                y.append(LABELS.index(stance['Stance']))
        else:
            y.append(LABELS.index(stance['Stance']))
        h.append(stance['Headline'])
        # BUG FIX: the article body was never collected, leaving b empty
        # while h grew — every feature extractor below receives mismatched
        # headline/body lists. Look the body text up by its integer ID.
        b.append(dataset.articles[stance['Body ID']])

    X_overlap = gen_or_load_feats(word_overlap_features, h, b, "features/overlap." + name + ".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b, "features/refuting." + name + ".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b, "features/polarity." + name + ".npy")
    X_hand = gen_or_load_feats(hand_features, h, b, "features/hand." + name + ".npy")

    # Column-stack the individual feature groups into one matrix.
    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X, y


if __name__ == "__main__":
    check_version()
    parse_params()

    # Load the training dataset and generate folds
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
with open(os.path.join(out_dir, tier +'.headline'), 'w') as headline_file, \ open(os.path.join(out_dir, tier +'.body'), 'w') as body_file,\ open(os.path.join(out_dir, tier +'.stance'), 'w') as stance_file: for i in indices: (headline, body, stance) = examples[i] # write tokenized data to file write_to_file(headline_file, headline) write_to_file(body_file, body) write_to_file(stance_file, stance) if __name__ == "__main__": #check_version() args = parse_params() #Load the training dataset d = DataSet() #competition_dataset = DataSet(name="competition_test") train_body_ids, dev_body_ids = get_body_ids(d, training=0.8, base_dir='splits') train_stances = get_stances(d, train_body_ids) dev_stances = get_stances(d, dev_body_ids) #test_body_ids = list(competition_dataset.articles.keys()) #test_stances = get_stances(competition_dataset, test_body_ids) print("Train data has %i examples total" % len(train_stances))