import os

from utils.generate_test_splits import generate_hold_out_split, read_ids


def training_holdout_split(dataset, training=0.8, base_dir="splits"):
    # Generate the split files only once; later calls reuse the cached ID files
    if not (os.path.exists(os.path.join(base_dir, "training_ids.txt")) and
            os.path.exists(os.path.join(base_dir, "hold_out_ids.txt"))):
        generate_hold_out_split(dataset, training, base_dir)
    training_ids = read_ids("training_ids.txt", base_dir)
    hold_out_ids = read_ids("hold_out_ids.txt", base_dir)
    return training_ids, hold_out_ids
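# A quick usage sketch (assuming the FNC DataSet loader is on the path): the
# helper returns two lists of article body IDs, so the same body never
# appears in both splits.
from utils.dataset import DataSet

d = DataSet()
train_ids, holdout_ids = training_holdout_split(d, training=0.8)
print("%d training bodies, %d hold-out bodies" % (len(train_ids), len(holdout_ids)))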
from utils.dataset import DataSet
from utils.generate_test_splits import generate_hold_out_split

dataset = DataSet()
generate_hold_out_split(dataset, training=0.8, base_dir="splits/")
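# After this call, base_dir contains two plain-text files with one body ID per
# line -- a quick sanity check (file names taken from the code above):
with open("splits/training_ids.txt") as f:
    print(f.readline().strip())  # prints the first training body ID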
from keras.preprocessing.text import Tokenizer

from utils.dataset import DataSet
from utils.generate_test_splits import generate_hold_out_split, read_ids

# Load the dataset using the utility provided by the FNC
d = DataSet()

# Create the Keras tokenizer, keeping only the MAX_NB_WORDS most frequent words
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)

# Collect all text from the bodies and headlines provided for the FNC
body_text = [v for k, v in d.articles.items()]
headline_text = [s['Headline'] for s in d.stances]

# Update the tokenizer's internal vocabulary from the full list of texts
tokenizer.fit_on_texts(headline_text + body_text)

# Create lists of headline text, body text, and stances for each split
generate_hold_out_split(d)
base_dir = "splits"
training_ids = read_ids("training_ids.txt", base_dir)
hold_out_ids = read_ids("hold_out_ids.txt", base_dir)
X_headline, X_body, y = generate_data(training_ids, d)
X_headline_test, X_body_test, y_test = generate_data(hold_out_ids, d)

# Turn each text into a sequence of integers. Only the top "nb_words" most
# frequent words are kept, and only words the tokenizer has already seen.
headline_sequences = tokenizer.texts_to_sequences(X_headline)
body_sequences = tokenizer.texts_to_sequences(X_body)
headline_sequences_test = tokenizer.texts_to_sequences(X_headline_test)
body_sequences_test = tokenizer.texts_to_sequences(X_body_test)
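# To make the tokenizer's behavior concrete, here is a tiny self-contained
# sketch with toy texts (not part of the FNC pipeline above). Indices are
# assigned by descending word frequency; ties may be ordered differently on
# your machine, and out-of-vocabulary words are simply dropped.
toy = Tokenizer(nb_words=10)
toy.fit_on_texts(["the cat sat", "the dog sat down"])
print(toy.texts_to_sequences(["the cat sat on the mat"]))
# e.g. [[1, 3, 2, 1]] -- "on" and "mat" were never seen, so they vanish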