def build_tf_records(train, test, vocab, subjects, workspace):
    """
    Construct the TFRecords files for the provided train and test breakout of
    GutenbergIndices.

    :param train: List of GutenbergIndex training records.
    :param test: List of GutenbergIndex testing records.
    :param vocab: The counter of vocabulary found from this corpus.
    :param subjects: The set of classes which are targeted in these records.
    :param workspace: The directory path where the records will be stored.
    :return: nothing
    """
    def _fresh_writer(file_name):
        # Remove any stale record file so the writer starts from scratch,
        # then open a TFRecordWriter on it inside the current workspace.
        records_file = os.path.join(workspace, file_name)
        if tf.gfile.Exists(records_file):
            tf.gfile.Remove(records_file)
        return tf.python_io.TFRecordWriter(records_file)

    train_writer = _fresh_writer(TRAIN_TITLE_RECORDS)
    test_writer = _fresh_writer(TEST_TITLE_RECORDS)

    sorted_vocab_list = input_util.get_sorted_vocab(vocab)

    # write a record for each book title into the training set or test based
    # on the distribution
    for gidx in train:
        write_record(gidx, sorted_vocab_list, subjects, train_writer)
    for gidx in test:
        write_record(gidx, sorted_vocab_list, subjects, test_writer)

    train_writer.close()
    test_writer.close()
def __init__(self, model_dir, workspace, dataset_wkspc):
    """
    Initialize the GB Titles university.

    :param model_dir: The directory where this model is stored.
    :param workspace: The workspace directory of this university.
    :param dataset_wkspc: The GB input workspace where all inputs are stored.
    """
    super().__init__(model_dir, workspace)
    self.dataset_wkspc = dataset_wkspc
    self.subjects = gb_input.get_subjects(dataset_wkspc)
    # NOTE(review): this university keeps vocab_count + 1 entries while the
    # chapters and full universities keep vocab_count — confirm the extra
    # slot is intentional (e.g. an OOV/padding entry) and not an off-by-one.
    full_vocab = input_util.get_sorted_vocab(
        gb_input.get_vocabulary(dataset_wkspc))
    self.vocab = full_vocab[:FLAGS.vocab_count + 1]
def __init__(self, model_dir, workspace, dataset_wkspc):
    """
    Initialize the GB chapters university.

    :param model_dir: The directory where this model is stored.
    :param workspace: The workspace directory of this university.
    :param dataset_wkspc: The GB input workspace where all inputs are stored.
    """
    # (Fixed: removed a stray docstring fragment about "extracting the text
    # window" that was copied from the GB Full university's __init__ and does
    # not apply here — this initializer takes no text-window parameters.)
    super().__init__(model_dir, workspace)
    self.dataset_wkspc = dataset_wkspc
    self.subjects = gb_input.get_subjects(self.dataset_wkspc)
    self.vocab = input_util.get_sorted_vocab(
        gb_input.get_vocabulary(self.dataset_wkspc))
    # keep only the most frequent vocab_count entries
    self.vocab = self.vocab[:FLAGS.vocab_count]
    # per-run aggregations collected during evaluation
    self.evaluation_aggrs = {}
def build_tf_records(train, test, vocab, subjects, max_chapter_len, max_para_len, workspace):
    """
    Construct the TFRecords files for the provided train and test breakout of
    GutenbergIndices, along with a JSON chapter index.

    :param train: List of GutenbergIndex training records.
    :param test: List of GutenbergIndex testing records.
    :param vocab: The counter of vocabulary found from this corpus.
    :param subjects: The set of classes which are targeted in these records.
    :param max_chapter_len: The maximum number of padded paragraphs to include
        per chapter.
    :param max_para_len: The maximum padded length of each paragraph
        (presumably in tokens — TODO confirm against write_chapter_records).
    :param workspace: The directory path where the records will be stored.
    :return: nothing
    """
    # ensure the TF records get written to the current workspace
    # NOTE(review): these are the same file-name constants used by the titles
    # records builder — confirm the chapter records should not have their own
    # dedicated file names.
    train_records_file = os.path.join(workspace, TRAIN_TITLE_RECORDS)
    if tf.gfile.Exists(train_records_file):
        tf.gfile.Remove(train_records_file)
    train_writer = tf.python_io.TFRecordWriter(train_records_file)

    test_records_file = os.path.join(workspace, TEST_TITLE_RECORDS)
    if tf.gfile.Exists(test_records_file):
        tf.gfile.Remove(test_records_file)
    test_writer = tf.python_io.TFRecordWriter(test_records_file)

    sorted_vocab_list = input_util.get_sorted_vocab(vocab)

    # write a record for each book title into the training set or test based
    # on the distribution
    for gidx in train:
        write_chapter_records(gidx, sorted_vocab_list, subjects,
                              max_chapter_len, max_para_len, train_writer)
    for gidx in test:
        write_chapter_records(gidx, sorted_vocab_list, subjects,
                              max_chapter_len, max_para_len, test_writer)

    train_writer.close()
    test_writer.close()

    # NOTE(review): title_map is not defined in this function — presumably a
    # module-level map populated by write_chapter_records; verify it exists,
    # otherwise this raises NameError at runtime.
    chapter_index_file = os.path.join(workspace, CHAPTER_INDEX_JSON)
    if tf.gfile.Exists(chapter_index_file):
        tf.gfile.Remove(chapter_index_file)
    with open(chapter_index_file, 'w') as f:
        json.dump(title_map, f, indent=4)
def __init__(self, model_dir, workspace, dataset_wkspc,
             text_window=TextWindow.beginning, starting_idx=0):
    """
    Initialize the GB Full university.

    :param model_dir: The directory where this model is stored.
    :param workspace: The workspace directory of this university.
    :param dataset_wkspc: The GB input workspace where all inputs are stored.
    :param text_window: The window (beginning, middle, end) where the text is
        pulled from.
    :param starting_idx: When using text_window.end, this parameter will
        define the starting index used for extracting the text window.
    """
    super().__init__(model_dir, workspace)
    self.dataset_wkspc = dataset_wkspc
    self.subjects = gb_input.get_subjects(dataset_wkspc)
    # keep only the most frequent vocab_count entries
    sorted_vocab = input_util.get_sorted_vocab(
        gb_input.get_vocabulary(dataset_wkspc))
    self.vocab = sorted_vocab[:FLAGS.vocab_count]
    self.text_window = text_window
    self.starting_idx = starting_idx