Example #1
def build_tf_records(train, test, vocab, subjects, workspace):
    """
    Construct the TFRecords files for the provided train/test split of GutenbergIndices.
    :param train: List of GutenbergIndex training records.
    :param test: List of GutenbergIndex testing records.
    :param vocab: The Counter of vocabulary terms found in this corpus.
    :param subjects: The set of classes which are targeted in these records.
    :param workspace: The directory path where the records will be stored.
    :return: None
    """

    # ensure the TF records get written to the current workspace
    train_records_file = os.path.join(workspace, TRAIN_TITLE_RECORDS)
    if tf.gfile.Exists(train_records_file):
        tf.gfile.Remove(train_records_file)
    train_writer = tf.python_io.TFRecordWriter(train_records_file)

    test_records_file = os.path.join(workspace, TEST_TITLE_RECORDS)
    if tf.gfile.Exists(test_records_file):
        tf.gfile.Remove(test_records_file)
    test_writer = tf.python_io.TFRecordWriter(test_records_file)

    sorted_vocab_list = input_util.get_sorted_vocab(vocab)

    # write a record for each book title into the training or test set based on the provided split
    for gidx in train:
        write_record(gidx, sorted_vocab_list, subjects, train_writer)
    for gidx in test:
        write_record(gidx, sorted_vocab_list, subjects, test_writer)

    train_writer.close()
    test_writer.close()
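
The write_record helper used above is not shown in this example. A minimal
sketch of what it could look like with the same TF 1.x API, assuming each
GutenbergIndex exposes a title string and a subject label (both attribute
names are assumptions, not taken from the original project):

import tensorflow as tf

def write_record(gidx, sorted_vocab_list, subjects, writer):
    # Map each title token to its index in the sorted vocabulary;
    # tokens outside the vocabulary fall back to a reserved index 0.
    vocab_index = {word: i for i, word in enumerate(sorted_vocab_list)}
    token_ids = [vocab_index.get(tok, 0) for tok in gidx.title.split()]

    # Encode the subject as its position in the sorted class set.
    label = sorted(subjects).index(gidx.subject)

    # Serialize the (title, label) pair as a tf.train.Example record.
    example = tf.train.Example(features=tf.train.Features(feature={
        'title': tf.train.Feature(int64_list=tf.train.Int64List(value=token_ids)),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
    }))
    writer.write(example.SerializeToString())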
Example #2
def __init__(self, model_dir, workspace, dataset_wkspc):
    """
    Initialize the GB Titles university.
    :param model_dir: The directory where this model is stored.
    :param workspace: The workspace directory of this university.
    :param dataset_wkspc: The GB input workspace where all inputs are stored.
    """
    super().__init__(model_dir, workspace)
    self.dataset_wkspc = dataset_wkspc
    self.subjects = gb_input.get_subjects(self.dataset_wkspc)
    self.vocab = input_util.get_sorted_vocab(gb_input.get_vocabulary(self.dataset_wkspc))
    self.vocab = self.vocab[:FLAGS.vocab_count + 1]
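
FLAGS is not defined in these snippets; a plausible TF 1.x flag definition
consistent with its use here (the flag name comes from the code above, but
the default value is an assumption):

import tensorflow as tf

# Hypothetical flag definition; the default of 5000 is an assumption.
tf.app.flags.DEFINE_integer('vocab_count', 5000,
                            'Number of vocabulary terms to retain.')
FLAGS = tf.app.flags.FLAGS

Note that this example slices to vocab_count + 1 entries, presumably to
reserve one extra slot (for example, for an out-of-vocabulary token), while
Examples #3 and #5 keep exactly vocab_count.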
Example #3
def __init__(self, model_dir, workspace, dataset_wkspc):
    """
    Initialize the GB chapters university.
    :param model_dir: The directory where this model is stored.
    :param workspace: The workspace directory of this university.
    :param dataset_wkspc: The GB input workspace where all inputs are stored.
    """
    super().__init__(model_dir, workspace)
    self.dataset_wkspc = dataset_wkspc
    self.subjects = gb_input.get_subjects(self.dataset_wkspc)
    self.vocab = input_util.get_sorted_vocab(
        gb_input.get_vocabulary(self.dataset_wkspc))
    self.vocab = self.vocab[:FLAGS.vocab_count]
    self.evaluation_aggrs = {}
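
For reference, constructing one of these universities might look like the
following; the class name and directory paths are placeholders for
illustration, not names from the original project:

# Hypothetical usage; GbChaptersUniversity and the paths are placeholders.
university = GbChaptersUniversity(
    model_dir='/tmp/gb_chapters/model',
    workspace='/tmp/gb_chapters/workspace',
    dataset_wkspc='/tmp/gb_input')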
Example #4
def build_tf_records(train, test, vocab, subjects, max_chapter_len, max_para_len, workspace):
    """
    Construct the TFRecords files for the provided train/test split of GutenbergIndices.
    :param train: List of GutenbergIndex training records.
    :param test: List of GutenbergIndex testing records.
    :param vocab: The Counter of vocabulary terms found in this corpus.
    :param subjects: The set of classes which are targeted in these records.
    :param max_chapter_len: The maximum number of padded paragraphs to include per chapter.
    :param max_para_len: The maximum number of padded words to include per paragraph.
    :param workspace: The directory path where the records will be stored.
    :return: None
    """

    # ensure the TF records get written to the current workspace
    train_records_file = os.path.join(workspace, TRAIN_TITLE_RECORDS)
    if tf.gfile.Exists(train_records_file):
        tf.gfile.Remove(train_records_file)
    train_writer = tf.python_io.TFRecordWriter(train_records_file)

    test_records_file = os.path.join(workspace, TEST_TITLE_RECORDS)
    if tf.gfile.Exists(test_records_file):
        tf.gfile.Remove(test_records_file)
    test_writer = tf.python_io.TFRecordWriter(test_records_file)

    sorted_vocab_list = input_util.get_sorted_vocab(vocab)

    # write chapter records for each book into the training or test set based on the provided split
    for gidx in train:
        write_chapter_records(gidx, sorted_vocab_list, subjects, max_chapter_len, max_para_len, train_writer)
    for gidx in test:
        write_chapter_records(gidx, sorted_vocab_list, subjects, max_chapter_len, max_para_len, test_writer)

    train_writer.close()
    test_writer.close()

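    # NOTE: title_map is not defined in this snippet; it is presumably a
    # module-level chapter index populated by write_chapter_records.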
    chapter_index_file = os.path.join(workspace, CHAPTER_INDEX_JSON)
    if tf.gfile.Exists(chapter_index_file):
        tf.gfile.Remove(chapter_index_file)
    with open(chapter_index_file, 'w') as f:
        json.dump(title_map, f, indent=4)
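
write_chapter_records is likewise not shown. The two length parameters
suggest every chapter is padded to a fixed shape before serialization; a
sketch of that padding step, assuming paragraphs arrive as
whitespace-tokenized strings (an assumption):

def pad_chapter(paragraphs, vocab_index, max_chapter_len, max_para_len):
    # Truncate or pad each paragraph to exactly max_para_len token ids,
    # then truncate or pad the chapter to exactly max_chapter_len paragraphs.
    padded = []
    for para in paragraphs[:max_chapter_len]:
        ids = [vocab_index.get(tok, 0) for tok in para.split()][:max_para_len]
        ids += [0] * (max_para_len - len(ids))
        padded.append(ids)
    while len(padded) < max_chapter_len:
        padded.append([0] * max_para_len)  # all-padding paragraph
    return padded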
Example #5
def __init__(self,
             model_dir,
             workspace,
             dataset_wkspc,
             text_window=TextWindow.beginning,
             starting_idx=0):
    """
    Initialize the GB Full university.
    :param model_dir: The directory where this model is stored.
    :param workspace: The workspace directory of this university.
    :param dataset_wkspc: The GB input workspace where all inputs are stored.
    :param text_window: The window (beginning, middle, end) where the text is pulled from.
    :param starting_idx: When using TextWindow.end, this parameter defines the starting
    index used for extracting the text window.
    """
    super().__init__(model_dir, workspace)
    self.dataset_wkspc = dataset_wkspc
    self.subjects = gb_input.get_subjects(self.dataset_wkspc)
    self.vocab = input_util.get_sorted_vocab(
        gb_input.get_vocabulary(self.dataset_wkspc))
    self.vocab = self.vocab[:FLAGS.vocab_count]
    self.text_window = text_window
    self.starting_idx = starting_idx
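
TextWindow is referenced but not defined in this snippet; given the
docstring's "(beginning, middle, end)", a plausible definition would be a
small enum (the member values are assumptions):

from enum import Enum

class TextWindow(Enum):
    # Which region of the book text the input window is extracted from.
    beginning = 0
    middle = 1
    end = 2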