def text_to_binary(input_directories, output_filenames, split_fractions):
    """Shuffle the input files, split them by fraction, and convert each
    split to a binary output file; also write a 'vocab' file of word counts.

    input_directories: directories scanned by get_filenames for input files.
    output_filenames: one binary output file per split.
    split_fractions: fraction of all files assigned to each output file.

    Fix: the running file count previously reused the name `count`, which the
    vocab-writing loop (`for word, count in ...`) clobbered; the two are now
    distinct names. Vocab file mode changed from 'w+' to 'w' (read access
    was never used).
    """
    filenames = get_filenames(input_directories)
    random_shuffle(filenames)  # shuffle before splitting so splits are random
    start_from_index = 0
    counter = collections.Counter()  # word -> occurrence count for the vocab
    files_handled = 0
    for index, output_filename in enumerate(output_filenames):
        sample_count = int(len(filenames) * split_fractions[index])
        print(output_filename + ': ' + str(sample_count))
        end_index = min(start_from_index + sample_count, len(filenames))
        chunk = filenames[start_from_index:end_index]
        convert_files_to_binary(chunk, output_filename, counter)
        files_handled += len(chunk)
        print("num files handled: ", files_handled)
        start_from_index = end_index
    # create vocab file; reserve 4 slots for the special tokens below
    with open('vocab', 'w') as vocab_f:
        for word, freq in counter.most_common(VOCAB_LIMIT - 4):
            vocab_f.write(word + ' ' + str(freq) + '\n')
        vocab_f.write('<s> 0\n')
        vocab_f.write('</s> 0\n')
        vocab_f.write('<UNK> 0\n')
        vocab_f.write('<PAD> 0\n')
def generator(file_name):
    """Yield (inputs, targets) batches from file_name, looping forever.

    Returns a tuple (inputs, targets); all arrays contain the same number
    of samples. The generator is expected to loop over its data
    indefinitely — an epoch finishes when samples_per_epoch samples have
    been seen by the model.
    """
    ctable = CharacterTable(read_top_chars())
    batch_of_answers = []
    while True:
        # Re-open the file each pass so the stream restarts; a partially
        # filled batch carries over across file reopens.
        with open(file_name) as answers:
            for answer in answers:
                # NOTE(review): .decode on a line read from a text-mode file
                # implies Python 2 byte strings — confirm interpreter version.
                batch_of_answers.append(answer.strip().decode('utf-8'))
                if len(batch_of_answers) == CONFIG.batch_size:
                    random_shuffle(batch_of_answers)
                    batch_of_questions = []
                    for answer_index, answer in enumerate(batch_of_answers):
                        # generate_question may also rewrite the answer
                        # (e.g. padding), so store the returned version back.
                        question, answer = generate_question(answer)
                        batch_of_answers[answer_index] = answer
                        assert len(answer) == CONFIG.max_input_len
                        # optionally reverse the input sequence
                        question = question[::-1] if CONFIG.inverted else question
                        batch_of_questions.append(question)
                    X, y = _vectorize(batch_of_questions, batch_of_answers, ctable)
                    yield X, y
                    batch_of_answers = []
def generate_examples(self, corpus):
    """Generate examples of misspellings.

    corpus: a list of text lines; consumed destructively via pop() until empty.
    Returns (questions, answers): noisy input strings and their clean targets,
    each padded with '.' to exactly MAX_INPUT_LEN characters.
    """
    print("Generating examples")
    questions, answers, seen_answers = [], [], set()
    while corpus:
        line = corpus.pop()
        # Carve the line into chunks of at most MAX_INPUT_LEN characters,
        # preferring to break at a space.
        while len(line) > MIN_INPUT_LEN:
            if len(line) <= MAX_INPUT_LEN:
                answer = line
                line = ""
            else:
                #print(line)
                space_location = line.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                #print(space_location)
                if space_location > -1:
                    answer = line[:space_location]
                    line = line[len(answer) + 1:]
                else:
                    space_location = line.rfind(" ")  # no limits this time
                    if space_location == -1:
                        break  # we are done with this line
                    else:
                        # skip the over-long leading word and retry the loop
                        line = line[space_location + 1:]
                        continue
            if answer and answer in seen_answers:
                continue  # deduplicate answers
            seen_answers.add(answer)
            answers.append(answer)
    print('Shuffle')
    random_shuffle(answers)
    print("Shuffled")
    for answer_index, answer in enumerate(answers):
        # question = answer with synthetic noise; both padded to fixed length
        question = self.add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question += '.' * (MAX_INPUT_LEN - len(question))
        answer += "." * (MAX_INPUT_LEN - len(answer))
        answers[answer_index] = answer
        assert len(answer) == MAX_INPUT_LEN
        # optionally reverse the input sequence
        question = question[::-1] if self.inverted else question
        questions.append(question)
    print("Generated questions and answers")
    return questions, answers
def preprocess_partition_data():
    """Partition the split news file into training and validation files.

    The last 10% of the shuffled lines are held out as validation data
    that is never trained over.
    """
    lines = open(NEWS_FILE_NAME_SPLIT).read().decode('utf-8').split("\n")
    print('shuffle', end=" ")
    random_shuffle(lines)
    print("Done")
    # Hold out the final tenth of the shuffled lines for validation.
    boundary = len(lines) - len(lines) // 10
    train_lines, validate_lines = lines[:boundary], lines[boundary:]
    with open(NEWS_FILE_NAME_TRAIN, "wb") as train_file:
        train_file.write("\n".join(train_lines).encode('utf-8'))
    with open(NEWS_FILE_NAME_VALIDATE, "wb") as validate_file:
        validate_file.write("\n".join(validate_lines).encode('utf-8'))
def WriteBin(stories_directory, bin_directory, outfiles, fraction, makevocab=True):
    """Shuffle the story files and serialize them as chunked binary files.

    For each entry in outfiles, a fraction[index] share of the stories is
    written (as length-prefixed serialized Example protos) into files named
    '<outfile>_<n>.bin' holding at most CHUNK_SIZE stories each. When
    makevocab is True, a 'vocab' file of token counts is also written to
    bin_directory.
    """
    stories = _get_filenames(stories_directory)
    random_shuffle(stories)
    if makevocab:
        vocab_counter = collections.Counter()  # token -> count across all stories
    print("Writing bin file")
    index_start = 0
    progress_bar = ProgressBar(len(stories))
    for index, outfile in enumerate(outfiles):
        counts = int(len(stories) * fraction[index])
        index_stop = min(index_start + counts, len(stories))
        index1 = index_start
        fileindex = 0  # numeric suffix of the current chunk file
        while index1 < index_stop:
            # advance by at most CHUNK_SIZE stories per output chunk file
            index1 = min(index_start + CHUNK_SIZE, index_stop)
            story_files = stories[index_start:index1]
            with open(join(bin_directory, outfile + '_' + str(fileindex) + '.bin'), 'wb') as writer:
                for story in story_files:
                    article, abstract = ParseStory(join(stories_directory, story))
                    tf_example = example_pb2.Example()
                    tf_example.features.feature['article'].bytes_list.value.extend([article])
                    tf_example.features.feature['abstract'].bytes_list.value.extend([abstract])
                    tf_example_str = tf_example.SerializeToString()
                    str_len = len(tf_example_str)
                    # length-prefixed record: 8-byte size, then the payload
                    writer.write(struct.pack('q', str_len))
                    writer.write(struct.pack('%ds' % str_len, tf_example_str))
                    progress_bar.Increment()
                    if makevocab:
                        art_tokens = article.split()
                        abs_tokens = abstract.split()
                        tokens = art_tokens + abs_tokens
                        # drop empty tokens before counting
                        tokens = [t.strip() for t in tokens if t not in [None, '']]
                        vocab_counter.update(tokens)
            fileindex += 1
            index_start = index1
    print("Done writing bin file to directory \"%s\" " % bin_directory)
    if makevocab:
        print("Writing vocab file...")
        with open(join(bin_directory, "vocab"), 'w') as writer:
            for word, count in vocab_counter.most_common(VOCAB_SIZE):
                writer.write(word + ' ' + str(count) + '\n')
        print("Done writing vocab file to directory \"%s\" " % bin_directory)
def _text_to_binary(input_directories, output_filenames, split_fractions):
    """Shuffle the input files and convert each fractional split to binary.

    Successive output files receive consecutive, disjoint slices of the
    shuffled file list, sized by the matching split fraction.
    """
    filenames = _get_filenames(input_directories)
    random_shuffle(filenames)
    cursor = 0
    for split_index, output_filename in enumerate(output_filenames):
        sample_count = int(len(filenames) * split_fractions[split_index])
        print(output_filename + ': ' + str(sample_count))
        upper = min(cursor + sample_count, len(filenames))
        _convert_files_to_binary(filenames[cursor:upper], output_filename)
        cursor = upper
def WriteTFrecords(stories_directory, tf_directory, outfiles, fraction):
    """Shuffle the story files and serialize them into chunked TFRecord files.

    For each entry in outfiles, a fraction[index] share of the stories is
    written into files named '<outfile>_<n>.tfrecord' of at most CHUNK_SIZE
    stories each. Stories that fail to parse are skipped (best-effort).

    Fixes: the bare `except:` around ParseStory also swallowed
    KeyboardInterrupt/SystemExit — narrowed to `except Exception`. The
    TFRecordWriter was leaked if a write raised — now closed in `finally`.
    """
    stories = [
        f for f in listdir(stories_directory)
        if isfile(join(stories_directory, f))
    ]
    random_shuffle(stories)
    print("Writing TFrecords files")
    print('story dir {} has {} stories'.format(stories_directory, len(stories)))
    index_start = 0
    for index, outfile in enumerate(outfiles):
        counts = int(len(stories) * fraction[index])
        index_stop = min(index_start + counts, len(stories))
        index1 = index_start
        fileindex = 0  # numeric suffix of the current chunk file
        while index1 < index_stop:
            # advance by at most CHUNK_SIZE stories per output chunk file
            index1 = min(index_start + CHUNK_SIZE, index_stop)
            story_files = stories[index_start:index1]
            writer = tf.python_io.TFRecordWriter(
                join(tf_directory, outfile + '_' + str(fileindex) + '.tfrecord'))
            try:
                for story in story_files:
                    try:
                        (article_sentence_list, abstract, label_list,
                         title, file_name) = ParseStory(
                             join(stories_directory, story))
                    except Exception:
                        # best-effort: skip unparseable stories, but do not
                        # swallow KeyboardInterrupt/SystemExit
                        continue
                    tf_example = example_pb2.Example()
                    features = tf_example.features.feature
                    features['article'].bytes_list.value.extend(
                        [article_sentence_list])
                    features['abstract'].bytes_list.value.extend([abstract])
                    features['label_list'].bytes_list.value.extend([label_list])
                    features['title'].bytes_list.value.extend([title])
                    features['file_name'].bytes_list.value.extend([file_name])
                    writer.write(tf_example.SerializeToString())
            finally:
                writer.close()  # close the record file even if a write fails
            fileindex += 1
            index_start = index1
    print("Done writing TFrecords file to directory \"%s\" " % tf_directory)
def generate_news_data(corpus):
    """Generate some news data.

    corpus: list of text lines, consumed destructively via pop().
    Returns (questions, answers): noisy question strings and their clean
    answer strings, each padded with '.' to exactly MAX_INPUT_LEN.
    """
    print("Generating Data")
    questions, answers, seen_answers = [], [], set()
    while corpus:
        line = corpus.pop()
        # Carve the line into chunks of at most MAX_INPUT_LEN characters,
        # preferring to break at a space.
        while len(line) > MIN_INPUT_LEN:
            if len(line) <= MAX_INPUT_LEN:
                answer = line
                line = ""
            else:
                space_location = line.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if space_location > -1:
                    answer = line[:space_location]
                    line = line[len(answer) + 1:]
                else:
                    space_location = line.rfind(" ")  # no limits this time
                    if space_location == -1:
                        break  # we are done with this line
                    else:
                        # skip the over-long leading word and retry the loop
                        line = line[space_location + 1:]
                        continue
            if answer and answer in seen_answers:
                continue  # deduplicate answers
            seen_answers.add(answer)
            answers.append(answer)
            if random_randint(100000) == 8:  # Show some progress
                print('.', end="")
    print('suffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        # question = answer with synthetic noise; both padded to fixed length
        question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question += '.' * (MAX_INPUT_LEN - len(question))
        answer += "." * (MAX_INPUT_LEN - len(answer))
        answers[answer_index] = answer
        assert len(answer) == MAX_INPUT_LEN
        if random_randint(100000) == 8:  # Show some progress
            print(len(seen_answers))
            print("answer: '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        # optionally reverse the input sequence
        question = question[::-1] if INVERTED else question
        questions.append(question)
    return questions, answers
def generate_news_data(corpus):
    """Generate some news data.

    corpus: list of text lines, consumed destructively via pop().
    Returns (questions, answers): noisy question strings paired with their
    clean answer strings, each '.'-padded to exactly MAX_INPUT_LEN.
    """
    print("Generating Data")
    questions, answers, seen_answers = [], [], set()
    while corpus:
        line = corpus.pop()
        # Break each line into chunks no longer than MAX_INPUT_LEN,
        # splitting at spaces when possible.
        while len(line) > MIN_INPUT_LEN:
            if len(line) <= MAX_INPUT_LEN:
                answer = line
                line = ""
            else:
                space_location = line.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if space_location > -1:
                    answer = line[:space_location]
                    line = line[len(answer) + 1:]
                else:
                    space_location = line.rfind(" ")  # no limits this time
                    if space_location == -1:
                        break  # we are done with this line
                    else:
                        # discard the over-long leading word and retry
                        line = line[space_location + 1:]
                        continue
            if answer and answer in seen_answers:
                continue  # skip duplicate answers
            seen_answers.add(answer)
            answers.append(answer)
            if random_randint(100000) == 8:  # Show some progress
                print('.', end="")
    print('suffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        # Noise the answer to form the question; pad both to fixed length.
        question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question += '.' * (MAX_INPUT_LEN - len(question))
        answer += "." * (MAX_INPUT_LEN - len(answer))
        answers[answer_index] = answer
        assert len(answer) == MAX_INPUT_LEN
        if random_randint(100000) == 8:  # Show some progress
            print(len(seen_answers))
            print("answer: '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        # optionally reverse the input sequence
        question = question[::-1] if INVERTED else question
        questions.append(question)
    return questions, answers
def split_data():
    """Split the filtered data into training (90%) and validation (10%) files.

    Does nothing if the training file has already been created.
    """
    from numpy.random import shuffle as random_shuffle
    if os.path.isfile(FILE_NAME_TRAIN):
        print("Training and Validation files already created.")
        return
    lines = open(FILE_NAME_FILTERED, encoding="utf8").read().split("\n")
    print('shuffle', end=" ")
    random_shuffle(lines)
    print("Done")
    # Explicitly hold out 10% of the shuffled lines as validation data
    # that we never train over.
    cut = len(lines) - len(lines) // 10
    with open(FILE_NAME_TRAIN, "wb") as train_out:
        train_out.write("\n".join(lines[:cut]).encode('utf-8'))
    with open(FILE_NAME_VALIDATE, "wb") as validate_out:
        validate_out.write("\n".join(lines[cut:]).encode('utf-8'))
    print("\nTraining and Validation files written.")
def _text_to_binary(output_files, split_fractions):
    """Split the loaded JSON data by fraction and convert each split to binary.

    The shuffled data is partitioned into consecutive, disjoint slices —
    one per output file — sized by the matching split fraction, and each
    slice is handed to _convert_json_to_binary.
    """
    loaded_data = _get_json_file_data()
    random_shuffle(loaded_data)
    cursor = 0
    for split_index, file_out in enumerate(output_files):
        # number of examples belonging to this output file
        sample_count = int(len(loaded_data) * split_fractions[split_index])
        print(file_out + ": " + str(sample_count))
        # end of this split's slice, clamped to the data length
        upper = min(cursor + sample_count, len(loaded_data))
        _convert_json_to_binary(loaded_data[cursor:upper], file_out)
        cursor = upper
def _log_with_timestamp(message):
    """Print message prefixed with the current 'YYYY-mm-dd HH:MM:SS' time."""
    print(
        datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S") + " " + message)


def get_sample_data(sample_size):
    """Set aside sample data for grid search.

    sample_size: float — fraction of all lines written to the sample file.

    Fix: the timestamp-prefixed print boilerplate was repeated three times;
    it is now a single helper. Output is unchanged.
    """
    all_data = open(NEWS_FILE_NAME_SPLIT).read().decode('utf-8').split("\n")
    _log_with_timestamp('Shuffling data...')
    random_shuffle(all_data)
    # Explicitly set apart sample_size as sample
    split_at = int(round(len(all_data) * sample_size))
    _log_with_timestamp('Saving sample...')
    with open(NEWS_FILE_NAME_SAMPLE.format(sample_size), "wb") as output_file:
        output_file.write("\n".join(all_data[:split_at]).encode('utf-8'))
    _log_with_timestamp('Done.')
def generate_news_data():
    """Build (questions, answers) pairs from the pre-split news file.

    Each shuffled line is turned into a question/answer pair via
    generate_question; questions are reversed when CONFIG.inverted is set.
    """
    print("Generating Data")
    answers = open(NEWS_FILE_NAME_SPLIT).read().decode('utf-8').split("\n")
    questions = []
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for idx, raw_answer in enumerate(answers):
        question, cleaned = generate_question(raw_answer)
        answers[idx] = cleaned
        assert len(cleaned) == CONFIG.max_input_len
        # Roughly 1-in-100000 chance per item: show some progress.
        if random_randint(100000) == 8:
            print(len(answers))
            print("answer: '{}'".format(cleaned))
            print("question: '{}'".format(question))
            print()
        if CONFIG.inverted:
            question = question[::-1]
        questions.append(question)
    return questions, answers
def _text_to_binary(input_folder, output_files, split_fractions):
    """Split the JSON files in input_folder across output_files by fraction
    and convert each split to binary.

    Fix: the original restarted its iteration at the beginning of file_list
    for EVERY output file, so the same leading documents were written into
    each split (training/testing/validation overlapped). Splits are now
    consecutive, disjoint slices of the shuffled file list, matching the
    sibling splitters in this module.
    """
    path = input_folder + '/'
    file_list = os.listdir(path)
    length_file_list = len(file_list)
    random_shuffle(file_list)
    start = 0
    for index, file_out in enumerate(output_files):
        sample_count = int(length_file_list * split_fractions[index])
        # this split covers file_list[start:end], clamped to the list length
        end = min(start + sample_count, length_file_list)
        for filename in file_list[start:end]:
            document = _get_json_file_data(path + filename)
            _convert_json_to_binary(document, file_out)
        start = end  # the next split continues where this one ended
        print(file_out + ": " + str(sample_count))
def _text_to_binary(input_folder, output_files, split_fractions):
    """Split the JSON files in input_folder across output_files by fraction
    and convert each split to binary.

    Fixes: (1) `end = min(..., len(file_list) - 1)` was an off-by-one that
    silently dropped the last file; (2) `start` only advanced when the
    fragile `count_doc + 1 == sample_count` condition happened to fire, so
    successive splits could overlap or repeat documents. The split is now a
    plain disjoint slice per output file.

    Returns True on completion (interface preserved).
    """
    file_list = os.listdir(input_folder)
    length_file_list = len(file_list)
    random_shuffle(file_list)
    start = 0
    for index, file_out in enumerate(output_files):
        sample_count = int(length_file_list * split_fractions[index])
        # this split covers file_list[start:end], clamped to the list length
        end = min(start + sample_count, length_file_list)
        for filename in file_list[start:end]:
            document = _get_json_file_data(input_folder + filename)
            _convert_json_to_binary(document, file_out)
        start = end  # the next split continues where this one ended
        print(file_out + ": " + str(sample_count))
    return True