Code Example #1
def text_to_binary(input_directories, output_filenames, split_fractions):
    filenames = get_filenames(input_directories)
    random_shuffle(filenames)
    start_from_index = 0
    counter = collections.Counter()  # for the vocab counts

    count = 0
    for index, output_filename in enumerate(output_filenames):
        sample_count = int(len(filenames) * split_fractions[index])
        print(output_filename + ': ' + str(sample_count))
        end_index = min(start_from_index + sample_count, len(filenames))
        convert_files_to_binary(filenames[start_from_index:end_index],
                                output_filename, counter)
        count += len(filenames[start_from_index:end_index])
        print("num files handled: ", count)
        start_from_index = end_index

    # create vocab file
    with open('vocab', 'w+') as vocab_f:
        for word, count in counter.most_common(VOCAB_LIMIT - 4):
            vocab_f.write(word + ' ' + str(count) + '\n')
        vocab_f.write('<s> 0\n')
        vocab_f.write('</s> 0\n')
        vocab_f.write('<UNK> 0\n')
        vocab_f.write('<PAD> 0\n')
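Note: these snippets call random_shuffle without showing its import. In most of the source projects it is an alias for the in-place standard-library shuffle (Code Example #10 below instead aliases numpy.random.shuffle under the same name). A minimal sketch of that assumption, with an optional fixed seed for reproducible splits:

import random
from random import shuffle as random_shuffle

random.seed(42)            # optional: makes the shuffled split reproducible
filenames = ['a.txt', 'b.txt', 'c.txt', 'd.txt']
random_shuffle(filenames)  # shuffles the list in place and returns None
print(filenames)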
Code Example #2
def generator(file_name):
    """Returns a tuple (inputs, targets)
    All arrays should contain the same number of samples.
    The generator is expected to loop over its data indefinitely.
    An epoch finishes when samples_per_epoch samples have been seen by the model.
    """
    ctable = CharacterTable(read_top_chars())
    batch_of_answers = []
    while True:
        with open(file_name) as answers:
            for answer in answers:
                batch_of_answers.append(answer.strip().decode('utf-8'))
                if len(batch_of_answers) == CONFIG.batch_size:
                    random_shuffle(batch_of_answers)
                    batch_of_questions = []
                    for answer_index, answer in enumerate(batch_of_answers):
                        question, answer = generate_question(answer)
                        batch_of_answers[answer_index] = answer
                        assert len(answer) == CONFIG.max_input_len
                        question = question[::-1] if CONFIG.inverted else question
                        batch_of_questions.append(question)
                    X, y = _vectorize(batch_of_questions, batch_of_answers,
                                      ctable)
                    yield X, y
                    batch_of_answers = []
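The generator above yields (X, y) batches indefinitely, so the caller decides how many batches make up an epoch. A minimal sketch of pulling batches by hand; the file name is a placeholder, not a path from the source project:

batches = generator('answers.txt')   # hypothetical input file
for _ in range(2):
    X, y = next(batches)             # one vectorized batch per call
    print(X.shape, y.shape)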
Code Example #3
    def generate_examples(self, corpus):
        """Generate examples of misspellings"""

        print("Generating examples")

        questions, answers, seen_answers = [], [], set()

        while corpus:
            line = corpus.pop()

            while len(line) > MIN_INPUT_LEN:
                if len(line) <= MAX_INPUT_LEN:
                    answer = line
                    line = ""
                else:
                    #print(line)

                    space_location = line.rfind(" ", MIN_INPUT_LEN,
                                                MAX_INPUT_LEN - 1)
                    #print(space_location)
                    if space_location > -1:
                        answer = line[:space_location]
                        line = line[len(answer) + 1:]
                    else:
                        space_location = line.rfind(" ")  # no limits this time
                        if space_location == -1:
                            break  # we are done with this line
                        else:
                            line = line[space_location + 1:]
                            continue

                if answer and answer in seen_answers:
                    continue

                seen_answers.add(answer)
                answers.append(answer)

        print('Shuffle')
        random_shuffle(answers)
        print("Shuffled")

        for answer_index, answer in enumerate(answers):
            question = self.add_noise_to_string(answer, AMOUNT_OF_NOISE)
            question += '.' * (MAX_INPUT_LEN - len(question))
            answer += "." * (MAX_INPUT_LEN - len(answer))
            answers[answer_index] = answer
            assert len(answer) == MAX_INPUT_LEN

            question = question[::-1] if self.inverted else question
            questions.append(question)

        print("Generated questions and answers")

        return questions, answers
Code Example #4
def preprocess_partition_data():
    """Set asside data for validation"""
    answers = open(NEWS_FILE_NAME_SPLIT).read().decode('utf-8').split("\n")
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    # Explicitly set apart 10% for validation data that we never train over
    split_at = len(answers) - len(answers) // 10
    with open(NEWS_FILE_NAME_TRAIN, "wb") as output_file:
        output_file.write("\n".join(answers[:split_at]).encode('utf-8'))
    with open(NEWS_FILE_NAME_VALIDATE, "wb") as output_file:
        output_file.write("\n".join(answers[split_at:]).encode('utf-8'))
Code Example #5
File: checkbin.py Project: zwc12/Summarization
def WriteBin(stories_directory, bin_directory, outfiles, fraction, makevocab=True):
  
  stories = _get_filenames(stories_directory)
  random_shuffle(stories)
  
  if makevocab:
    vocab_counter = collections.Counter()
    
  print("Writing bin file")
  
  index_start = 0
  progress_bar = ProgressBar(len(stories))
  for index, outfile in enumerate(outfiles):
    counts = int(len(stories) * fraction[index])
    index_stop = min(index_start + counts, len(stories))
    index1 = index_start
    fileindex = 0
    
    while index1 < index_stop:
      index1 = min(index_start + CHUNK_SIZE, index_stop)
      story_files = stories[index_start:index1]
      
      with open(join(bin_directory, outfile + '_' + str(fileindex) + '.bin'), 'wb') as writer:
        for story in story_files:
          article, abstract = ParseStory(join(stories_directory,story))
        
          tf_example = example_pb2.Example()
          tf_example.features.feature['article'].bytes_list.value.extend([article])
          tf_example.features.feature['abstract'].bytes_list.value.extend([abstract])
          tf_example_str = tf_example.SerializeToString()
          str_len = len(tf_example_str)
          writer.write(struct.pack('q', str_len))
          writer.write(struct.pack('%ds' % str_len, tf_example_str))
          progress_bar.Increment()
          if makevocab:
            art_tokens = article.split()
            abs_tokens = abstract.split()
            tokens = art_tokens + abs_tokens
            tokens = [t.strip() for t in tokens if t not in [None, '']]
            vocab_counter.update(tokens)
      fileindex += 1
      index_start = index1
  print("Done writing bin file to directory \"%s\" " % bin_directory)
  
  if makevocab:
    print("Writing vocab file...")
    with open(join(bin_directory, "vocab"), 'w') as writer:
      for word, count in vocab_counter.most_common(VOCAB_SIZE):
        writer.write(word + ' ' + str(count) + '\n')
    print("Done writing vocab file to directory \"%s\" " % bin_directory)
Code Example #6
def _text_to_binary(input_directories, output_filenames, split_fractions):
  filenames = _get_filenames(input_directories)
  
  random_shuffle(filenames)
  
  start_from_index = 0
  for index, output_filename in enumerate(output_filenames):
    sample_count = int(len(filenames) * split_fractions[index])
    print(output_filename + ': ' + str(sample_count))
    
    end_index = min(start_from_index + sample_count, len(filenames))
    _convert_files_to_binary(filenames[start_from_index:end_index], output_filename)
    
    start_from_index = end_index
Code Example #7
def WriteTFrecords(stories_directory, tf_directory, outfiles, fraction):
    stories = [
        f for f in listdir(stories_directory)
        if isfile(join(stories_directory, f))
    ]
    random_shuffle(stories)

    print("Writing TFrecords files")
    print('story dir {} has {} stories'.format(stories_directory,
                                               len(stories)))

    index_start = 0
    for index, outfile in enumerate(outfiles):
        counts = int(len(stories) * fraction[index])
        index_stop = min(index_start + counts, len(stories))
        index1 = index_start
        fileindex = 0

        while index1 < index_stop:
            index1 = min(index_start + CHUNK_SIZE, index_stop)
            story_files = stories[index_start:index1]

            writer = tf.python_io.TFRecordWriter(
                join(tf_directory,
                     outfile + '_' + str(fileindex) + '.tfrecord'))
            for story in story_files:
                try:
                    article_sentence_list, abstract, label_list, title, file_name = ParseStory(
                        join(stories_directory, story))
                except Exception:  # skip stories that fail to parse
                    continue
                tf_example = example_pb2.Example()
                tf_example.features.feature['article'].bytes_list.value.extend(
                    [article_sentence_list])
                tf_example.features.feature[
                    'abstract'].bytes_list.value.extend([abstract])
                tf_example.features.feature[
                    'label_list'].bytes_list.value.extend([label_list])
                tf_example.features.feature['title'].bytes_list.value.extend(
                    [title])
                tf_example.features.feature[
                    'file_name'].bytes_list.value.extend([file_name])
                tf_example_str = tf_example.SerializeToString()
                writer.write(tf_example_str)
            writer.close()
            fileindex += 1
            index_start = index1
    print("Done writing TFrecords file to directory \"%s\" " % tf_directory)
Code Example #8
def generate_news_data(corpus):
    """Generate some news data"""
    print("Generating Data")
    questions, answers, seen_answers = [], [], set()
    while corpus:
        line = corpus.pop()
        while len(line) > MIN_INPUT_LEN:
            if len(line) <= MAX_INPUT_LEN:
                answer = line
                line = ""
            else:
                space_location = line.rfind(" ", MIN_INPUT_LEN,
                                            MAX_INPUT_LEN - 1)
                if space_location > -1:
                    answer = line[:space_location]
                    line = line[len(answer) + 1:]
                else:
                    space_location = line.rfind(" ")  # no limits this time
                    if space_location == -1:
                        break  # we are done with this line
                    else:
                        line = line[space_location + 1:]
                        continue
            if answer and answer in seen_answers:
                continue
            seen_answers.add(answer)
            answers.append(answer)
        if random_randint(100000) == 8:  # Show some progress
            print('.', end="")
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question += '.' * (MAX_INPUT_LEN - len(question))
        answer += "." * (MAX_INPUT_LEN - len(answer))
        answers[answer_index] = answer
        assert len(answer) == MAX_INPUT_LEN
        if random_randint(100000) == 8:  # Show some progress
            print(len(seen_answers))
            print("answer:   '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        question = question[::-1] if INVERTED else question
        questions.append(question)

    return questions, answers
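add_noise_to_string is defined elsewhere in the source project. Purely as an illustration of the shape of that helper (string in, corrupted string out), a hypothetical stand-in might look like this; it is not the project's implementation:

import random
import string

def add_noise_to_string(text, amount_of_noise):
    """Hypothetical noise function: corrupt roughly amount_of_noise * len(text)
    characters by substitution, deletion or insertion."""
    chars = list(text)
    num_edits = max(1, int(amount_of_noise * len(chars)))
    for _ in range(num_edits):
        if not chars:
            break
        position = random.randrange(len(chars))
        op = random.choice(['substitute', 'delete', 'insert'])
        if op == 'substitute':
            chars[position] = random.choice(string.ascii_lowercase)
        elif op == 'delete':
            del chars[position]
        else:
            chars.insert(position, random.choice(string.ascii_lowercase))
    return ''.join(chars)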
Code Example #9
File: keras_spell.py Project: omshanti/DeepSpell
def generate_news_data(corpus):
    """Generate some news data"""
    print ("Generating Data")
    questions, answers, seen_answers = [], [], set()
    while corpus:
        line = corpus.pop()
        while len(line) > MIN_INPUT_LEN:
            if len(line) <= MAX_INPUT_LEN:
                answer = line
                line = ""
            else:
                space_location = line.rfind(" ", MIN_INPUT_LEN, MAX_INPUT_LEN - 1)
                if space_location > -1:
                    answer = line[:space_location]
                    line = line[len(answer) + 1:]
                else:
                    space_location = line.rfind(" ") # no limits this time
                    if space_location == -1:
                        break # we are done with this line
                    else:
                        line = line[space_location + 1:]
                        continue
            if answer and answer in seen_answers:
                continue
            seen_answers.add(answer)
            answers.append(answer)
        if random_randint(100000) == 8: # Show some progress
            print('.', end="")
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        question = add_noise_to_string(answer, AMOUNT_OF_NOISE)
        question += '.' * (MAX_INPUT_LEN - len(question))
        answer += "." * (MAX_INPUT_LEN - len(answer))
        answers[answer_index] = answer
        assert len(answer) == MAX_INPUT_LEN
        if random_randint(100000) == 8: # Show some progress
            print (len(seen_answers))
            print ("answer:   '{}'".format(answer))
            print ("question: '{}'".format(question))
            print ()
        question = question[::-1] if INVERTED else question
        questions.append(question)

    return questions, answers
Code Example #10
def split_data():

    from numpy.random import shuffle as random_shuffle

    if os.path.isfile(FILE_NAME_TRAIN):
        print("Training and Validation files already created.")
    else:
        answers = open(FILE_NAME_FILTERED, encoding="utf8").read().split("\n")
        print('shuffle', end=" ")
        random_shuffle(answers)
        print("Done")
        # Explicitly set apart 10% for validation data that we never train over
        # skip if files already exist
        split_at = len(answers) - len(answers) // 10
        with open(FILE_NAME_TRAIN, "wb") as output_file:
            output_file.write("\n".join(answers[:split_at]).encode('utf-8'))
        with open(FILE_NAME_VALIDATE, "wb") as output_file:
            output_file.write("\n".join(answers[split_at:]).encode('utf-8'))
        print("\nTraining and Validation files written.")
Code Example #11
def _text_to_binary(output_files, split_fractions):
    """ Splitting the input data by split fractions
        for training, testing and validation and
        passing the output file accordingly to _convert_json_to_binary"""
    loaded_data = _get_json_file_data()

    random_shuffle(loaded_data)

    start_from_index = 0
    for index, file_out in enumerate(output_files):
        # calculating no of examples in given file.
        sample_count = int(len(loaded_data) * split_fractions[index])
        print(file_out + ": " + str(sample_count))
        # computing corresponding index.
        end_index = min(start_from_index + sample_count, len(loaded_data))
        # converting the block to binary.
        _convert_json_to_binary(loaded_data[start_from_index:end_index],
                                file_out)
        # updating index.
        start_from_index = end_index
Code Example #12
def get_sample_data(sample_size):
    """Set asside sample data for grid search.
    sample_size: float
    """
    all_data = open(NEWS_FILE_NAME_SPLIT).read().decode('utf-8').split("\n")
    print(
        datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S") + " " +
        'Shuffling data...')
    random_shuffle(all_data)

    # Explicitly set apart sample_size as sample
    split_at = int(round(len(all_data) * sample_size))
    print(
        datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S") + " " +
        'Saving sample...')
    with open(NEWS_FILE_NAME_SAMPLE.format(sample_size), "wb") as output_file:
        output_file.write("\n".join(all_data[:split_at]).encode('utf-8'))

    print(
        datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S") + " " + 'Done.')
Code Example #13
def generate_news_data():
    """Generate some news data"""
    print("Generating Data")
    answers = open(NEWS_FILE_NAME_SPLIT).read().decode('utf-8').split("\n")
    questions = []
    print('shuffle', end=" ")
    random_shuffle(answers)
    print("Done")
    for answer_index, answer in enumerate(answers):
        question, answer = generate_question(answer)
        answers[answer_index] = answer
        assert len(answer) == CONFIG.max_input_len
        if random_randint(100000) == 8:  # Show some progress
            print(len(answers))
            print("answer:   '{}'".format(answer))
            print("question: '{}'".format(question))
            print()
        question = question[::-1] if CONFIG.inverted else question
        questions.append(question)

    return questions, answers
Code Example #14
def _text_to_binary(input_folder, output_files, split_fractions):
    """ Splitting the input data by split fractions
        for training, testing and validation and
        passing the output file accordingly to _convert_json_to_binary"""

    path = input_folder + '/'
    count_doc = 0
    file_list = os.listdir(path)
    length_file_list = len(file_list)
    random_shuffle(file_list)

    for index, file_out in enumerate(output_files):
        sample_count = int(length_file_list * split_fractions[index])
        for filename in file_list:
            count_doc += 1
            document = _get_json_file_data(path + filename)
            _convert_json_to_binary(document, file_out)

            if count_doc == sample_count:
                count_doc = 0
                break

        print(file_out + ": " + str(sample_count))
Code Example #15
def _text_to_binary(input_folder, output_files, split_fractions):
    """ Splitting the input data by split fractions
        for training, testing and validation and
        passing the output file accordingly to _convert_json_to_binary"""

    file_list = os.listdir(input_folder)
    length_file_list = len(file_list)
    random_shuffle(file_list)
    start = 0

    for index, file_out in enumerate(output_files):
        sample_count = int(length_file_list * split_fractions[index])
        end = min(start + sample_count, length_file_list)  # slice end for this split
        for filename in file_list[start:end]:
            document = _get_json_file_data(os.path.join(input_folder, filename))
            _convert_json_to_binary(document, file_out)
        start = end  # the next split continues where this one ended

        print(file_out + ": " + str(sample_count))
    return True
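All of the _text_to_binary variants above pair output file names with split fractions positionally. A hedged usage sketch; the folder, file names and fractions are placeholders, not values from any of the source projects:

# Hypothetical 80/10/10 train/validation/test split of a JSON corpus.
_text_to_binary('data/json_articles',
                ['train.bin', 'validation.bin', 'test.bin'],
                [0.8, 0.1, 0.1])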