import numpy as np


def __init__(self, name, document, vocab, index):
    self.index = index
    self.name = name
    # Tokenize the document into sentences of word tokens, keep only
    # in-vocabulary words, and store their integer ids. A membership
    # test is used so that a word mapped to id 0 is not dropped
    # (0 is falsy, so a bare truthiness check would discard it).
    self.words = np.array([
        vocab.get(word)
        for sentence in to_raw_text_markupless(document)
        for word in sentence
        if word in vocab
    ], dtype='int32')
    self.size = len(self.words)
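# Usage sketch (an assumption, not from the original source): the
# constructor above presumably belongs to a document-wrapper class;
# `Document` is a hypothetical name for it, and `to_raw_text_markupless`
# is assumed to yield a list of sentences, each a list of token strings.
vocab = {"the": 1, "cat": 2, "sat": 3}
doc = Document(name="example",
               document="the cat sat on the mat",
               vocab=vocab,
               index=0)
print(doc.size)   # number of in-vocabulary tokens kept
print(doc.words)  # e.g. array([1, 2, 3, 1], dtype=int32); exact splits depend on the tokenizer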
def to_token_string(text):
    tokens = to_raw_text_markupless(text)
    tokens = [' '.join(sentence_tokens) for sentence_tokens in tokens]
    tokens = ' '.join(tokens)
    return tokens
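# Quick illustration (hedged: exact token boundaries depend on the
# tokenizer behind to_raw_text_markupless):
flat = to_token_string("Hello there. How are you?")
print(flat)  # e.g. "Hello there . How are you ?"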
def tokenize_and_write(file, text, token):
    for sentence in to_raw_text_markupless(text):
        file.write(" ".join(sentence))
        file.write(token)
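# Because the function only needs a file-like object, an in-memory
# buffer works for a quick test (a sketch, not from the original source):
import io

buffer = io.StringIO()
tokenize_and_write(buffer, "First sentence. Second sentence.", "\n")
print(buffer.getvalue())  # each tokenized sentence followed by the "\n" separator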
from collections import Counter


def collect_counts(documents):
    # Count word occurrences across the tokenized sentences of every document.
    vocab = Counter()
    for value in documents.values():
        vocab.update(
            word
            for sentence in to_raw_text_markupless(value)
            for word in sentence
        )
    return vocab
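# Example (a sketch; counts depend on the tokenizer's splitting):
documents = {
    "doc1": "the cat sat",
    "doc2": "the cat ran",
}
counts = collect_counts(documents)
print(counts.most_common(2))  # e.g. [('the', 2), ('cat', 2)]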
def tokenize_sentences(text):
    # Tokenize each tab-separated field, then re-join: the first two
    # fields stay tab-separated, and any further sentences are appended
    # space-separated (with a leading space so tokens do not run together).
    sentences = text.strip().split("\t")
    gen_sentences = [
        " ".join(tsentence)
        for sentence in sentences
        for tsentence in to_raw_text_markupless(sentence)
    ]
    head = "\t".join(gen_sentences[0:2])
    tail = gen_sentences[2:]
    return (head + " " + " ".join(tail)) if tail else head
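# Example on a question-answer pair (hypothetical input; the exact
# tokenization depends on to_raw_text_markupless): the question and the
# first answer sentence stay tab-separated, later answer sentences are
# appended with spaces.
line = "What is a cat?\tA cat is a feline. It purrs."
print(tokenize_sentences(line))
# e.g. "What is a cat ?\tA cat is a feline . It purrs ."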
print("Generated %d question answer pairs" % (len(output_content) )) print("Skipped %d pairs because of answer shorter than %d words" % (num_too_short, MIN_ANSWER_LENGTH)) print("Skipped %d because of encoding issues." % (num_nonascii,)) num_valid = 0 num_train = 0 with open(VALIDATE_FILE, 'wt') as fvalid: with open(TRAIN_FILE, 'wt') as ftrain: for i, qa in enumerate(output_content): question, answer = qa print_progress(i, len(output_content)) question_tokens = [] answer_tokens = [] for line in to_raw_text_markupless(question): question_tokens.extend(line) for line in to_raw_text_markupless(answer): answer_tokens.extend(line) output_line = '%s\t%s\n' % (' '.join(question_tokens), ' '.join(answer_tokens)) if random.random() < VALIDATION_SIZE: fvalid.write(output_line) num_valid += 1 else: ftrain.write(output_line) num_train += 1 print("Saved %d pairs in %s" % (num_train, TRAIN_FILE))