def get_word2vec_and_vocab_part_2():
    allennlp_reader = TextCatReader(
        word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
        segment_sentences=True)
    label_iterator = LabelIterator(allennlp_reader, filepaths_of_data_to_train_on)
    make_label_ind_files(label_iterator)
def __init__(self,
             model_folder_name: str,
             token_indexers: Dict[str, TokenIndexer] = None,
             word_tokenizer: Tokenizer = None,
             segment_sentences: bool = False,
             lazy: bool = False,
             column_titles_to_index: List[str] = ("tokens", )) -> None:
    super().__init__(lazy=lazy)
    # Normalize the model folder name: strip any trailing slash and keep only
    # the final path component.
    if model_folder_name.endswith('/'):
        model_folder_name = model_folder_name[:-1]
    if '/' in model_folder_name:
        model_folder_name = model_folder_name[model_folder_name.rfind('/') + 1:]
    self.model_folder_name = model_folder_name
    self._word_tokenizer = word_tokenizer or WordTokenizer(
        word_filter=PassThroughWordFilter())
    self._segment_sentences = segment_sentences
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._column_titles_to_index = column_titles_to_index
    assert len(self._column_titles_to_index) > 0
    if self._segment_sentences:
        self._sentence_segmenter = SentenceTokenizer()
def __init__(self,
             word_splitter: WordSplitter = SimpleWordSplitter(),
             word_filter: WordFilter = PassThroughWordFilter(),
             word_stemmer: WordStemmer = PassThroughWordStemmer()) -> None:
    self.word_splitter = word_splitter
    self.word_filter = word_filter
    self.word_stemmer = word_stemmer
def __init__(self,
             word_splitter: WordSplitter = None,
             word_filter: WordFilter = PassThroughWordFilter(),
             word_stemmer: WordStemmer = PassThroughWordStemmer(),
             start_tokens: List[str] = None,
             end_tokens: List[str] = None) -> None:
    self._word_splitter = word_splitter or JiebaSplitter()
    self._word_filter = word_filter
    self._word_stemmer = word_stemmer
    self._start_tokens = start_tokens or []
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    self._start_tokens.reverse()
    self._end_tokens = end_tokens or []
def __init__(self,
             word_splitter: WordSplitter = None,
             word_filter: WordFilter = PassThroughWordFilter(),
             word_stemmer: WordStemmer = PassThroughWordStemmer(),
             start_tokens: List[str] = None,
             end_tokens: List[str] = None) -> None:
    self._word_splitter = word_splitter or SpacyWordSplitter()
    self._word_filter = word_filter
    self._word_stemmer = word_stemmer
    self._start_tokens = start_tokens or []
    # We reverse the tokens here because we're going to insert them with `insert(0)` later;
    # this makes sure they show up in the right order.
    self._start_tokens.reverse()
    self._end_tokens = end_tokens or []
def __init__(self,
             token_indexers: Dict[str, TokenIndexer] = None,
             word_tokenizer: Tokenizer = None,
             segment_sentences: bool = False,
             lazy: bool = False,
             column_titles_to_index: List[str] = ("tokens", )) -> None:
    super().__init__(lazy=lazy)
    self._word_tokenizer = word_tokenizer or WordTokenizer(
        word_filter=PassThroughWordFilter())
    self._segment_sentences = segment_sentences
    self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
    self._column_titles_to_index = column_titles_to_index
    assert len(self._column_titles_to_index) > 0
    if self._segment_sentences:
        self._sentence_segmenter = SentenceTokenizer()
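# A minimal usage sketch of this reader, not part of the original code. It relies only
# on what the surrounding snippets already use: the reader's _read() yields AllenNLP
# Instances whose 'tokens' field is a TextField when segment_sentences is False (as in
# the length-statistics script at the end of this file). "some_data_file.tsv" is a
# placeholder path.
reader = TextCatReader(
    word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
    segment_sentences=False)
for instance in reader._read(file_path="some_data_file.tsv"):
    print(len(instance.fields['tokens'].tokens))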
def get_word2vec_and_vocab_part_1():
    first_data_filepath = filepaths_of_data_to_train_on[0]
    allennlp_reader = TextCatReader(
        word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
        segment_sentences=True)
    sentence_iterator = GensimSentenceIterator(allennlp_reader, filepaths_of_data_to_train_on)
    words_to_inds, num_words_in_vocab_including_unk, total_num_sentences, vocab_filename_nonums = \
        get_vocab_and_collect_num_sentences(sentence_iterator, first_data_filepath)
    sentence_iterator.valid_vocab_words = words_to_inds
    if write_temp_file_of_all_provided_data_with_set_vocab:
        sentence_iterator, temp_data_filename = make_temp_file_and_iterate_over_that_instead(
            sentence_iterator)
        words_to_inds = None
    label_iterator = LabelIterator(allennlp_reader, filepaths_of_data_to_train_on)
    make_label_ind_files(label_iterator)
def __init__(self,
             word_splitter: WordSplitter = None,
             word_filter: WordFilter = PassThroughWordFilter(),
             word_stemmer: WordStemmer = PassThroughWordStemmer(),
             start_tokens: List[str] = None,
             end_tokens: List[str] = None,
             in_between_tokens: List[str] = None):
    self.tokenizer = WordTokenizer(word_splitter=word_splitter,
                                   word_filter=word_filter,
                                   word_stemmer=word_stemmer)
    self.start_tokens = [Token(token) for token in (start_tokens or [])]
    self.end_tokens = [Token(token) for token in (end_tokens or [])]
    self.in_between_tokens = [Token(token) for token in (in_between_tokens or [])]
def get_word2vec_and_vocab_part_3():
    first_data_filepath = filepaths_of_data_to_train_on[0]
    vocab_filename_nonums = first_data_filepath[:first_data_filepath.rfind('.')] + \
        vocabword_ind_not_numbered_file_ending
    num_words_in_vocab_including_unk = 0
    with open(vocab_filename_nonums, 'r') as f:
        for line in f:
            if line.strip() == '':
                continue
            num_words_in_vocab_including_unk += 1
    print(str(num_words_in_vocab_including_unk) + " words in vocab, including unk token.")
    if write_temp_file_of_all_provided_data_with_set_vocab:
        temp_data_filename = first_data_filepath + ".temp"
        sentence_iterator = SimpleIterator([temp_data_filename])
        total_num_sentences = -1
        with open(temp_data_filename, 'r') as f:
            for line in f:
                if line.strip() == '':
                    continue
                total_num_sentences += 1
    else:
        allennlp_reader = TextCatReader(
            word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
            segment_sentences=True)
        sentence_iterator = GensimSentenceIterator(allennlp_reader,
                                                   filepaths_of_data_to_train_on)
        total_num_sentences = 0
        for sentence in iter(sentence_iterator):
            total_num_sentences += 1
    sentence_iterator.total_num_sentences = total_num_sentences
    print("Starting to train model.")
    trained_model = Word2Vec(None,
                             iter=iterations_for_training_word_embeddings,
                             min_count=0,
                             size=word_embedding_dimension,
                             workers=4)
    trained_model.build_vocab(sentence_iterator)
    trained_model.train(sentence_iterator,
                        total_examples=total_num_sentences,
                        epochs=iterations_for_training_word_embeddings)
    temp_filename = first_data_filepath[:first_data_filepath.rfind('.')] + "_tempgensim"
    trained_model.save(temp_filename)
    print("Starting to move trained embeddings into numpy matrix at " +
          str(datetime.datetime.now()))
    np_embedding_filename = (first_data_filepath[:first_data_filepath.rfind('.')] +
                             embedding_file_tag + ".npy")
    num_vocab_words = num_words_in_vocab_including_unk - 1
    # Row 0 is never assigned below, so it stays all zeros; vocab words are
    # filled in starting at row 1, and the last row is set to the unk embedding.
    embedding_matrix = np.zeros((num_vocab_words + 2, word_embedding_dimension))
    ind_counter = 1
    with open(vocab_filename_nonums, 'r') as f:
        for line in f:
            if line.strip() == '':
                continue
            line = line[:-1]  # get rid of newline
            embedding_matrix[ind_counter] = trained_model[line]
            ind_counter += 1
    embedding_matrix[num_words_in_vocab_including_unk] = trained_model[unk_token]
    # L2-normalize each row, guarding against division by zero.
    norm_of_embeddings = np.linalg.norm(embedding_matrix, axis=1)
    norm_of_embeddings[norm_of_embeddings == 0] = 1e-13
    embedding_matrix = embedding_matrix / norm_of_embeddings[:, None]
    np.save(np_embedding_filename, embedding_matrix)
    print("Starting to save numpy matrix as hdf5 file containing torch tensor at " +
          str(datetime.datetime.now()))
    hdf5_filename = (first_data_filepath[:first_data_filepath.rfind('.')] +
                     embedding_file_tag + ".h5")
    with h5py.File(hdf5_filename, "w") as f:
        dset = f.create_dataset(
            "embedding",
            (num_words_in_vocab_including_unk + 1, word_embedding_dimension),
            dtype='f')
        dset[...] = embedding_matrix
    print("Removing temporary gensim model files at " + str(datetime.datetime.now()))
    # remove temp gensim model files, now that embedding matrix has been saved
    if os.path.isfile(temp_filename):
        os.remove(temp_filename)
    other_files_to_rm = glob(temp_filename + ".*")
    for fname in other_files_to_rm:
        os.remove(fname)
    if write_temp_file_of_all_provided_data_with_set_vocab:
        if os.path.isfile(temp_data_filename):
            os.remove(temp_data_filename)
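# Hypothetical helper, not part of the original pipeline: a sketch of how the ".h5"
# file written by get_word2vec_and_vocab_part_3 could be read back, assuming only
# that it contains the single "embedding" dataset created above and that h5py and
# torch are available. The function name and the use of torch are assumptions for
# illustration, not something the original code prescribes.
def load_pretrained_embedding_matrix(hdf5_filename):
    import h5py
    import torch
    with h5py.File(hdf5_filename, 'r') as f:
        # Shape (num_words_in_vocab_including_unk + 1, word_embedding_dimension);
        # row 0 was never assigned above, so it is all zeros, and the last row
        # holds the unk-token embedding.
        embedding_matrix = f['embedding'][...]
    return torch.from_numpy(embedding_matrix)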
def add_info_about_num_sents_and_max_num_tokens_to_data_file(data_filename):
    print("Starting to calculate information about number of sentences and max number of tokens for each " +
          "instance to add to " + data_filename)
    numsents_maxnumtokens = []
    numsents_maxnumtokens_if_not_considering_stop_words = []
    allennlp_formatted_reader = TextCatReader(
        word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
        segment_sentences=True)
    pass_through_word_filter = PassThroughWordFilter()
    stop_word_filter = StopwordFilter()
    first_line = True
    num_instances_passed = 0
    with open(data_filename, 'r') as f:
        for line in tqdm(f):
            if first_line:
                # find which field is tokens
                temp_line = line
                tokens_field_ind = 0
                while not (temp_line.startswith('tokens\t') or temp_line.startswith('tokens\n')):
                    temp_line = temp_line[temp_line.index('\t') + 1:]
                    tokens_field_ind += 1
                first_line = False
            else:
                if line.strip() == '':
                    continue
                text_field = get_nth_field_in_line(line, tokens_field_ind)
                num_instances_passed += 1
                if len(text_field) == 0:
                    # reader will skip this, so its tuple is (0, 0)
                    numsents_maxnumtokens.append((0, 0))
                    numsents_maxnumtokens_if_not_considering_stop_words.append((0, 0))
                    continue
                allennlp_formatted_reader._word_tokenizer._word_filter = pass_through_word_filter
                instance = allennlp_formatted_reader.text_to_instance(tokens=text_field,
                                                                      category='placeholder')
                if instance is None:
                    # reader will skip this, so its tuple is (0, 0)
                    # and since the stop word version is even MORE restrictive, it would be skipped there too
                    numsents_maxnumtokens.append((0, 0))
                    numsents_maxnumtokens_if_not_considering_stop_words.append((0, 0))
                    continue
                list_of_sentences = instance.fields['tokens'].field_list
                num_sents = len(list_of_sentences)
                max_num_tokens = 0
                for sentence_as_text_field in list_of_sentences:
                    list_of_tokens = sentence_as_text_field.tokens
                    if len(list_of_tokens) > max_num_tokens:
                        max_num_tokens = len(list_of_tokens)
                numsents_maxnumtokens.append((num_sents, max_num_tokens))
                allennlp_formatted_reader._word_tokenizer._word_filter = stop_word_filter
                instance = allennlp_formatted_reader.text_to_instance(tokens=text_field,
                                                                      category='placeholder')
                if instance is None:
                    # reader will skip this, so its tuple is (0, 0)
                    numsents_maxnumtokens_if_not_considering_stop_words.append((0, 0))
                    continue
                list_of_sentences = instance.fields['tokens'].field_list
                num_sents = len(list_of_sentences)
                max_num_tokens = 0
                for sentence_as_text_field in list_of_sentences:
                    list_of_tokens = sentence_as_text_field.tokens
                    if len(list_of_tokens) > max_num_tokens:
                        max_num_tokens = len(list_of_tokens)
                numsents_maxnumtokens_if_not_considering_stop_words.append((num_sents, max_num_tokens))
    assert len(numsents_maxnumtokens) == num_instances_passed
    assert len(numsents_maxnumtokens_if_not_considering_stop_words) == num_instances_passed
    temp_full_filename = data_filename[:data_filename.rfind('.')] + "_temp.tsv"
    old_f = open(data_filename, 'r')
    first_line = True
    instance_counter = 0
    with open(temp_full_filename, 'w') as f:
        for line in old_f:
            if first_line:
                old_line = line[:line.rfind('\n')]
                new_line = (old_line + '\t' + 'num_sentences_post_stopword_removal' + '\t' +
                            'max_num_tokens_in_sentence_post_stopword_removal' + '\t' +
                            'num_sentences' + '\t' + 'max_num_tokens_in_sentence' + '\n')
                f.write(new_line)
                first_line = False
            else:
                if line.strip() == '':
                    continue
                str_num_sents_no_stopwords = str(
                    numsents_maxnumtokens_if_not_considering_stop_words[instance_counter][0])
                str_max_num_tokens_no_stopwords = str(
                    numsents_maxnumtokens_if_not_considering_stop_words[instance_counter][1])
                str_num_sents = str(numsents_maxnumtokens[instance_counter][0])
                str_max_num_tokens = str(numsents_maxnumtokens[instance_counter][1])
                old_line = line[:line.rfind('\n')]
                new_line = (old_line + '\t' + str_num_sents_no_stopwords + '\t' +
                            str_max_num_tokens_no_stopwords + '\t' + str_num_sents + '\t' +
                            str_max_num_tokens + '\n')
                f.write(new_line)
                instance_counter += 1
    old_f.close()
    if os.path.isfile(data_filename):
        os.remove(data_filename)
    os.rename(temp_full_filename, data_filename)
class InstanceLenGenerator:
    def __init__(self, allennlp_formatted_reader, filepaths):
        self.allennlp_formatted_reader = allennlp_formatted_reader
        self.filepaths = filepaths

    def __iter__(self):
        for filepath in self.filepaths:
            for instance in tqdm(self.allennlp_formatted_reader._read(file_path=filepath)):
                instance_as_text_field = instance.fields['tokens']
                yield len(instance_as_text_field.tokens)


allennlp_reader = TextCatReader(
    word_tokenizer=WordTokenizer(word_filter=PassThroughWordFilter()),
    segment_sentences=False)
len_generator = InstanceLenGenerator(allennlp_reader, filenames_to_use)
all_lengths = []
for length in iter(len_generator):
    all_lengths.append(length)
arr_of_lengths = np.array(all_lengths)
m = np.mean(arr_of_lengths)
sd = np.std(arr_of_lengths)
with open(write_to_filename, 'w') as f:
    f.write('Mean: ' + str(m) + '\n')
    f.write('SD: ' + str(sd) + '\n')
print("Done calculating word stats.")
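# Hypothetical helper, not part of the original script: reads the "Mean:" / "SD:"
# lines written above back into floats. The function name and its stats_filename
# parameter are assumptions for illustration only.
def read_word_stats(stats_filename):
    stats = {}
    with open(stats_filename, 'r') as f:
        for line in f:
            if ':' in line:
                key, value = line.split(':', 1)
                stats[key.strip()] = float(value.strip())
    return stats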