def make_character_datasets_and_id_maps(self, sent_sets, id_maps, text_or_pickle, add_word_end_marker=True, add_word_start_marker=False, add_sentence_end_marker=False, add_sentence_start_marker=False):
    """Build character-level datasets and ID maps from word-level ones, then save them.

    For each (sentence set, id map) pair, a fresh ``SubwordMap`` is built from
    ``self.special_list``, the word-level id map is converted to a character-level
    one, the sentences are re-encoded as character sequences, and the result is
    sorted via ``self.subword_sort``. Everything is written out with
    ``self.save_sets`` to ``self.data_out_char_level``.

    Args:
        sent_sets: Sequence of sentence sets (one per split), parallel to ``id_maps``.
        id_maps: Sequence of word-level ID maps, one per sentence set.
        text_or_pickle: Output format selector forwarded to ``self.save_sets``.
        add_word_end_marker: Forwarded to ``self.convert_sents_to_subwords``.
        add_word_start_marker: Forwarded to ``self.convert_sents_to_subwords``.
        add_sentence_end_marker: Accepted for interface compatibility; not used
            in this method's body — TODO confirm whether it should be forwarded.
        add_sentence_start_marker: Accepted for interface compatibility; not used
            in this method's body — TODO confirm whether it should be forwarded.
    """
    print('PennTreeBankParser: Making character level datasets')
    char_sent_sets = []
    char_id_maps = []
    # Iterate the parallel sequences directly instead of indexing by position.
    for sents, id_map in zip(sent_sets, id_maps):
        char_map = SubwordMap(self.special_list)
        char_id_map = char_map.convert_to_characters(id_map)
        char_id_maps.append(char_id_map)
        char_sents = self.convert_sents_to_subwords(
            sents, id_map, char_id_map, char_map,
            add_word_end_marker, add_word_start_marker)
        self.subword_sort(char_sents, char_id_map)
        char_sent_sets.append(char_sents)
        # Debug/inspection output: show the first 20 IDs of each new map.
        char_id_map.print_IDs_range(range(20))
    self.save_sets(char_sent_sets, char_id_maps, self.data_out_char_level, text_or_pickle)
def make_morphological_datasets_and_id_maps(self, sent_sets, id_maps, text_or_pickle, add_word_end_marker=True, add_word_start_marker=False, add_sentence_end_marker=False, add_sentence_start_marker=False, morph_train_params=None):
    """Build morpheme-level datasets and ID maps from word-level ones, then save them.

    A single ``SubwordMap`` is trained as a morphological parser on the FIRST
    id map (presumably the training split — verify against callers), then each
    word-level id map is converted to a morpheme-level one, sentences are
    re-encoded, sorted via ``self.subword_sort``, and written out with
    ``self.save_sets`` to ``self.data_out_morph_level``.

    Args:
        sent_sets: Sequence of sentence sets (one per split), parallel to ``id_maps``.
        id_maps: Sequence of word-level ID maps, one per sentence set.
        text_or_pickle: Output format selector forwarded to ``self.save_sets``.
        add_word_end_marker: Forwarded to ``self.convert_sents_to_subwords``.
        add_word_start_marker: Forwarded to ``self.convert_sents_to_subwords``.
        add_sentence_end_marker: Accepted for interface compatibility; not used
            in this method's body — TODO confirm whether it should be forwarded.
        add_sentence_start_marker: Accepted for interface compatibility; not used
            in this method's body — TODO confirm whether it should be forwarded.
        morph_train_params: Parameters for ``convert_to_morphemes``; defaults to
            ``{'count_func': 'log'}``. A ``None`` sentinel is used instead of a
            mutable default argument, which would be shared across calls.
    """
    # Avoid the mutable-default-argument pitfall: build the default per call.
    if morph_train_params is None:
        morph_train_params = {'count_func': 'log'}
    print('PennTreeBankParser: Making morphological level datasets')
    morph_sent_sets = []
    morph_id_maps = []
    morph_map = SubwordMap(self.special_list)
    # Train the morphological parser once, on the first id map only.
    morph_map.train_morph_parser(id_maps[0])
    # Iterate the parallel sequences directly instead of indexing by position.
    for sents, id_map in zip(sent_sets, id_maps):
        morph_id_map = morph_map.convert_to_morphemes(id_map, train_params=morph_train_params)
        morph_id_maps.append(morph_id_map)
        morph_sents = self.convert_sents_to_subwords(
            sents, id_map, morph_id_map, morph_map,
            add_word_end_marker, add_word_start_marker)
        self.subword_sort(morph_sents, morph_id_map)
        morph_sent_sets.append(morph_sents)
    self.save_sets(morph_sent_sets, morph_id_maps, self.data_out_morph_level, text_or_pickle)