class PretrainingIO(NetworkIO):
    """File-location provider for the pretraining stage.

    Sample files are looked up recursively under the ``Pretrain`` folder
    located in the project data root.
    """

    __data_folder = join(io_utils.get_data_root(), u"Pretrain")

    def __init__(self, model_to_pretrain_name):
        """Forward ``model_to_pretrain_name`` to the ``NetworkIO`` base."""
        super(PretrainingIO, self).__init__(model_to_pretrain_name)

    def get_samples_filepaths(self, limit=3, shuffle_files=True):
        """Return the first ``limit`` sample file paths.

        The full listing is sorted; when ``shuffle_files`` is true it is
        shuffled in place before the ``[:limit]`` slice is taken.
        """
        candidates = self.__get_all_subfiles(self.__data_folder)
        if shuffle_files:
            shuffle(candidates)
        return candidates[:limit]

    @staticmethod
    def __get_all_subfiles(data_folder):
        """Return a sorted list of every file path found under ``data_folder``."""
        return sorted(
            join(dirpath, filename)
            for dirpath, _, filenames in walk(data_folder)
            for filename in filenames)

    def get_word_embedding_filepath(self):
        """Return the path reported by ``io_utils`` for the news embedding."""
        return io_utils.get_rusvectores_news_embedding_filepath()
class RaNLPConfTaskRuSentRelIO(BaseAnswersIO):
    """File-location provider for the ``ranlp/rsr`` data layout.

    Keeps a cross-validation index (starting at 0) that selects which
    train/test pair ``get_train_test_paths`` returns.
    """

    # Number of cross-validation folds passed to ``items_to_cv_pairs``.
    __cv_count = 3

    # All locations below are resolved relative to the project data root.
    __etalon_root = join(io_utils.get_data_root(), u"ranlp/rsr/opinions")
    __src_file = join(io_utils.get_data_root(), u"ranlp/sources/rsr.txt")
    __answers_root_template = join(io_utils.get_data_root(), u"ranlp/rsr/answers/{}")
    __splitted_data_folder = join(io_utils.get_data_root(), u"ranlp/rsr/splitted")

    def __init__(self, model_to_pretrain_name):
        """Initialise the base class with a per-model answers root."""
        answers_root = self.__answers_root_template.format(model_to_pretrain_name)
        super(RaNLPConfTaskRuSentRelIO, self).__init__(
            answers_root=answers_root,
            model_name=model_to_pretrain_name)
        self.__cv_index = 0

    @property
    def CVIndex(self):
        """Current cross-validation index."""
        return self.__cv_index

    @property
    def CVCount(self):
        """Total number of cross-validation folds."""
        return self.__cv_count

    @property
    def SplittedDataFolder(self):
        """Folder holding the splitted data; created on access if missing."""
        folder = self.__splitted_data_folder
        if not path.exists(folder):
            makedirs(folder)
        return folder

    @property
    def SourceFile(self):
        """Path of the source file (``ranlp/sources/rsr.txt``)."""
        return self.__src_file

    def inc_cv_index(self):
        """Advance the cross-validation index by one."""
        self.__cv_index += 1

    def get_train_test_paths(self):
        """Return the entry at the current CV index of the generated pairs.

        The pairs are produced by ``items_to_cv_pairs`` over the sorted
        listing of the splitted-data folder, without shuffling.
        """
        listing = RaNLPConfTaskRuSentRelIO.__get_all_subfiles(
            RaNLPConfTaskRuSentRelIO.__splitted_data_folder)
        # The helper already returns a sorted list; the extra sort is kept
        # so the call sequence stays the same as before.
        ordered_paths = sorted(listing)
        cv_pairs = list(items_to_cv_pairs(cv=self.__cv_count,
                                          items_list=ordered_paths,
                                          shuffle=False))
        return cv_pairs[self.__cv_index]

    @staticmethod
    def __get_all_subfiles(data_folder):
        """Return a sorted list of every file path found under ``data_folder``."""
        return sorted(
            join(dirpath, filename)
            for dirpath, _, filenames in walk(data_folder)
            for filename in filenames)

    def get_word_embedding_filepath(self):
        """Return the path reported by ``io_utils`` for the news embedding."""
        return io_utils.get_rusvectores_news_embedding_filepath()

    def get_etalon_root(self):
        """Return the etalon opinions folder, creating it if it is missing."""
        # TODO: duplicated directory-creation logic; reuse a shared helper.
        etalon_dir = self.__etalon_root
        if not path.exists(etalon_dir):
            makedirs(etalon_dir)
        return etalon_dir
def get_states_filepath():
    """Return the path of ``states.lss`` inside the project data root."""
    data_root = io_utils.get_data_root()
    return os.path.join(data_root, u'states.lss')
def get_capitals_filepath():
    """Return the path of ``capitals.lss`` inside the project data root."""
    data_root = io_utils.get_data_root()
    return os.path.join(data_root, u'capitals.lss')
class RaNLPConfTaskRuSentRelWithDevIO(BaseAnswersIO):
    """File-location provider combining the RuSentRel and Dev IO objects.

    Wraps one ``RaNLPConfTaskRuSentRelIO`` and one ``RaNLPConfTaskDevIO``
    and keeps their cross-validation indices advancing together with its own.
    """

    # Both locations are resolved relative to the project data root.
    __etalon_root = join(io_utils.get_data_root(), u"ranlp/rsr_dev/opinions")
    __answers_root_template = join(io_utils.get_data_root(), u"ranlp/rsr_dev/answers/{}")

    def __init__(self, model_to_pretrain_name):
        """Initialise the base class and build both nested IO objects."""
        answers_root = self.__answers_root_template.format(model_to_pretrain_name)
        super(RaNLPConfTaskRuSentRelWithDevIO, self).__init__(
            answers_root=answers_root,
            model_name=model_to_pretrain_name)
        self.__cv_index = 0
        self.__io_rsr = RaNLPConfTaskRuSentRelIO(model_to_pretrain_name)
        self.__io_dev = RaNLPConfTaskDevIO(model_to_pretrain_name)

    @property
    def RuSentRelIO(self):
        """The nested ``RaNLPConfTaskRuSentRelIO`` instance."""
        return self.__io_rsr

    @property
    def DevIO(self):
        """The nested ``RaNLPConfTaskDevIO`` instance."""
        return self.__io_dev

    @property
    def CVIndex(self):
        """Current cross-validation index of this object."""
        return self.__cv_index

    @property
    def CVCount(self):
        """Fold count, taken from the nested RuSentRel IO."""
        return self.__io_rsr.CVCount

    @property
    def SourceFile(self):
        """Always ``None`` for the combined IO."""
        return None

    @property
    def SourceDataFolder(self):
        """Always ``None`` for the combined IO."""
        return None

    @property
    def SplittedDataFolder(self):
        """Always ``None`` for the combined IO."""
        return None

    def inc_cv_index(self):
        """Advance this object's CV index and those of both nested IOs."""
        self.__cv_index += 1
        for nested_io in (self.__io_rsr, self.__io_dev):
            nested_io.inc_cv_index()

    def get_train_test_paths(self):
        """Return ``(train, test)`` built from both nested IO objects.

        ``train`` is the RuSentRel train part followed by the Dev test part
        and then the Dev train part; ``test`` is the RuSentRel test part only.
        """
        rsr_train_part, rsr_test_part = self.__io_rsr.get_train_test_paths()
        dev_train_part, dev_test_part = self.__io_dev.get_train_test_paths()
        merged_train = rsr_train_part + dev_test_part + dev_train_part
        return merged_train, rsr_test_part

    def get_word_embedding_filepath(self):
        """Return the path reported by ``io_utils`` for the news embedding."""
        return io_utils.get_rusvectores_news_embedding_filepath()

    def get_etalon_root(self):
        """Return the etalon opinions folder, creating it if it is missing."""
        # TODO: duplicated directory-creation logic; reuse a shared helper.
        etalon_dir = self.__etalon_root
        if not path.exists(etalon_dir):
            makedirs(etalon_dir)
        return etalon_dir

    def iter_test_answers(self):
        """Not implemented here: does nothing and returns ``None``."""
        return None
def get_rusentrel_stats_filepath():
    """Return the path of ``rusentrel_docs_stat.txt`` inside the data root."""
    data_root = io_utils.get_data_root()
    return os.path.join(data_root, u"rusentrel_docs_stat.txt")
def get_rusvectores_news_embedding_filepath():
    """Return the path of ``w2v/news_rusvectores2.bin.gz`` inside the data root."""
    data_root = get_data_root()
    return path.join(data_root, u"w2v/news_rusvectores2.bin.gz")