Ejemplo n.º 1
0
class PretrainingIO(NetworkIO):
    """Network IO that serves sample files from the "Pretrain" data folder."""

    # Folder walked for sample files; computed once when the class is created.
    __data_folder = join(io_utils.get_data_root(), u"Pretrain")

    def __init__(self, model_to_pretrain_name):
        """Pass ``model_to_pretrain_name`` on to the ``NetworkIO`` initializer."""
        super(PretrainingIO, self).__init__(model_to_pretrain_name)

    def get_samples_filepaths(self, limit=3, shuffle_files=True):
        """Return at most ``limit`` file paths found under the data folder.

        The paths are gathered in sorted order. When ``shuffle_files`` is
        true they are shuffled in place before the first ``limit`` entries
        are sliced off.
        """
        candidates = self.__get_all_subfiles(self.__data_folder)
        if shuffle_files:
            shuffle(candidates)
        return candidates[:limit]

    @staticmethod
    def __get_all_subfiles(data_folder):
        """Return the sorted paths of every file found below ``data_folder``."""
        collected = []
        for dirpath, _, filenames in walk(data_folder):
            collected.extend(join(dirpath, name) for name in filenames)
        return sorted(collected)

    def get_word_embedding_filepath(self):
        """Return the path given by the ``io_utils`` rusvectores news helper."""
        embedding_filepath = io_utils.get_rusvectores_news_embedding_filepath()
        return embedding_filepath
Ejemplo n.º 2
0
class RaNLPConfTaskRuSentRelIO(BaseAnswersIO):
    """Answers IO for the "ranlp/rsr" data layout.

    Exposes the source, etalon and splitted-data paths and picks one
    train/test pair according to an internal cross-validation index.
    """

    # Number of folds handed to items_to_cv_pairs.
    __cv_count = 3
    __etalon_root = join(io_utils.get_data_root(), u"ranlp/rsr/opinions")
    __src_file = join(io_utils.get_data_root(), u"ranlp/sources/rsr.txt")
    # The "{}" placeholder is filled with the model name in __init__.
    __answers_root_template = join(io_utils.get_data_root(),
                                   u"ranlp/rsr/answers/{}")
    __splitted_data_folder = join(io_utils.get_data_root(),
                                  u"ranlp/rsr/splitted")

    def __init__(self, model_to_pretrain_name):
        """Initialize the base IO with a per-model answers root.

        The cross-validation index starts at 0.
        """
        answers_root = self.__answers_root_template.format(
            model_to_pretrain_name)
        super(RaNLPConfTaskRuSentRelIO, self).__init__(
            answers_root=answers_root,
            model_name=model_to_pretrain_name)
        self.__cv_index = 0

    @property
    def CVIndex(self):
        """Current cross-validation index."""
        return self.__cv_index

    @property
    def CVCount(self):
        """Number of cross-validation folds."""
        return self.__cv_count

    @property
    def SplittedDataFolder(self):
        """Path of the splitted-data folder; created first when missing."""
        folder = self.__splitted_data_folder
        if not path.exists(folder):
            makedirs(folder)
        return folder

    @property
    def SourceFile(self):
        """Path of the source text file."""
        return self.__src_file

    def inc_cv_index(self):
        """Advance the cross-validation index by one."""
        self.__cv_index += 1

    def get_train_test_paths(self):
        """Return the train/test pair selected by the current CV index.

        All files below the splitted-data folder are listed in sorted order
        and split, without shuffling, by ``items_to_cv_pairs``.
        """
        owner = RaNLPConfTaskRuSentRelIO
        all_filepaths = sorted(
            owner.__get_all_subfiles(owner.__splitted_data_folder))

        pairs_iter = items_to_cv_pairs(cv=self.__cv_count,
                                       items_list=all_filepaths,
                                       shuffle=False)
        train_test_pairs = list(pairs_iter)
        return train_test_pairs[self.__cv_index]

    @staticmethod
    def __get_all_subfiles(data_folder):
        """Return the sorted paths of every file found below ``data_folder``."""
        collected = []
        for dirpath, _, filenames in walk(data_folder):
            collected.extend(join(dirpath, name) for name in filenames)
        return sorted(collected)

    def get_word_embedding_filepath(self):
        """Return the path given by the ``io_utils`` rusvectores news helper."""
        embedding_filepath = io_utils.get_rusvectores_news_embedding_filepath()
        return embedding_filepath

    def get_etalon_root(self):
        """Return the etalon opinions folder, creating it when missing."""
        # TODO: this create-if-missing logic is duplicated; share one helper.
        etalon_dir = self.__etalon_root
        if not path.exists(etalon_dir):
            makedirs(etalon_dir)
        return etalon_dir
 def get_states_filepath():
     """Return the path of ``states.lss`` inside the data root."""
     data_root = io_utils.get_data_root()
     return os.path.join(data_root, u'states.lss')
 def get_capitals_filepath():
     """Return the path of ``capitals.lss`` inside the data root."""
     data_root = io_utils.get_data_root()
     return os.path.join(data_root, u'capitals.lss')
Ejemplo n.º 5
0
class RaNLPConfTaskRuSentRelWithDevIO(BaseAnswersIO):
    """Answers IO for "ranlp/rsr_dev" that combines two wrapped IO objects.

    Holds a ``RaNLPConfTaskRuSentRelIO`` and a ``RaNLPConfTaskDevIO`` and
    keeps their cross-validation indices advancing together with its own.
    """

    __etalon_root = join(io_utils.get_data_root(), u"ranlp/rsr_dev/opinions")
    # The "{}" placeholder is filled with the model name in __init__.
    __answers_root_template = join(io_utils.get_data_root(),
                                   u"ranlp/rsr_dev/answers/{}")

    def __init__(self, model_to_pretrain_name):
        """Initialize the base IO and build both wrapped IO objects.

        The cross-validation index starts at 0.
        """
        answers_root = self.__answers_root_template.format(
            model_to_pretrain_name)
        super(RaNLPConfTaskRuSentRelWithDevIO, self).__init__(
            answers_root=answers_root,
            model_name=model_to_pretrain_name)
        self.__cv_index = 0
        self.__io_rsr = RaNLPConfTaskRuSentRelIO(model_to_pretrain_name)
        self.__io_dev = RaNLPConfTaskDevIO(model_to_pretrain_name)

    @property
    def RuSentRelIO(self):
        """The wrapped ``RaNLPConfTaskRuSentRelIO`` instance."""
        return self.__io_rsr

    @property
    def DevIO(self):
        """The wrapped ``RaNLPConfTaskDevIO`` instance."""
        return self.__io_dev

    @property
    def CVIndex(self):
        """Current cross-validation index of this object."""
        return self.__cv_index

    @property
    def CVCount(self):
        """Fold count, taken from the wrapped RuSentRel IO."""
        return self.__io_rsr.CVCount

    @property
    def SourceFile(self):
        """Always ``None`` for this combined IO."""
        return None

    @property
    def SourceDataFolder(self):
        """Always ``None`` for this combined IO."""
        return None

    @property
    def SplittedDataFolder(self):
        """Always ``None`` for this combined IO."""
        return None

    def inc_cv_index(self):
        """Advance this object's CV index and both wrapped IO indices."""
        self.__cv_index += 1
        self.__io_rsr.inc_cv_index()
        self.__io_dev.inc_cv_index()

    def get_train_test_paths(self):
        """Return ``(train, test)`` built from both wrapped IO objects.

        Both dev parts (test, then train) are appended to the RuSentRel
        train part; only the RuSentRel test part is returned as test.
        """
        rsr_train, rsr_test = self.__io_rsr.get_train_test_paths()
        dev_train, dev_test = self.__io_dev.get_train_test_paths()
        combined_train = rsr_train + dev_test + dev_train
        return combined_train, rsr_test

    def get_word_embedding_filepath(self):
        """Return the path given by the ``io_utils`` rusvectores news helper."""
        embedding_filepath = io_utils.get_rusvectores_news_embedding_filepath()
        return embedding_filepath

    def get_etalon_root(self):
        """Return the etalon opinions folder, creating it when missing."""
        # TODO: this create-if-missing logic is duplicated; share one helper.
        etalon_dir = self.__etalon_root
        if not path.exists(etalon_dir):
            makedirs(etalon_dir)
        return etalon_dir

    def iter_test_answers(self):
        """Placeholder: does nothing and returns ``None``."""
        return None
Ejemplo n.º 6
0
def get_rusentrel_stats_filepath():
    """Return the path of ``rusentrel_docs_stat.txt`` inside the data root."""
    data_root = io_utils.get_data_root()
    return os.path.join(data_root, u"rusentrel_docs_stat.txt")
Ejemplo n.º 7
0
def get_rusvectores_news_embedding_filepath():
    """Return the path of the rusvectores news embedding under the data root."""
    data_root = get_data_root()
    return path.join(data_root, u"w2v/news_rusvectores2.bin.gz")