Example #1
import os

def parsed_files_generator(path_to_dir_with_preprocessed_projects,
                           train_test_valid, percent, start_from, dao):
    """Yield files from the chosen split directory that fall into the
    requested fraction, skipping hidden files and projects the dao has
    already processed."""
    for file in os.listdir(
            os.path.join(path_to_dir_with_preprocessed_projects,
                         train_test_valid)):
        # Skip hidden files and projects already in the processed cache.
        if file.startswith(".") or get_project_name(
                file) in dao.processed_projects_cache:
            continue
        if included_in_fraction(file, percent, start_from):
            yield file
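
A hypothetical way to drive this generator (the directory layout and the dao object below are assumptions for illustration; get_project_name and included_in_fraction are helpers from the project itself):

class _FakeDao:
    # Stand-in for the real dao; only the attribute read above is needed.
    processed_projects_cache = set()

for parsed_file in parsed_files_generator("/data/preprocessed", "train",
                                          percent=50.0, start_from=0.0,
                                          dao=_FakeDao()):
    print(parsed_file)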
Example #2
    def __init__(self, path, text_field, label_field, **kwargs):
        """Create an IMDB dataset instance given a path and fields.

        Arguments:
            path: Path to the dataset's highest level directory
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        threshold = kwargs.pop("threshold", 0.0)
        context_len = kwargs.pop("context_len", 0)
        data_params = kwargs.pop("data", None)

        path_to_ignored_projects = os.path.join(
            path, '..', '..', '..',
            f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
        logger.info(
            f"Loading ignored projects from {path_to_ignored_projects} ...")
        ignored_projects_set = set(read_list(path_to_ignored_projects))

        fields = [('text', text_field), ('label', label_field)]
        examples = []

        for c_filename_before, c_filename_after, l_filename in file_mapper(
                path, ContextsDataset._get_pair,
                lambda fi: fi.endswith('label')):
            if not included_in_fraction(os.path.basename(l_filename),
                                        data_params.percent,
                                        data_params.start_from):
                continue

            proj_name = re.sub(rf"\.{ContextsDataset.LABEL_FILE_EXT}$", "",
                               get_dir_and_file(l_filename))
            if proj_name in ignored_projects_set:
                continue

            c_file_before = None
            c_file_after = None
            l_file = None
            try:
                c_file_before = open(c_filename_before, 'r')
                c_file_after = open(c_filename_after, 'r')
                l_file = open(l_filename, 'r')
                for context_before, context_after, level in zip(
                        c_file_before, c_file_after, l_file):
                    level = level.rstrip('\n')
                    if level:
                        context_for_prediction = ContextsDataset._get_context_for_prediction(
                            context_before, context_after, context_len,
                            data_params.backwards)
                        example = data.Example.fromlist(
                            [context_for_prediction, level], fields)
                        examples.append(example)

            except FileNotFoundError:
                project_name = c_filename_before[:-len(ContextsDataset.
                                                       FW_CONTEXTS_FILE_EXT)]
                logger.error(f"Project context not loaded: {project_name}")
                continue
            finally:
                if c_file_before is not None:
                    c_file_before.close()
                if c_file_after is not None:
                    c_file_after.close()
                if l_file is not None:
                    l_file.close()

        if not examples:
            raise ValueError(
                f"Examples list is empty. (percent={data_params.percent}, start from={data_params.start_from})"
            )

        random.shuffle(examples)
        logger.debug(
            f"Number of examples gathered from {path}: {len(examples)} ")
        super(ContextsDataset, self).__init__(examples, fields, **kwargs)
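
The manual open/close bookkeeping in the try/finally above can also be written with context managers; here is a sketch of the equivalent per-project block (all names as in the example above), which closes the three files automatically:

            # Sketch: same per-project file handling with `with`, so the
            # try/finally and the None sentinels are no longer needed.
            try:
                with open(c_filename_before, 'r') as c_file_before, \
                     open(c_filename_after, 'r') as c_file_after, \
                     open(l_filename, 'r') as l_file:
                    for context_before, context_after, level in zip(
                            c_file_before, c_file_after, l_file):
                        level = level.rstrip('\n')
                        if not level:
                            continue
                        context_for_prediction = ContextsDataset._get_context_for_prediction(
                            context_before, context_after, context_len,
                            data_params.backwards)
                        examples.append(data.Example.fromlist(
                            [context_for_prediction, level], fields))
            except FileNotFoundError:
                project_name = c_filename_before[:-len(
                    ContextsDataset.FW_CONTEXTS_FILE_EXT)]
                logger.error(f"Project context not loaded: {project_name}")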
Example #3
    def test_include_to_df_invalid_filename(self):
        with self.assertRaises(ValueError):
            included_in_fraction('file', 0.1, 99.9)
Example #4
    def test_include_to_df_smoke_true(self):
        result = included_in_fraction('120_file', 50.0, 0.0)

        self.assertTrue(result)
Example #5
    def test_include_to_df_invalid_percent(self):
        with self.assertRaises(ValueError):
            included_in_fraction('30_file', 101, 99.9)
Example #6
    def test_include_to_df_invalid_start_from(self):
        with self.assertRaises(ValueError):
            included_in_fraction('file', 0.1, 150)
Example #7
    def test_include_to_df_zero_percent(self):
        with self.assertRaises(ValueError):
            included_in_fraction('990_file', 0.0, 99.0)
Example #8
    def test_include_to_df_999(self):
        result = included_in_fraction('999_file', 0.1, 99.9)

        self.assertTrue(result)
Example #9
    def test_include_to_df_zero_chunk_false(self):
        result = included_in_fraction('0_file', 0.1, 0.1)

        self.assertFalse(result)
Example #10
    def test_include_to_df_zero_chunk_true(self):
        result = included_in_fraction('0_file', 0.1, 0.0)

        self.assertTrue(result)
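
Taken together, the tests pin down the contract of included_in_fraction: the filename must start with a numeric chunk id, percent must lie in (0, 100], start_from in [0, 100), and a file is included when its chunk id divided by 10 falls in the half-open window [start_from, start_from + percent). A minimal sketch consistent with these tests (a hypothetical reconstruction; the real fractions_manager implementation may differ):

import re

def included_in_fraction(filename, percent, start_from):
    # Hypothetical reconstruction inferred from the tests above.
    if not 0.0 < percent <= 100.0:
        raise ValueError(f"percent must be in (0, 100], got {percent}")
    if not 0.0 <= start_from < 100.0:
        raise ValueError(f"start_from must be in [0, 100), got {start_from}")
    match = re.match(r"(\d+)_", filename)
    if not match:
        raise ValueError(f"Filename has no numeric chunk prefix: {filename}")
    # Chunk ids 0..999 appear to map onto percentiles 0.0..99.9.
    chunk = int(match.group(1)) / 10.0
    return start_from <= chunk < start_from + percent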
Example #11
def file_generator(full_src_dir, extension, percent, start_from):
    """Return a generator over files under full_src_dir that have the given
    extension and fall into the requested fraction."""
    return file_mapper(
        full_src_dir,
        lambda l: l,
        lambda fi: fi.endswith(extension)
        and fractions_manager.included_in_fraction(fi, percent, start_from))
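
Both Example #2 and Example #11 rely on file_mapper; from those call sites it appears to walk a directory tree, filter paths with a predicate, and apply a transform to each match. A hypothetical minimal version consistent with that usage (the real helper may differ, e.g. in traversal order or path handling):

import os

def file_mapper(root_dir, transform, predicate):
    # Walk the tree, keep paths the predicate accepts, yield them transformed.
    for dirpath, _, filenames in os.walk(root_dir):
        for filename in filenames:
            full_path = os.path.join(dirpath, filename)
            if predicate(full_path):
                yield transform(full_path)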