Example 1
def create_cases(
    case_creators, case_creators_picker, filename: str
) -> Tuple[List[Optional[Tuple[List[str], List[str], bool]]], str]:
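    """Parse `filename` line by line: lines containing a log-statement
    placeholder are turned into cases, all other lines are recorded as None.
    Returns the list of cases together with the file's relative path."""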
    rel_path = get_dir_and_file(filename)
    with open(filename, 'r') as f:
        res = []
        for line in f:
            list_of_words = line.rstrip('\n').split(" ")
            list_of_words = remove_some_log_statements(list_of_words)
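            # Only lines that still contain a log-statement placeholder yield a case.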
            if placeholders['log_statement'] in list_of_words:
                case_creator = case_creators_picker(case_creators)
                res.append(case_creator.create_from(list_of_words))
            else:
                res.append(None)
    return res, rel_path
Example 2
def calc_logged_stats(path_to_label_file):
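    """Tally label lines in a file: '1'/'0' lines count as WITH_LOGGING,
    empty lines as NO_LOGGING. Returns (stats, project name) or None
    when the file is empty."""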
    stats = defaultdict(int)
    with open(path_to_label_file, 'r') as f:
        for line in f:
            stripped_line = line.rstrip('\n')
            if stripped_line in ['1', '0']:
                stats[WITH_LOGGING] += 1
            elif stripped_line == '':
                stats[NO_LOGGING] += 1
            else:
                raise AssertionError(
                    f"Invalid line: {stripped_line} in file: {path_to_label_file}"
                )
    if not stats:
        logger.warning(
            f"Label file {path_to_label_file} is empty. Skipping...")
        return None
    else:
        return stats, re.sub(rf"\.{ContextsDataset.LABEL_FILE_EXT}$", "",
                             get_dir_and_file(path_to_label_file))
Example 3
    def __init__(self, path, text_field, label_field, **kwargs):
        """Create an IMDB dataset instance given a path and fields.

        Arguments:
            path: Path to the dataset's highest level directory
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        threshold = kwargs.pop("threshold", 0.0)
        context_len = kwargs.pop("context_len", 0)
        data_params = kwargs.pop("data", None)

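        # The ignored-projects file lives three directory levels above `path`.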
        path_to_ignored_projects = os.path.join(
            path, '..', '..', '..',
            f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
        logger.info(
            f"Loading ignored projects from {path_to_ignored_projects} ...")
        ignored_projects_set = set(read_list(path_to_ignored_projects))

        fields = [('text', text_field), ('label', label_field)]
        examples = []

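        # Iterate over (before-context, after-context, label) file triples.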
        for c_filename_before, c_filename_after, l_filename in file_mapper(
                path, ContextsDataset._get_pair,
                lambda fi: fi.endswith('label')):
            if not included_in_fraction(os.path.basename(l_filename),
                                        data_params.percent,
                                        data_params.start_from):
                continue

            proj_name = re.sub(rf"\.{ContextsDataset.LABEL_FILE_EXT}$", "",
                               get_dir_and_file(l_filename))
            if proj_name in ignored_projects_set:
                continue

            c_file_before = None
            c_file_after = None
            l_file = None
            try:
                c_file_before = open(c_filename_before, 'r')
                c_file_after = open(c_filename_after, 'r')
                l_file = open(l_filename, 'r')
                for context_before, context_after, level in zip(
                        c_file_before, c_file_after, l_file):
                    level = level.rstrip('\n')
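                    # Positions with an empty label line produce no example.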
                    if level:
                        context_for_prediction = ContextsDataset._get_context_for_prediction(
                            context_before, context_after, context_len,
                            data_params.backwards)
                        example = data.Example.fromlist(
                            [context_for_prediction, level], fields)
                        examples.append(example)

            except FileNotFoundError:
                project_name = c_filename_before[
                    :-len(ContextsDataset.FW_CONTEXTS_FILE_EXT)]
                logger.error(f"Project context not loaded: {project_name}")
                continue
            finally:
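                # Close whichever files were successfully opened.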
                if c_file_before is not None:
                    c_file_before.close()
                if c_file_after is not None:
                    c_file_after.close()
                if l_file is not None:
                    l_file.close()

        if not examples:
            raise ValueError(
                f"Examples list is empty. (percent={data_params.percent}, start from={data_params.start_from})"
            )

        random.shuffle(examples)
        logger.debug(
            f"Number of examples gathered from {path}: {len(examples)}")
        super(ContextsDataset, self).__init__(examples, fields, **kwargs)