Example #1
def preprocess_and_write(params):
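    """Preprocess all raw files of one project and pickle the results.

    `params` is a tuple of (src_dir, dest_dir, train_test_valid, project,
    preprocessing_param_dict). Parsed files are written as consecutive pickle
    records into a gzip-compressed `.part` file, which is renamed once the
    whole project has been processed; the relative paths of the processed
    files are written to a companion filenames file.
    """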
    from logrec.properties import REWRITE_PARSED_FILE

    src_dir, dest_dir, train_test_valid, project, preprocessing_param_dict = params
    full_dest_dir = os.path.join(dest_dir, train_test_valid)
    path_to_preprocessed_file = os.path.join(full_dest_dir,
                                             f'{project}.{EXTENSION}')
    os.makedirs(full_dest_dir, exist_ok=True)
    if not REWRITE_PARSED_FILE and os.path.exists(path_to_preprocessed_file):
        logger.warning(
            f"File {path_to_preprocessed_file} already exists! Doing nothing.")
        return
    dir_with_files_to_preprocess = os.path.join(src_dir, train_test_valid,
                                                project)
    if not os.path.exists(dir_with_files_to_preprocess):
        logger.error(f"Path {dir_with_files_to_preprocess} does not exist")
        exit(2)
    filenames = []
    with gzip.GzipFile(f'{path_to_preprocessed_file}.part', 'wb') as f:
        total_files = sum(
            file_mapper(dir_with_files_to_preprocess, lambda path: 1))
        logger.info(
            f"Preprocessing java files from {dir_with_files_to_preprocess}. Files to process: {total_files}"
        )
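        # The preprocessing parameters go first in the pickle stream,
        # followed by one record per preprocessed file.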
        pickle.dump(preprocessing_param_dict, f, pickle.HIGHEST_PROTOCOL)
        for ind, (lines_from_file, file_path) in enumerate(
                file_mapper(dir_with_files_to_preprocess, read_file_contents)):
            if (ind + 1) % 100 == 0:
                logger.info(
                    f"[{os.path.join(train_test_valid, project)}] Parsed {ind+1} out of {total_files} files ({(ind+1)/float(total_files)*100:.2f}%)"
                )
            parsed = apply_preprocessors(from_file(lines_from_file),
                                         pp_params["preprocessors"],
                                         {'interesting_context_words': []})
            pickle.dump(parsed, f, pickle.HIGHEST_PROTOCOL)
            filename = os.path.relpath(file_path,
                                       start=dir_with_files_to_preprocess)
            filenames.append(filename)

    with open(os.path.join(full_dest_dir, f'.{project}.{FILENAMES_EXTENSION}'),
              "w") as f:
        for filename in filenames:
            try:
                f.write(f"{filename}\n")
            except UnicodeEncodeError:
                f.write("<bad encoding>\n")
                logger.warning("Filename has bad encoding")

    # remove .part to show that all raw files in this project have been preprocessed
    os.rename(f'{path_to_preprocessed_file}.part', path_to_preprocessed_file)
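
A minimal sketch of how preprocess_and_write might be driven, assuming the params tuple layout above; the directory names, project list, parameter dict and the multiprocessing fan-out are illustrative assumptions, not taken from the original call site:

from multiprocessing import Pool

# Hypothetical inputs; real values depend on the dataset layout and pipeline config.
src_dir = '/data/raw_parsed'
dest_dir = '/data/preprocessed'
projects = ['projectA', 'projectB']
preprocessing_params = {}  # placeholder for the real preprocessing_param_dict

params_list = [(src_dir, dest_dir, 'train', project, preprocessing_params)
               for project in projects]
with Pool() as pool:
    pool.map(preprocess_and_write, params_list)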
Example #2
def run(dataset: str, repr: str, classifier: str):
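    """Build classification cases for `classifier` from the parsed `repr` of
    `dataset`: for every parsed.repr file, write forward-context,
    backward-context and label files under the destination directory.
    """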
    from logrec.classifier.context_datasets import ContextsDataset

    PrepConfig.assert_classification_config(repr)

    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, REPR_DIR, repr)
    dest_dir = os.path.join(path_to_dataset, CLASSIFICATION_DIR, classifier,
                            repr)
    logger.info(f"Writing to {dest_dir}")

    os.makedirs(os.path.join(dest_dir, TRAIN_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, TEST_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, VALID_DIR), exist_ok=True)

    total_files = sum(
        file_mapper(full_src_dir, lambda f: 1,
                    lambda fi: fi.endswith("parsed.repr")))
    count = 0

    cases_creator = get_cases_creator(classifier)
    for lines, rel_path in file_mapper(full_src_dir, cases_creator,
                                       lambda fi: fi.endswith("parsed.repr")):
        count += 1
        logger.info(f"Processing {count} out of {total_files}")
        forward_path = os.path.join(
            dest_dir,
            re.sub("parsed\\.repr", ContextsDataset.FW_CONTEXTS_FILE_EXT,
                   rel_path))
        backward_path = os.path.join(
            dest_dir,
            re.sub("parsed\\.repr", ContextsDataset.BW_CONTEXTS_FILE_EXT,
                   rel_path))
        label_path = os.path.join(
            dest_dir,
            re.sub("parsed\\.repr", ContextsDataset.LABEL_FILE_EXT, rel_path))
        with open(forward_path, 'w') as f, \
                open(backward_path, 'w') as b, \
                open(label_path, 'w') as l:
            for line in lines:
                if line:
                    l.write(f'{line[2]}\n')
                    f.write(f'{" ".join(line[0])}\n')
                    b.write(f'{" ".join(line[1])}\n')
                else:
                    l.write('\n')
                    f.write('\n')
                    b.write('\n')
Example #3
def create_df_gen(dir: str, percent: float, start_from: float, backwards: bool) \
        -> Generator[pandas.DataFrame, None, None]:
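    """Yield pandas DataFrames built from the lines of the files under `dir`
    that fall into the selected fraction, flushing a frame roughly every
    DATAFRAME_LINES_THRESHOLD lines (lines are reversed when `backwards` is
    True). Raises ValueError if no data was read at all.
    """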
    lines = []
    files_total = sum(
        f for f in file_mapper(dir,
                               include_to_df_tester(percent, start_from),
                               extension=None,
                               ignore_prefix="_"))

    DATAFRAME_LINES_THRESHOLD = 3000
    cur_file = 0
    at_least_one_frame_created = False
    for root, dirs, files in os.walk(dir):
        for file in files:
            with open(os.path.join(root, file), 'r') as f:
                if include_to_df(file, percent, start_from):
                    cur_file += 1
                    logger.debug(
                        f'Adding {os.path.join(root, file)} to dataframe [{cur_file} out of {files_total}]'
                    )
                    for line in f:
                        if backwards:
                            line = reverse_line(line)
                        lines.append(line)
                    if len(lines) > DATAFRAME_LINES_THRESHOLD:
                        logger.debug("Submitting dataFrame...")
                        yield pandas.DataFrame(lines)
                        lines = []
                        at_least_one_frame_created = True
    if lines:
        yield pandas.DataFrame(lines)
        at_least_one_frame_created = True
    if not at_least_one_frame_created:
        raise ValueError(f"No data available: {os.path.abspath(dir)}")
Example #4
def show_tests(path_to_test_set: str, model: SequentialRNN, text_field: Field,
               sample_test_runs_file: str, backwards: bool, n_predictions: int,
               n_samples: int) -> None:
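    """Run the model on up to `n_samples` labelled contexts from the test set,
    log the formatted predictions and write them to `sample_test_runs_file`.
    """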
    logger.info("================    Running tests ============")
    counter = 0
    text = ""
    stop_showing_examples = False
    for c_filename_before, c_filename_after, l_filename in file_mapper(
            path_to_test_set, ContextsDataset._get_pair,
            lambda fi: fi.endswith('label')):
        if stop_showing_examples:
            break
        c_file_before = None
        c_file_after = None
        l_file = None
        try:
            c_file_before = open(c_filename_before, 'r')
            c_file_after = open(c_filename_after, 'r')
            l_file = open(l_filename, 'r')
            for context_before, context_after, label in zip(
                    c_file_before, c_file_after, l_file):
                if label.rstrip('\n') == '':
                    continue

                if counter >= n_samples:
                    stop_showing_examples = True
                    break

                context_before = context_before.rstrip("\n")
                context_after = context_after.rstrip("\n")
                prepared_input = prepare_input(context_before, context_after,
                                               backwards)
                formatted_input = format_input(context_before, context_after,
                                               backwards)
                probs, labels = get_predictions(model, text_field,
                                                prepared_input, n_predictions)
                formatted_predictions = format_predictions(
                    probs, labels, LEVEL_LABEL, label.rstrip("\n"))
                logger.info(formatted_input + formatted_predictions)
                text += (formatted_input + formatted_predictions)
                counter += 1
        except FileNotFoundError:
            project_name = c_filename_before[:-len(ContextsDataset.
                                                   FW_CONTEXTS_FILE_EXT)]
            logger.error(f"Project context not loaded: {project_name}")
            continue
        finally:
            if c_file_before is not None:
                c_file_before.close()
            if c_file_after is not None:
                c_file_after.close()
            if l_file is not None:
                l_file.close()
    logger.info(f"Saving test output to {sample_test_runs_file}")
    with open(sample_test_runs_file, 'w') as f:
        f.write(text)
Example #5
def calc_stats(dest_dir, threshold):
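    """Aggregate per-project logging statistics and collect the projects whose
    fraction of logged cases does not exceed `threshold` percent.
    """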
    projects_to_ignore = []
    res_logged_stats = {}
    for logged_stats, proj_name in file_mapper(dest_dir, calc_logged_stats,
                                               ContextsDataset.LABEL_FILE_EXT):
        logged_fraction = float(logged_stats[WITH_LOGGING]) / (
            logged_stats[WITH_LOGGING] + logged_stats[NO_LOGGING])
        if logged_fraction <= threshold * 0.01:
            projects_to_ignore.append(proj_name)
        else:
            merge_dicts_(res_logged_stats, logged_stats)
    return projects_to_ignore, res_logged_stats
Example #6
    def __init__(self, path, text_field, label_field, **kwargs):
        """Create an IMDB dataset instance given a path and fields.

        Arguments:
            path: Path to the dataset's highest level directory
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        threshold = kwargs.pop("threshold", 0.0)
        context_len = kwargs.pop("context_len", 0)
        data_params = kwargs.pop("data", None)

        path_to_ignored_projects = os.path.join(
            path, '..', '..', '..',
            f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
        logger.info(
            f"Loading ignored projects from {path_to_ignored_projects} ...")
        ignored_projects_set = set(read_list(path_to_ignored_projects))

        fields = [('text', text_field), ('label', label_field)]
        examples = []

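        # Pair every label file with its forward/backward context files and
        # build a data.Example for each labelled line.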
        for c_filename_before, c_filename_after, l_filename in file_mapper(
                path, ContextsDataset._get_pair,
                lambda fi: fi.endswith('label')):
            if not included_in_fraction(os.path.basename(l_filename),
                                        data_params.percent,
                                        data_params.start_from):
                continue

            proj_name = re.sub(f"\\.{ContextsDataset.LABEL_FILE_EXT}$", "",
                               get_dir_and_file(l_filename))
            if proj_name in ignored_projects_set:
                continue

            c_file_before = None
            c_file_after = None
            l_file = None
            try:
                c_file_before = open(c_filename_before, 'r')
                c_file_after = open(c_filename_after, 'r')
                l_file = open(l_filename, 'r')
                for context_before, context_after, level in zip(
                        c_file_before, c_file_after, l_file):
                    level = level.rstrip('\n')
                    if level:
                        context_for_prediction = ContextsDataset._get_context_for_prediction(
                            context_before, context_after, context_len,
                            data_params.backwards)
                        example = data.Example.fromlist(
                            [context_for_prediction, level], fields)
                        examples.append(example)

            except FileNotFoundError:
                project_name = c_filename_before[:-len(ContextsDataset.
                                                       FW_CONTEXTS_FILE_EXT)]
                logger.error(f"Project context not loaded: {project_name}")
                continue
            finally:
                if c_file_before is not None:
                    c_file_before.close()
                if c_file_after is not None:
                    c_file_after.close()
                if l_file is not None:
                    l_file.close()

        if not examples:
            raise ValueError(
                f"Examples list is empty. (percent={data_params.percent}, start from={data_params.start_from})"
            )

        random.shuffle(examples)
        logger.debug(
            f"Number of examples gathered from {path}: {len(examples)} ")
        super(ContextsDataset, self).__init__(examples, fields, **kwargs)
Example #7
def run(full_src_dir, full_metadata_dir):
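    """Build the vocabulary for the preprocessed files in `full_src_dir`,
    resuming from previously dumped partial vocabs when possible and merging
    them in parallel with one VocabMerger per CPU core.
    """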
    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)

    if os.path.exists(os.path.join(full_metadata_dir, 'vocabsize')):
        logger.warning(
            f"File already exists: {os.path.join(full_metadata_dir, 'vocabsize')}. Doing nothing."
        )
        exit(0)

    logger.info(f"Reading files from: {os.path.abspath(full_src_dir)}")

    all_files = list(file_mapper(full_src_dir, lambda l: l, REPR_EXTENSION))
    if not all_files:
        logger.warning("No preprocessed files found.")
        exit(4)

    path_to_dump = os.path.join(full_metadata_dir, 'part_vocab')
    dumps_valid_file = os.path.join(path_to_dump, 'ready')

    if os.path.exists(dumps_valid_file):
        all_files = list(
            file_mapper(path_to_dump, lambda l: l, PARTVOCAB_EXT))
        task_list = []
        removed_files = []
        for file in all_files:
            # An underscore in the basename is a (not very robust) sign that
            # dumping of this backup file was interrupted before it completed.
            if '_' in os.path.basename(file):
                file, rm_files = finish_file_dumping(file)
                removed_files.extend(list(rm_files))
            if file not in removed_files:
                part_vocab = pickle.load(open(file, 'rb'))
                if not isinstance(part_vocab, PartialVocab):
                    raise TypeError(
                        f"Object {str(part_vocab)} must be a PartialVocab (version {part_vocab.VERSION})"
                    )
                task_list.append(part_vocab)

        logger.info(f"Loaded partially calculated vocabs from {path_to_dump}")
    else:
        logger.info("Calculating vocabulary from scratch")
        if os.path.exists(path_to_dump):
            shutil.rmtree(path_to_dump)
        os.makedirs(path_to_dump)
        task_list = create_initial_partial_vocabs(all_files, path_to_dump)
        open(dumps_valid_file, 'a').close()

    num_mergers = multiprocessing.cpu_count()
    logger.info(
        f"Using {num_mergers} mergers, number of partial vocabs: {len(task_list)}"
    )
    queue_size.value = len(task_list)
    merger_counter = AtomicInteger(num_mergers)
    tasks_queues, chunk_sizes = mapify_tasks(task_list)
    chunk_queue, chunk_queue_size = create_chunk_queue(chunk_sizes,
                                                       num_mergers)
    logger.info('==================    Starting merging    =================')
    logger.info(f'Merges need to be done: {chunk_queue_size}')
    chunk_queue_elm_counter = AtomicInteger(chunk_queue_size)
    mergers = [
        VocabMerger(i + 1, tasks_queues, path_to_dump, merger_counter,
                    chunk_queue, chunk_queue_elm_counter)
        for i in range(num_mergers)
    ]
    for merger in mergers:
        merger.start()

    for i, merger in enumerate(mergers):
        merger.join()
        logger.info(f'Merger {merger.id} has joined ({i+1}/{len(mergers)})')

    os.kill(os.getpid(), signal.SIGTERM)
def file_generator(full_src_dir, extension, percent, start_from):
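    """Return a lazy sequence of the files under `full_src_dir` that end with
    `extension` and belong to the selected fraction of the data.
    """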
    return file_mapper(
        full_src_dir, lambda l: l, lambda fi: fi.endswith(extension) and
        fractions_manager.included_in_fraction(fi, percent, start_from))
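
A small usage sketch; the directory, extension and fraction values below are illustrative assumptions, not taken from the original code:

for file in file_generator('/data/repr/train', 'parsed.repr',
                           percent=1.0, start_from=0.0):
    print(file)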