def preprocess_and_write(params):
    from logrec.properties import REWRITE_PARSED_FILE

    src_dir, dest_dir, train_test_valid, project, preprocessing_param_dict = params
    full_dest_dir = os.path.join(dest_dir, train_test_valid)
    path_to_preprocessed_file = os.path.join(full_dest_dir, f'{project}.{EXTENSION}')
    if not os.path.exists(full_dest_dir):
        os.makedirs(full_dest_dir, exist_ok=True)
    if not REWRITE_PARSED_FILE and os.path.exists(path_to_preprocessed_file):
        logger.warning(f"File {path_to_preprocessed_file} already exists! Doing nothing.")
        return

    dir_with_files_to_preprocess = os.path.join(src_dir, train_test_valid, project)
    if not os.path.exists(dir_with_files_to_preprocess):
        logger.error(f"Path {dir_with_files_to_preprocess} does not exist")
        exit(2)

    filenames = []
    with gzip.GzipFile(f'{path_to_preprocessed_file}.part', 'wb') as f:
        total_files = sum(one for one in file_mapper(dir_with_files_to_preprocess, lambda path: 1))
        logger.info(f"Preprocessing java files from {dir_with_files_to_preprocess}. "
                    f"Files to process: {total_files}")
        pickle.dump(preprocessing_param_dict, f, pickle.HIGHEST_PROTOCOL)
        for ind, (lines_from_file, file_path) in enumerate(
                file_mapper(dir_with_files_to_preprocess, read_file_contents)):
            if (ind + 1) % 100 == 0:
                logger.info(f"[{os.path.join(train_test_valid, project)}] "
                            f"Parsed {ind + 1} out of {total_files} files "
                            f"({(ind + 1) / float(total_files) * 100:.2f}%)")
            parsed = apply_preprocessors(from_file(lines_from_file), pp_params["preprocessors"],
                                         {'interesting_context_words': []})
            pickle.dump(parsed, f, pickle.HIGHEST_PROTOCOL)
            filename = os.path.relpath(file_path, start=dir_with_files_to_preprocess)
            filenames.append(filename)

    with open(os.path.join(full_dest_dir, f'.{project}.{FILENAMES_EXTENSION}'), "w") as f:
        for filename in filenames:
            try:
                f.write(f"{filename}\n")
            except UnicodeEncodeError:
                f.write("<bad encoding>\n")
                logger.warning("Filename has bad encoding")

    # remove .part to show that all raw files in this project have been preprocessed
    os.rename(f'{path_to_preprocessed_file}.part', path_to_preprocessed_file)
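
# A minimal sketch (not part of the pipeline) of how a file produced by
# preprocess_and_write() can be read back: the first pickled object is the
# preprocessing-param dict, followed by one pickled parsed entry per source file.
# The helper name `read_preprocessed` is hypothetical.
def read_preprocessed(path_to_preprocessed_file):
    import gzip
    import pickle

    parsed_entries = []
    with gzip.GzipFile(path_to_preprocessed_file, 'rb') as f:
        param_dict = pickle.load(f)  # the dict dumped before the per-file entries
        while True:
            try:
                parsed_entries.append(pickle.load(f))
            except EOFError:  # end of the pickle stream
                break
    return param_dict, parsed_entries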
def run(dataset: str, repr: str, classifier: str):
    from logrec.classifier.context_datasets import ContextsDataset

    PrepConfig.assert_classification_config(repr)

    path_to_dataset = os.path.join(DEFAULT_PARSED_DATASETS_DIR, dataset)
    full_src_dir = os.path.join(path_to_dataset, REPR_DIR, repr)
    dest_dir = os.path.join(path_to_dataset, CLASSIFICATION_DIR, classifier, repr)
    logger.info(f"Writing to {dest_dir}")

    os.makedirs(os.path.join(dest_dir, TRAIN_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, TEST_DIR), exist_ok=True)
    os.makedirs(os.path.join(dest_dir, VALID_DIR), exist_ok=True)

    total_files = sum(file_mapper(full_src_dir, lambda f: 1, lambda fi: fi.endswith("parsed.repr")))
    count = 0

    cases_creator = get_cases_creator(classifier)
    for lines, rel_path in file_mapper(full_src_dir, cases_creator, lambda fi: fi.endswith("parsed.repr")):
        count += 1
        logger.info(f"Processing {count} out of {total_files}")
        forward_path = os.path.join(dest_dir,
                                    re.sub("parsed\\.repr", ContextsDataset.FW_CONTEXTS_FILE_EXT, rel_path))
        backward_path = os.path.join(dest_dir,
                                     re.sub("parsed\\.repr", ContextsDataset.BW_CONTEXTS_FILE_EXT, rel_path))
        label_path = os.path.join(dest_dir,
                                  re.sub("parsed\\.repr", ContextsDataset.LABEL_FILE_EXT, rel_path))
        with open(forward_path, 'w') as f, open(backward_path, 'w') as b, open(label_path, 'w') as l:
            for line in lines:
                if line:
                    l.write(f'{line[2]}\n')
                    f.write(f'{" ".join(line[0])}\n')
                    b.write(f'{" ".join(line[1])}\n')
                else:
                    l.write('\n')
                    f.write('\n')
                    b.write('\n')
def create_df_gen(dir: str, percent: float, start_from: float, backwards: bool) \
        -> Generator[pandas.DataFrame, None, None]:
    lines = []
    files_total = sum(f for f in file_mapper(dir, include_to_df_tester(percent, start_from),
                                             extension=None, ignore_prefix="_"))
    DATAFRAME_LINES_THRESHOLD = 3000
    cur_file = 0
    at_least_one_frame_created = False
    for root, dirs, files in os.walk(dir):
        for file in files:
            if not include_to_df(file, percent, start_from):
                continue

            cur_file += 1
            logger.debug(f'Adding {os.path.join(root, file)} to dataframe '
                         f'[{cur_file} out of {files_total}]')
            with open(os.path.join(root, file), 'r') as f:
                for line in f:
                    if backwards:
                        line = reverse_line(line)
                    lines.append(line)
                    if len(lines) > DATAFRAME_LINES_THRESHOLD:
                        logger.debug("Submitting DataFrame...")
                        yield pandas.DataFrame(lines)
                        lines = []
                        at_least_one_frame_created = True
    if lines:
        yield pandas.DataFrame(lines)
        at_least_one_frame_created = True
    if not at_least_one_frame_created:
        raise ValueError(f"No data available: {os.path.abspath(dir)}")
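
# A small usage sketch (hypothetical, not part of the pipeline) showing how the
# generator returned by create_df_gen() is meant to be consumed: DataFrames of at
# most DATAFRAME_LINES_THRESHOLD + 1 lines arrive one by one. The function name
# `count_lines_in_dir` is illustrative, and percent=100.0 / start_from=0.0 are
# assumed to cover the whole fraction range.
def count_lines_in_dir(dir: str) -> int:
    total = 0
    for df in create_df_gen(dir, 100.0, 0.0, backwards=False):
        total += len(df)  # each chunk is a one-column DataFrame of raw lines
    return total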
def show_tests(path_to_test_set: str, model: SequentialRNN, text_field: Field,
               sample_test_runs_file: str, backwards: bool, n_predictions: int,
               n_samples: int) -> None:
    logger.info("================ Running tests ============")
    counter = 0
    text = ""
    stop_showing_examples = False
    for c_filename_before, c_filename_after, l_filename in file_mapper(path_to_test_set,
                                                                       ContextsDataset._get_pair,
                                                                       lambda fi: fi.endswith('label')):
        if stop_showing_examples:
            break

        c_file_before = None
        c_file_after = None
        l_file = None
        try:
            c_file_before = open(c_filename_before, 'r')
            c_file_after = open(c_filename_after, 'r')
            l_file = open(l_filename, 'r')
            for context_before, context_after, label in zip(c_file_before, c_file_after, l_file):
                if label.rstrip('\n') == '':
                    continue
                if counter >= n_samples:
                    stop_showing_examples = True
                    break

                context_before = context_before.rstrip("\n")
                context_after = context_after.rstrip("\n")
                prepared_input = prepare_input(context_before, context_after, backwards)
                formatted_input = format_input(context_before, context_after, backwards)
                probs, labels = get_predictions(model, text_field, prepared_input, n_predictions)
                formatted_predictions = format_predictions(probs, labels, LEVEL_LABEL, label.rstrip("\n"))
                logger.info(formatted_input + formatted_predictions)
                text += (formatted_input + formatted_predictions)
                counter += 1
        except FileNotFoundError:
            project_name = c_filename_before[:-len(ContextsDataset.FW_CONTEXTS_FILE_EXT)]
            logger.error(f"Project context not loaded: {project_name}")
            continue
        finally:
            if c_file_before is not None:
                c_file_before.close()
            if c_file_after is not None:
                c_file_after.close()
            if l_file is not None:
                l_file.close()

    logger.info(f"Saving test output to {sample_test_runs_file}")
    with open(sample_test_runs_file, 'w') as f:
        f.write(text)
def calc_stats(dest_dir, threshold):
    projects_to_ignore = []
    res_logged_stats = {}
    for logged_stats, proj_name in file_mapper(dest_dir, calc_logged_stats, ContextsDataset.LABEL_FILE_EXT):
        logged_fraction = float(logged_stats[WITH_LOGGING]) / (logged_stats[WITH_LOGGING] + logged_stats[NO_LOGGING])
        if logged_fraction <= (threshold * 0.01):
            projects_to_ignore.append(proj_name)
        else:
            merge_dicts_(res_logged_stats, logged_stats)
    return projects_to_ignore, res_logged_stats
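
# A hedged sketch of how the output of calc_stats() could be persisted so that
# ContextsDataset (below) can pick it up: projects whose fraction of logged contexts
# is at or below `threshold` percent are written out under the
# "<IGNORED_PROJECTS_FILE_NAME>.<threshold>" name that the dataset later reads.
# The helper name `write_ignored_projects`, the `metadata_dir` argument, and the
# one-project-per-line format expected by read_list() are assumptions.
def write_ignored_projects(dest_dir, metadata_dir, threshold):
    projects_to_ignore, res_logged_stats = calc_stats(dest_dir, threshold)
    path = os.path.join(metadata_dir, f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    with open(path, 'w') as f:
        for proj_name in projects_to_ignore:
            f.write(f"{proj_name}\n")
    return res_logged_stats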
def __init__(self, path, text_field, label_field, **kwargs):
    """Create a ContextsDataset instance given a path and fields.

    Arguments:
        path: Path to the dataset's highest-level directory.
        text_field: The field that will be used for text data.
        label_field: The field that will be used for label data.
        Remaining keyword arguments: Passed to the constructor of data.Dataset.
    """
    threshold = kwargs.pop("threshold", 0.0)
    context_len = kwargs.pop("context_len", 0)
    data_params = kwargs.pop("data", None)

    path_to_ignored_projects = os.path.join(path, '..', '..', '..',
                                            f"{IGNORED_PROJECTS_FILE_NAME}.{threshold}")
    logger.info(f"Loading ignored projects from {path_to_ignored_projects} ...")
    ignored_projects_set = set(read_list(path_to_ignored_projects))

    fields = [('text', text_field), ('label', label_field)]
    examples = []

    for c_filename_before, c_filename_after, l_filename in file_mapper(path, ContextsDataset._get_pair,
                                                                       lambda fi: fi.endswith('label')):
        if not included_in_fraction(os.path.basename(l_filename), data_params.percent,
                                    data_params.start_from):
            continue

        proj_name = re.sub(f"\\.{ContextsDataset.LABEL_FILE_EXT}$", "", get_dir_and_file(l_filename))
        if proj_name in ignored_projects_set:
            continue

        c_file_before = None
        c_file_after = None
        l_file = None
        try:
            c_file_before = open(c_filename_before, 'r')
            c_file_after = open(c_filename_after, 'r')
            l_file = open(l_filename, 'r')
            for context_before, context_after, level in zip(c_file_before, c_file_after, l_file):
                level = level.rstrip('\n')
                if level:
                    context_for_prediction = ContextsDataset._get_context_for_prediction(
                        context_before, context_after, context_len, data_params.backwards)
                    example = data.Example.fromlist([context_for_prediction, level], fields)
                    examples.append(example)
        except FileNotFoundError:
            project_name = c_filename_before[:-len(ContextsDataset.FW_CONTEXTS_FILE_EXT)]
            logger.error(f"Project context not loaded: {project_name}")
            continue
        finally:
            if c_file_before is not None:
                c_file_before.close()
            if c_file_after is not None:
                c_file_after.close()
            if l_file is not None:
                l_file.close()

    if not examples:
        raise ValueError(f"Examples list is empty. "
                         f"(percent={data_params.percent}, start from={data_params.start_from})")

    random.shuffle(examples)
    logger.debug(f"Number of examples gathered from {path}: {len(examples)}")
    super(ContextsDataset, self).__init__(examples, fields, **kwargs)
def run(full_src_dir, full_metadata_dir):
    if not os.path.exists(full_src_dir):
        logger.error(f"Dir does not exist: {full_src_dir}")
        exit(3)

    if os.path.exists(os.path.join(full_metadata_dir, 'vocabsize')):
        logger.warning(f"File already exists: {os.path.join(full_metadata_dir, 'vocabsize')}. Doing nothing.")
        exit(0)

    logger.info(f"Reading files from: {os.path.abspath(full_src_dir)}")
    all_files = list(file_mapper(full_src_dir, lambda l: l, REPR_EXTENSION))
    if not all_files:
        logger.warning("No preprocessed files found.")
        exit(4)

    path_to_dump = os.path.join(full_metadata_dir, 'part_vocab')
    dumps_valid_file = os.path.join(path_to_dump, 'ready')

    if os.path.exists(dumps_valid_file):
        all_files = list(file_mapper(path_to_dump, lambda l: l, PARTVOCAB_EXT))
        task_list = []
        removed_files = []
        for file in all_files:
            # not a very robust way of checking whether dumping of this backup file
            # terminated properly: an '_' in the basename means it may not have
            if '_' in os.path.basename(file):
                file, rm_files = finish_file_dumping(file)
                removed_files.extend(list(rm_files))
            if file not in removed_files:
                with open(file, 'rb') as fh:
                    part_vocab = pickle.load(fh)
                if not isinstance(part_vocab, PartialVocab):
                    raise TypeError(f"Object {part_vocab} must be a PartialVocab, "
                                    f"got {type(part_vocab).__name__}")
                task_list.append(part_vocab)

        logger.info(f"Loaded partially calculated vocabs from {path_to_dump}")
    else:
        logger.info("Calculating vocabulary from scratch")
        if os.path.exists(path_to_dump):
            shutil.rmtree(path_to_dump)
        os.makedirs(path_to_dump)
        task_list = create_initial_partial_vocabs(all_files, path_to_dump)
        open(dumps_valid_file, 'a').close()

    num_mergers = multiprocessing.cpu_count()
    logger.info(f"Using {num_mergers} mergers, number of partial vocabs: {len(task_list)}")
    queue_size.value = len(task_list)
    merger_counter = AtomicInteger(num_mergers)
    tasks_queues, chunk_sizes = mapify_tasks(task_list)
    chunk_queue, chunk_queue_size = create_chunk_queue(chunk_sizes, num_mergers)
    logger.info('================== Starting merging =================')
    logger.info(f'Merges need to be done: {chunk_queue_size}')
    chunk_queue_elm_counter = AtomicInteger(chunk_queue_size)
    mergers = [VocabMerger(i + 1, tasks_queues, path_to_dump, merger_counter,
                           chunk_queue, chunk_queue_elm_counter)
               for i in range(num_mergers)]
    for merger in mergers:
        merger.start()
    for i, merger in enumerate(mergers):
        merger.join()
        logger.info(f'Merger {merger.id} has joined ({i + 1}/{len(mergers)})')

    os.kill(os.getpid(), signal.SIGTERM)
def file_generator(full_src_dir, extension, percent, start_from):
    return file_mapper(full_src_dir, lambda l: l,
                       lambda fi: fi.endswith(extension)
                       and fractions_manager.included_in_fraction(fi, percent, start_from))
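
# A small usage sketch (hypothetical): since the mapping lambda is the identity,
# file_generator() yields the paths of files that match both the extension and the
# requested fraction, so counting them is just exhausting the generator. The helper
# name `count_matching_files` is illustrative only.
def count_matching_files(full_src_dir, extension, percent, start_from):
    return sum(1 for _ in file_generator(full_src_dir, extension, percent, start_from))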