import os
import re

import psutil
from tqdm import tqdm

# get_data, write_csv, pathoo, VENUES, name_and_ids, __find_all_ids_by_key and
# __get_name_and_ids are assumed to be defined elsewhere in this module.


def __store_data(rows: list, output_length: int, chunks: int, saved_chunks: int, output_dir: str, fieldnames: list) -> tuple:
    # Flush the accumulated rows once a full chunk has been collected, or once
    # the remaining rows are fewer than a chunk and all of them are buffered.
    data_about_to_end = (output_length - saved_chunks) < chunks and (output_length - saved_chunks) == len(rows)
    if len(rows) == chunks or data_about_to_end:
        saved_chunks = output_length if data_about_to_end else saved_chunks + chunks
        output_path = os.path.join(output_dir, f'{saved_chunks}.csv')
        write_csv(path=output_path, datalist=rows, fieldnames=fieldnames)
        rows = list()
    return rows, saved_chunks
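# Hedged usage sketch for __store_data: a hypothetical driver loop showing how
# the (rows, saved_chunks) pair is threaded through successive calls so that a
# CSV is written every `chunks` records. The record source and the chunk size
# of 100 are illustrative, not taken from this module.
def _example_store_data_loop(records: list, output_dir: str, fieldnames: list):
    rows, saved_chunks, chunks = list(), 0, 100
    for record in records:
        rows.append(record)
        rows, saved_chunks = __store_data(
            rows, len(records), chunks, saved_chunks, output_dir, fieldnames)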
def __dump_if_chunk_size(chunk: dict, existing_files: set, pid: psutil.Process) -> dict:
    # If the process is using more than 10 GiB of resident memory, flush every
    # buffered chunk to disk (merging with any rows already in the target file)
    # and return an empty dict; otherwise return the chunk untouched.
    memory_used = pid.memory_info().rss / (1024.0 ** 3)
    if memory_used > 10:
        for filepath, dump in chunk.items():
            all_data = get_data(filepath) if os.path.exists(filepath) else list()
            all_data.extend(dump)
            write_csv(filepath, all_data)
            existing_files.add(filepath)
        return dict()
    return chunk
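# Hedged sketch of how __dump_if_chunk_size is meant to be driven:
# psutil.Process(os.getpid()) is a handle to the running process, whose resident
# memory the function compares against its hardcoded 10 GiB threshold. The
# function and variable names here are hypothetical.
def _example_memory_guard(chunk: dict, existing_files: set) -> dict:
    current_process = psutil.Process(os.getpid())
    # Returns an empty dict if the buffers were flushed, the same dict otherwise.
    return __dump_if_chunk_size(chunk, existing_files, current_process)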
def __split_csvs_by_venues(files: list, venues_occurrences: dict, output_dir: str, pid: psutil.Process, verbose: bool):
    pathoo(output_dir)
    if verbose:
        print('[INFO:prepare_multiprocess] Splitting CSVs by venue')
        pbar = tqdm(total=len(files))
    chunk_venues = dict()
    chunk_no_venues = dict()
    existing_files = set()
    no_venues_outdata = list()
    counter = 0
    for file in files:
        data = get_data(file)
        for row in data:
            venues = list()
            if row['type'] in VENUES:
                venues.append(row['id'].split())
            venue_and_ids = re.search(name_and_ids, row['venue'])
            if venue_and_ids:
                ids = venue_and_ids.group(2).split()
                venues.append(ids)
            if venues:
                # Route the row to the CSV named after the first id of its venue
                # cluster, reusing an existing file if any related id already has one.
                output_filepath = None
                for venue_ids in venues:
                    all_ids: list = venue_ids
                    all_ids.extend(__find_all_ids_by_key(venues_occurrences, key=all_ids[0]))
                    for any_id in all_ids:
                        filename = any_id.replace(':', '').replace('/', '').replace('\\', '')
                        if os.path.join(output_dir, f'{filename}.csv') in existing_files:
                            output_filepath = os.path.join(output_dir, f'{filename}.csv')
                filename = all_ids[0].replace(':', '').replace('/', '').replace('\\', '')
                output_filepath = os.path.join(output_dir, f'{filename}.csv') if not output_filepath else output_filepath
                chunk_venues.setdefault(output_filepath, list()).append(row)
                chunk_venues = __dump_if_chunk_size(chunk_venues, existing_files, pid)
            else:
                # Rows without a venue are batched into numbered no_venues_*.csv files.
                no_venues_outdata.append(row)
                if len(no_venues_outdata) == 1000:
                    no_venues_filepath = os.path.join(output_dir, f'no_venues_{counter}.csv')
                    chunk_no_venues[no_venues_filepath] = no_venues_outdata
                    counter += 1
                    no_venues_outdata = list()
                chunk_no_venues = __dump_if_chunk_size(chunk_no_venues, existing_files, pid)
        if verbose:
            pbar.update()
    if verbose:
        pbar.close()
    if no_venues_outdata:
        no_venues_filepath = os.path.join(output_dir, f'no_venues_{counter}.csv')
        chunk_no_venues[no_venues_filepath] = no_venues_outdata
    # Flush whatever is still buffered, merging with files written earlier.
    for chunk in [chunk_venues, chunk_no_venues]:
        for filepath, dump in chunk.items():
            all_data = get_data(filepath) if os.path.exists(filepath) else list()
            all_data.extend(dump)
            write_csv(filepath, all_data)
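# Hedged invocation sketch for __split_csvs_by_venues. The directory layout is
# hypothetical, and `venues_occurrences` is assumed to be whatever index
# __find_all_ids_by_key consumes (a mapping from a venue id to its other ids).
def _example_split_by_venues(input_dir: str, output_dir: str, venues_occurrences: dict):
    files = [os.path.join(input_dir, f) for f in os.listdir(input_dir) if f.endswith('.csv')]
    __split_csvs_by_venues(files, venues_occurrences, output_dir,
                           pid=psutil.Process(os.getpid()), verbose=True)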
def __split_in_chunks(output_dir: str, chunk_size: int, verbose: bool):
    files = os.listdir(output_dir)
    if verbose:
        print('[INFO:prepare_multiprocess] Splitting CSVs in chunks')
        pbar = tqdm(total=len(files))
    even_chunk = list()
    counter = 0
    for file in files:
        filepath = os.path.join(output_dir, file)
        data = get_data(filepath)
        # Write out as many full-sized chunks as the file contains...
        while len(data) > chunk_size:
            write_csv(os.path.join(output_dir, f'{counter}.csv'), data[:chunk_size])
            counter += 1
            del data[:chunk_size]
        # ...then buffer the remainder, flushing once a full chunk accumulates.
        even_chunk.extend(data)
        if len(even_chunk) >= chunk_size:
            write_csv(os.path.join(output_dir, f'{counter}.csv'), even_chunk)
            counter += 1
            even_chunk = list()
        os.remove(filepath)
        if verbose:
            pbar.update()
    if verbose:
        pbar.close()
    if even_chunk:
        write_csv(os.path.join(output_dir, f'{counter}.csv'), even_chunk)
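# Hedged sketch: after the venue split, the same directory can be re-partitioned
# into uniformly sized files. The 10_000-row chunk size is illustrative only.
def _example_rechunk(output_dir: str):
    __split_in_chunks(output_dir, chunk_size=10_000, verbose=True)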
def __store_csv(self, counter: int) -> None:
    # Flush the buffered rows every items_per_file records, moving to a new
    # output subdirectory once the current one holds a multiple of
    # dir_split_number files.
    if counter != 0 and counter % self.items_per_file == 0:
        if os.path.exists(self.cur_output_dir):
            if len(os.listdir(self.cur_output_dir)) % self.dir_split_number == 0:
                cur_dir = str(counter - self.items_per_file + self.dir_split_number * self.items_per_file)
                self.cur_output_dir = os.path.join(self.output_csv_dir, cur_dir)
        path = os.path.join(self.cur_output_dir, f'{counter}.csv')
        fieldnames = [
            'id', 'title', 'author', 'pub_date', 'venue', 'volume',
            'issue', 'page', 'type', 'publisher', 'editor']
        write_csv(path=path, datalist=self.data, fieldnames=fieldnames)
        self.data = list()
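# Hedged sketch: __store_csv above is written as a method, so it presumably lives
# on a class exposing the attributes below. The class name and default values are
# hypothetical; only the attribute names are taken from the method body.
class _ExampleCsvStorer:
    def __init__(self, output_csv_dir: str, items_per_file: int = 1000, dir_split_number: int = 10):
        self.output_csv_dir = output_csv_dir      # root of the CSV output tree
        self.cur_output_dir = output_csv_dir      # subdirectory currently being filled
        self.items_per_file = items_per_file      # rows written per CSV file
        self.dir_split_number = dir_split_number  # files allowed per subdirectory
        self.data = list()                        # rows buffered since the last flush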
def __save_relevant_venues(items_by_id: dict, items_per_file: int, output_dir: str, fieldnames: list):
    output_dir = os.path.join(output_dir, 'venues')
    rows = list()
    counter = 0
    for item_id, data in items_by_id.items():
        item_type = data['type']
        row = dict()
        name, ids = __get_name_and_ids(item_id, data)
        row['id'] = ids
        row['title'] = name
        row['type'] = item_type
        # Volumes with issues expand into one 'journal issue' row per issue;
        # bare volumes become a single 'journal volume' row.
        for volume, volume_issues in data['volume'].items():
            volume_row = dict()
            volume_row['volume'] = volume
            volume_row['venue'] = f'{name} [{ids}]'
            if volume_issues:
                volume_row['type'] = 'journal issue'
                for volume_issue in volume_issues:
                    volume_issue_row = dict(volume_row)
                    volume_issue_row['issue'] = volume_issue
                    rows.append(volume_issue_row)
            else:
                volume_row['type'] = 'journal volume'
                rows.append(volume_row)
        # Issues that are not attached to any volume.
        for venue_issue in data['issue']:
            issue_row = dict()
            issue_row['venue'] = f'{name} [{ids}]'
            issue_row['issue'] = venue_issue
            issue_row['type'] = 'journal issue'
            rows.append(issue_row)
        # Venues with neither volumes nor issues are stored as a single row.
        if not data['volume'] and not data['issue']:
            rows.append(row)
        if len(rows) >= items_per_file:
            output_path = os.path.join(output_dir, f'{counter}.csv')
            write_csv(output_path, rows, fieldnames)
            rows = list()
            counter += 1
    if rows:  # avoid writing an empty trailing file
        output_path = os.path.join(output_dir, f'{counter}.csv')
        write_csv(output_path, rows, fieldnames)
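# Hedged sketch of the items_by_id structure consumed by __save_relevant_venues,
# inferred from the keys it reads ('type', 'volume', 'issue'); the sample ids and
# values are assumptions, as is the behaviour of __get_name_and_ids.
_example_items_by_id = {
    'issn:0138-9130': {
        'type': 'journal',
        'volume': {'105': {'1', '2'}, '106': set()},  # volume -> its issues (empty = bare volume)
        'issue': {'supplement-1'},                    # issues not attached to any volume
    },
}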