import os
import re

import psutil
from tqdm import tqdm

# write_csv, get_data, pathoo, VENUES and the name_and_ids regex are helpers
# defined elsewhere in this module.


def __store_data(rows: list, output_length: int, chunks: int, saved_chunks: int, output_dir: str, fieldnames: list) -> tuple:
    # Flush when a full chunk has accumulated, or when the rows still to be
    # saved are fewer than one chunk and are all already buffered.
    data_about_to_end = (output_length - saved_chunks) < chunks and (output_length - saved_chunks) == len(rows)
    if len(rows) == chunks or data_about_to_end:
        saved_chunks = saved_chunks + chunks if not data_about_to_end else output_length
        filename = f'{saved_chunks}.csv'
        output_path = os.path.join(output_dir, filename)
        write_csv(path=output_path, datalist=rows, fieldnames=fieldnames)
        rows = list()
    return rows, saved_chunks
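A minimal driver sketch (not from the source) showing how the helper is meant to be called in a loop; the dataset and chunk size are invented, and the module's write_csv helper is assumed to be available:

def demo_store_data(all_rows: list, output_dir: str, fieldnames: list):
    # Illustrative only: buffer rows one at a time and let __store_data
    # decide when to flush (every `chunks` rows, plus the short tail).
    rows, saved_chunks, chunks = list(), 0, 100  # invented chunk size
    for row in all_rows:
        rows.append(row)
        rows, saved_chunks = __store_data(
            rows, len(all_rows), chunks, saved_chunks, output_dir, fieldnames)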
def __dump_if_chunk_size(chunk: dict, existing_files: set, pid: psutil.Process) -> dict:
    # Flush every buffered file to disk once the process exceeds 10 GiB of
    # resident memory, merging with any rows already written to that file.
    memory_used = pid.memory_info().rss / (1024.0 ** 3)
    if memory_used > 10:
        for filepath, dump in chunk.items():
            all_data = get_data(filepath) if os.path.exists(filepath) else list()
            all_data.extend(dump)
            write_csv(filepath, all_data)
            existing_files.add(filepath)
        return dict()
    return chunk
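A hedged usage sketch with invented rows: entries are buffered per output file, and the whole buffer is flushed only when the process crosses the 10 GiB threshold checked above:

pid = psutil.Process(os.getpid())
chunk, existing_files = dict(), set()
for row in ({'id': 'doi:10.1/a'}, {'id': 'doi:10.1/b'}):  # invented rows
    chunk.setdefault(os.path.join('out', 'example.csv'), list()).append(row)
    chunk = __dump_if_chunk_size(chunk, existing_files, pid)
# Whatever is still buffered at the end must be flushed by the caller,
# as __split_csvs_by_venues does below.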
def __split_csvs_by_venues(files: list, venues_occurrences: dict, output_dir: str, pid: psutil.Process, verbose: bool):
    pathoo(output_dir)  # create output_dir if it does not exist
    if verbose:
        print('[INFO:prepare_multiprocess] Splitting CSVs by venue')
        pbar = tqdm(total=len(files))
    chunk_venues = dict()
    chunk_no_venues = dict()
    existing_files = set()
    no_venues_outdata = list()
    counter = 0
    for file in files:
        data = get_data(file)
        for row in data:
            venues = list()
            if row['type'] in VENUES:
                venues.append(row['id'].split())
            venue_and_ids = re.search(name_and_ids, row['venue'])
            if venue_and_ids:
                ids = venue_and_ids.group(2).split()
                venues.append(ids)
            if venues:
                output_filepath = None
                # Reuse an already-created file if any id of this venue
                # (or an id co-occurring with it) has been seen before.
                for venue_ids in venues:
                    all_ids: list = venue_ids
                    all_ids.extend(__find_all_ids_by_key(venues_occurrences, key=all_ids[0]))
                    for any_id in all_ids:
                        filename = any_id.replace(':', '').replace('/', '').replace('\\', '')
                        filepath = os.path.join(output_dir, f'{filename}.csv')
                        if filepath in existing_files:
                            output_filepath = filepath
                # Otherwise derive the filename from the first id of the last
                # id group (all_ids still holds it after the loop above).
                filename = all_ids[0].replace(':', '').replace('/', '').replace('\\', '')
                output_filepath = os.path.join(output_dir, f'{filename}.csv') if not output_filepath else output_filepath
                chunk_venues.setdefault(output_filepath, list()).append(row)
                chunk_venues = __dump_if_chunk_size(chunk_venues, existing_files, pid)
            else:
                no_venues_outdata.append(row)
                if len(no_venues_outdata) == 1000:
                    no_venues_filepath = os.path.join(output_dir, f'no_venues_{counter}.csv')
                    chunk_no_venues[no_venues_filepath] = no_venues_outdata
                    counter += 1
                    no_venues_outdata = list()
                chunk_no_venues = __dump_if_chunk_size(chunk_no_venues, existing_files, pid)
        if verbose:
            pbar.update()
    if verbose:
        pbar.close()
    if no_venues_outdata:
        no_venues_filepath = os.path.join(output_dir, f'no_venues_{counter}.csv')
        chunk_no_venues[no_venues_filepath] = no_venues_outdata
    # Final flush of whatever is still buffered in memory.
    for chunk in [chunk_venues, chunk_no_venues]:
        for filepath, dump in chunk.items():
            all_data = get_data(filepath) if os.path.exists(filepath) else list()
            all_data.extend(dump)
            write_csv(filepath, all_data)
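A hypothetical invocation with invented paths; venues_occurrences is the id-occurrence mapping that __find_all_ids_by_key expects, built elsewhere in the module:

__split_csvs_by_venues(
    files=[os.path.join('input', '0.csv'), os.path.join('input', '1.csv')],
    venues_occurrences=dict(),  # placeholder; normally precomputed
    output_dir='venues_split',
    pid=psutil.Process(os.getpid()),
    verbose=True)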
def __split_in_chunks(output_dir: str, chunk_size: int, verbose: bool):
    files = os.listdir(output_dir)
    if verbose:
        print('[INFO:prepare_multiprocess] Splitting CSVs in chunks')
        pbar = tqdm(total=len(files))
    even_chunk = list()
    counter = 0
    for file in files:
        filepath = os.path.join(output_dir, file)
        data = get_data(filepath)
        len_data = len(data)
        # Write full-size chunks straight out, then pool any remainder with
        # leftovers from other files until a full chunk accumulates.
        while len_data > chunk_size:
            write_csv(os.path.join(output_dir, f'{counter}.csv'), data[:chunk_size])
            counter += 1
            del data[:chunk_size]
            len_data = len(data)
        even_chunk.extend(data)
        if len(even_chunk) >= chunk_size:
            write_csv(os.path.join(output_dir, f'{counter}.csv'), even_chunk)
            counter += 1
            even_chunk = list()
        os.remove(filepath)
        if verbose:
            pbar.update()
    if verbose:
        pbar.close()
    if even_chunk:
        write_csv(os.path.join(output_dir, f'{counter}.csv'), even_chunk)
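This pass is meant to run on the directory the per-venue split just produced, rebalancing it into files of roughly chunk_size rows; a hypothetical call with the invented directory from above:

__split_in_chunks(output_dir='venues_split', chunk_size=1000, verbose=True)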
def __store_csv(self, counter: int) -> None:
    if counter != 0 and counter % self.items_per_file == 0:
        # Rotate to a new subdirectory once the current one holds a
        # multiple of dir_split_number files.
        if os.path.exists(self.cur_output_dir):
            if len(os.listdir(self.cur_output_dir)) % self.dir_split_number == 0:
                cur_dir = str(int(counter - self.items_per_file + self.dir_split_number * self.items_per_file))
                self.cur_output_dir = os.path.join(self.output_csv_dir, cur_dir)
        path = os.path.join(self.cur_output_dir, f'{counter}.csv')
        fieldnames = [
            'id', 'title', 'author', 'pub_date', 'venue', 'volume',
            'issue', 'page', 'type', 'publisher', 'editor'
        ]
        write_csv(path=path, datalist=self.data, fieldnames=fieldnames)
        self.data = list()
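A worked example of the directory-rotation arithmetic alone, with invented settings; it suggests each subdirectory is named after the largest counter it is meant to hold:

items_per_file, dir_split_number = 1000, 10  # invented settings
for counter in (1000, 11000, 21000):
    cur_dir = counter - items_per_file + dir_split_number * items_per_file
    print(counter, '->', cur_dir)  # 1000 -> 10000, 11000 -> 20000, 21000 -> 30000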
def __save_relevant_venues(items_by_id: dict, items_per_file: int, output_dir: str, fieldnames: list):
    output_dir = os.path.join(output_dir, 'venues')
    pathoo(output_dir)  # make sure the venues subdirectory exists
    rows = list()
    counter = 0
    for item_id, data in items_by_id.items():
        item_type = data['type']
        row = dict()
        name, ids = __get_name_and_ids(item_id, data)
        row['id'] = ids
        row['title'] = name
        row['type'] = item_type
        for volume, volume_issues in data['volume'].items():
            volume_row = dict()
            volume_row['volume'] = volume
            volume_row['venue'] = f'{name} [{ids}]'
            if volume_issues:
                volume_row['type'] = 'journal issue'
                for volume_issue in volume_issues:
                    volume_issue_row = dict(volume_row)
                    volume_issue_row['issue'] = volume_issue
                    rows.append(volume_issue_row)
            else:
                volume_row['type'] = 'journal volume'
                rows.append(volume_row)
        for venue_issue in data['issue']:
            issue_row = dict()
            issue_row['venue'] = f'{name} [{ids}]'
            issue_row['issue'] = venue_issue
            issue_row['type'] = 'journal issue'
            rows.append(issue_row)
        if not data['volume'] and not data['issue']:
            rows.append(row)
        if len(rows) >= items_per_file:
            output_path = os.path.join(output_dir, f"{counter}.csv")
            write_csv(output_path, rows, fieldnames)
            rows = list()
            counter += 1
    if rows:
        output_path = os.path.join(output_dir, f"{counter}.csv")
        write_csv(output_path, rows, fieldnames)
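A hedged sketch of the expected input shape, inferred from the reads above (data['type'], data['volume'] as a volume-to-issues mapping, data['issue']); the venue id and all values are invented, and __get_name_and_ids is assumed to resolve the display name and ids:

fieldnames = ['id', 'title', 'author', 'pub_date', 'venue', 'volume',
              'issue', 'page', 'type', 'publisher', 'editor']
items_by_id = {
    'issn:0000-0000': {                        # invented venue
        'type': 'journal',
        'volume': {'1': ['1', '2'], '2': []},  # volume -> its issues
        'issue': ['S1'],                       # issues not tied to a volume
    },
}
__save_relevant_venues(items_by_id, items_per_file=10000,
                       output_dir='out', fieldnames=fieldnames)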