def __split_csvs_by_venues(files:list, venues_occurrences:dict, output_dir:str, pid:psutil.Process, verbose:bool):
    # Split the input rows into per-venue CSVs in output_dir: each row linked to a venue is appended
    # to a file named after one of its venue ids, while the others are buffered into
    # no_venues_{counter}.csv files; chunks are flushed to disk under memory pressure (see __dump_if_chunk_size).
    pathoo(output_dir)
    if verbose:
        print('[INFO:prepare_multiprocess] Splitting CSVs by venue')
        pbar = tqdm(total=len(files))
    chunk_venues = dict()
    chunk_no_venues = dict()
    existing_files = set()
    no_venues_outdata = list()
    counter = 0
    for file in files:
        data = get_data(file)
        for row in data:
            venues = list()
            if row['type'] in VENUES:
                venues.append(row['id'].split())
            venue_and_ids = re.search(name_and_ids, row['venue'])
            if venue_and_ids:
                ids = venue_and_ids.group(2).split()
                venues.append(ids)
            if venues:
                output_filepath = None
                for venue_ids in venues:
                    all_ids:list = venue_ids
                    all_ids.extend(__find_all_ids_by_key(venues_occurrences, key=all_ids[0]))
                    for any_id in all_ids:
                        filename = any_id.replace(':', '').replace('/', '').replace('\\', '')
                        if os.path.join(output_dir, f'{filename}.csv') in existing_files:
                            output_filepath = os.path.join(output_dir, f'{filename}.csv')
                filename = all_ids[0].replace(':', '').replace('/', '').replace('\\', '')
                output_filepath = os.path.join(output_dir, f'{filename}.csv') if not output_filepath else output_filepath
                chunk_venues.setdefault(output_filepath, list()).append(row)
                chunk_venues = __dump_if_chunk_size(chunk_venues, existing_files, pid)
            elif not venues:
                no_venues_outdata.append(row)
                if len(no_venues_outdata) == 1000:
                    no_venues_filepath = os.path.join(output_dir, f'no_venues_{counter}.csv')
                    chunk_no_venues[no_venues_filepath] = no_venues_outdata
                    counter += 1
                    no_venues_outdata = list()
                chunk_no_venues = __dump_if_chunk_size(chunk_no_venues, existing_files, pid)
        pbar.update() if verbose else None
    pbar.close() if verbose else None
    if no_venues_outdata:
        no_venues_filepath = os.path.join(output_dir, f'no_venues_{counter}.csv')
        chunk_no_venues[no_venues_filepath] = no_venues_outdata
    for chunk in [chunk_venues, chunk_no_venues]:
        for filepath, dump in chunk.items():
            all_data = get_data(filepath) if os.path.exists(filepath) else list()
            all_data.extend(dump)
            write_csv(filepath, all_data)
        del chunk
def __split_in_chunks(output_dir:str, chunk_size:int, verbose:bool):
    # Repack every CSV in output_dir into numbered files ({counter}.csv) of about chunk_size rows each,
    # carrying leftover rows over between files; the original files are deleted after processing.
    files = os.listdir(output_dir)
    if verbose:
        print('[INFO:prepare_multiprocess] Splitting CSVs in chunks')
        pbar = tqdm(total=len(files))
    even_chunk = list()
    counter = 0
    for file in files:
        filepath = os.path.join(output_dir, file)
        data = get_data(filepath)
        len_data = len(data)
        if len_data > chunk_size:
            while len_data > chunk_size:
                write_csv(os.path.join(output_dir, f'{counter}.csv'), data[:chunk_size])
                counter += 1
                del data[:chunk_size]
                len_data = len(data)
            even_chunk.extend(data)
            if len(even_chunk) >= chunk_size:
                write_csv(os.path.join(output_dir, f'{counter}.csv'), even_chunk)
                counter += 1
                even_chunk = list()
        elif len_data <= chunk_size:
            even_chunk.extend(data)
            if len(even_chunk) >= chunk_size:
                write_csv(os.path.join(output_dir, f'{counter}.csv'), even_chunk)
                counter += 1
                even_chunk = list()
        os.remove(filepath)
        pbar.update() if verbose else None
    pbar.close() if verbose else None
    if even_chunk:
        write_csv(os.path.join(output_dir, f'{counter}.csv'), even_chunk)
def __dump_if_chunk_size(chunk:dict, existing_files:set, pid:psutil.Process) -> dict:
    # Flush the accumulated rows to disk and return an empty chunk if the process uses more than
    # 10 GB of resident memory (RSS); otherwise return the chunk unchanged.
    memory_used = pid.memory_info().rss / (1024.0 ** 3)
    if memory_used > 10:
        for filepath, dump in chunk.items():
            all_data = get_data(filepath) if os.path.exists(filepath) else list()
            all_data.extend(dump)
            write_csv(filepath, all_data)
            existing_files.add(filepath)
        return dict()
    return chunk
def prepare_relevant_items(csv_dir:str, output_dir:str, items_per_file:int, verbose:bool) -> None:
    '''
    This function receives an input folder containing CSVs formatted for Meta. 
    It outputs other CSVs containing only deduplicated items.
    You can specify how many items to insert in each output file.

    :param csv_dir: the path to the folder containing the input CSV files
    :type csv_dir: str
    :param output_dir: the path of the folder where the output files will be saved
    :type output_dir: str
    :param items_per_file: the number of rows to insert in each output file
    :type items_per_file: int
    :param verbose: if True, show a progress bar with elapsed and estimated time
    :type verbose: bool
    :returns: None -- This function returns None and saves the output CSV files in the `output_dir` folder
    '''
    files = [os.path.join(csv_dir, file) for file in sort_files(os.listdir(csv_dir)) if file.endswith('.csv')]
    pbar = tqdm(total=len(files)) if verbose else None
    pathoo(output_dir)
    ids_found = set()
    venues_found = dict()
    duplicated_ids = dict()
    venues_by_id = dict()
    publishers_found = set()
    publishers_by_id = dict()
    resp_agents_found = set()
    resp_agents_by_id = dict()
    # Look for all venues, responsible agents, and publishers
    for file in files:
        data = get_data(file)
        _get_duplicated_ids(data=data, ids_found=ids_found, items_by_id=duplicated_ids)
        _get_relevant_venues(data=data, ids_found=venues_found, items_by_id=venues_by_id, overlapping_ids=ids_found)
        _get_publishers(data=data, ids_found=publishers_found, items_by_id=publishers_by_id)
        _get_resp_agents(data=data, ids_found=resp_agents_found, items_by_id=resp_agents_by_id)
        pbar.update() if verbose else None
    pbar.close() if verbose else None
    pbar = tqdm(total=len(files)) if verbose else None
    ids_merged = _do_collective_merge(duplicated_ids, verbose)
    venues_merged = _do_collective_merge(venues_by_id, verbose)
    publishers_merged = _do_collective_merge(publishers_by_id, verbose)
    resp_agents_merged = _do_collective_merge(resp_agents_by_id, verbose)
    fieldnames = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', 'publisher', 'editor']
    __save_relevant_venues(venues_merged, items_per_file, output_dir, fieldnames)
    __save_ids(ids_merged, items_per_file, output_dir, fieldnames)
    __save_responsible_agents(resp_agents_merged, items_per_file, output_dir, fieldnames, ('people', 'author'))
    __save_responsible_agents(publishers_merged, items_per_file, output_dir, fieldnames, ('publishers', 'publisher'))
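For orientation, the call below is a minimal usage sketch of prepare_relevant_items based on the docstring above; the import path and the directory names are illustrative assumptions, not taken from the source.

if __name__ == '__main__':
    # Hypothetical import path: adjust to the module that actually defines the function.
    from prepare_multiprocess import prepare_relevant_items

    # Deduplicate the venues, ids, publishers and responsible agents found in the Meta-formatted
    # CSVs of 'input_csvs' and save them to 'relevant_items' in files of at most 1000 rows each.
    prepare_relevant_items(csv_dir='input_csvs',
                           output_dir='relevant_items',
                           items_per_file=1000,
                           verbose=True)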
 def test_split_csvs_in_chunk(self):
     CHUNK_SIZE = 4
     split_csvs_in_chunks(csv_dir=CSV_DIR, output_dir=TMP_DIR, chunk_size=CHUNK_SIZE, verbose=False)
     output = dict()
     for file in os.listdir(TMP_DIR):
         output[file] = get_data(os.path.join(TMP_DIR, file))
     expected_output = {
         '0.csv': [
             {'id': 'doi:10.17117/na.2015.08.1067', 'title': '', 'author': '', 'pub_date': '', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '26', 'issue': '', 'page': '', 'type': 'journal article', 'publisher': 'Consulting Company Ucom [crossref:6623]', 'editor': 'NAIMI, ELMEHDI [orcid:0000-0002-4126-8519]'}, 
             {'id': 'doi:10.9799/ksfan.2012.25.1.069', 'title': 'Nonthermal Sterilization and Shelf-life Extension of Seafood Products by Intense Pulsed Light Treatment', 'author': 'Cheigh, Chan-Ick [orcid:0000-0003-2542-5788]; Mun, Ji-Hye; Chung, Myong-Soo', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '25', 'issue': '1', 'page': '69-76', 'type': 'journal article', 'publisher': 'The Korean Society of Food and Nutrition [crossref:4768]', 'editor': 'Chung, Myong-Soo [orcid:0000-0002-9666-2513]'}, 
             {'id': 'doi:10.9799/ksfan.2012.25.1.069', 'title': 'Nonthermal Sterilization and Shelf-life Extension of Seafood Products by Intense Pulsed Light Treatment', 'author': 'Cheigh, Chan-Ick [orcid:0000-0003-2542-5788]; Mun, Ji-Hye; Chung, Myong-Soo', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '25', 'issue': '1', 'page': '69-76', 'type': 'journal article', 'publisher': 'Consulting Company Ucom [crossref:6623]', 'editor': 'Chung, Myong-Soo [orcid:0000-0002-9666-2513]'}, 
             {'id': 'doi:10.9799/ksfan.2012.25.1.077', 'title': 'Properties of Immature Green Cherry Tomato Pickles', 'author': 'Koh, Jong-Ho; Shin, Hae-Hun; Kim, Young-Shik [orcid:0000-0001-5673-6314]; Kook, Moo-Chang', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '', 'issue': '2', 'page': '77-82', 'type': 'journal article', 'publisher': 'The Korean Society of Food and Nutrition [crossref:4768]', 'editor': ''}],
         '1.csv': [
             {'id': 'issn:1524-4539 issn:0009-7322', 'title': 'Circulation', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': 'journal', 'publisher': '', 'editor': ''}
         ]}
     output = {k:sorted(sorted(d.items()) for d in v) for k, v in output.items()}
     expected_output = {k:sorted(sorted(d.items()) for d in v) for k, v in expected_output.items()}
     shutil.rmtree(TMP_DIR)
     self.assertEqual(output, expected_output)
def __index_all_venues(files:list, verbose:bool) -> dict:
    # Build an index that maps every venue id to the set of other ids co-occurring with it
    # in a row's 'id' or 'venue' field, i.e. the ids referring to the same venue.
    if verbose:
        print('[INFO:prepare_multiprocess] Scanning venues')
        pbar = tqdm(total=len(files))
    venues_occurrences = dict()
    for file in files:
        data = get_data(file)
        for row in data:
            venues = list()
            if row['type'] in VENUES:
                venues.append(row['id'].split())
            venue_and_ids = re.search(name_and_ids, row['venue'])
            if venue_and_ids:
                ids = venue_and_ids.group(2).split()
                venues.append(ids)
            for venue_ids in venues:
                for venue_id in venue_ids:
                    venues_occurrences.setdefault(venue_id, {'others': set()})
                    venues_occurrences[venue_id]['others'].update({other for other in venue_ids if other != venue_id})
        pbar.update() if verbose else None
    pbar.close() if verbose else None
    return venues_occurrences
Example #7
 def curate_and_create(
         self,
         filename: str,
         worker_number: int = None,
         resp_agents_only: bool = False) -> Tuple[Storer, Storer, str]:
     try:
         filepath = os.path.join(self.input_csv_dir, filename)
         data = get_data(filepath)
         supplier_prefix = f'{self.supplier_prefix}0' if worker_number is None else f'{self.supplier_prefix}{str(worker_number)}0'
         # Curator
         self.info_dir = os.path.join(
             self.info_dir,
             supplier_prefix) if worker_number else self.info_dir
         curator_info_dir = os.path.join(self.info_dir, 'curator' + os.sep)
         if resp_agents_only:
             curator_obj = RespAgentsCurator(
                 data=data,
                 ts=self.triplestore_url,
                 prov_config=self.time_agnostic_library_config,
                 info_dir=curator_info_dir,
                 base_iri=self.base_iri,
                 prefix=supplier_prefix)
         else:
             curator_obj = Curator(
                 data=data,
                 ts=self.triplestore_url,
                 prov_config=self.time_agnostic_library_config,
                 info_dir=curator_info_dir,
                 base_iri=self.base_iri,
                 prefix=supplier_prefix,
                 valid_dois_cache=self.valid_dois_cache)
         name = f"{datetime.now().strftime('%Y-%m-%dT%H_%M_%S_%f')}_{supplier_prefix}"
         curator_obj.curator(filename=name,
                             path_csv=self.output_csv_dir,
                             path_index=self.indexes_dir)
         # Creator
         creator_info_dir = os.path.join(self.info_dir, 'creator' + os.sep)
         if resp_agents_only:
             creator_obj = RespAgentsCreator(
                 data=curator_obj.data,
                 endpoint=self.triplestore_url,
                 base_iri=self.base_iri,
                 info_dir=creator_info_dir,
                 supplier_prefix=supplier_prefix,
                 resp_agent=self.resp_agent,
                 ra_index=curator_obj.index_id_ra,
                 preexisting_entities=curator_obj.preexisting_entities)
         else:
             creator_obj = Creator(
                 data=curator_obj.data,
                 endpoint=self.triplestore_url,
                 base_iri=self.base_iri,
                 info_dir=creator_info_dir,
                 supplier_prefix=supplier_prefix,
                 resp_agent=self.resp_agent,
                 ra_index=curator_obj.index_id_ra,
                 br_index=curator_obj.index_id_br,
                 re_index_csv=curator_obj.re_index,
                 ar_index_csv=curator_obj.ar_index,
                 vi_index=curator_obj.VolIss,
                 preexisting_entities=curator_obj.preexisting_entities)
         creator = creator_obj.creator(source=self.source)
         # Provenance
         prov = ProvSet(creator,
                        self.base_iri,
                        creator_info_dir,
                        wanted_label=False)
         prov.generate_provenance()
         # Storer
         res_storer = Storer(creator,
                             context_map={},
                             dir_split=self.dir_split_number,
                             n_file_item=self.items_per_file,
                             default_dir=self.default_dir,
                             output_format='json-ld')
         prov_storer = Storer(prov,
                              context_map={},
                              dir_split=self.dir_split_number,
                              n_file_item=self.items_per_file,
                              output_format='json-ld')
         with suppress_stdout():
             self.store_data_and_prov(res_storer, prov_storer, filename)
         return {'success': filename}
     except Exception as e:
         template = "An exception of type {0} occurred. Arguments:\n{1!r}"
         message = template.format(type(e).__name__, e.args)
         return {'error': filename, 'msg': message}
Example #8
 def test_run_meta_process(self):
     reset_server()
     output_folder = os.path.join(BASE_DIR, 'output_1')
     meta_process = MetaProcess(
         config=os.path.join(BASE_DIR, 'meta_config_1.yaml'))
     run_meta_process(meta_process)
     output = list()
     for dirpath, _, filenames in os.walk(os.path.join(
             output_folder, 'csv')):
         for file in filenames:
             output.extend(get_data(os.path.join(dirpath, file)))
     expected_output = [{
         'id':
         'doi:10.17117/na.2015.08.1067 meta:br/0601',
         'title':
         '',
         'author':
         '',
         'pub_date':
         '',
         'venue':
         'The Korean Journal Of Food And Nutrition [issn:1225-4339 meta:br/0603]',
         'volume':
         '26',
         'issue':
         '',
         'page':
         '',
         'type':
         'journal article',
         'publisher':
         'Consulting Company Ucom [crossref:6623 meta:ra/0601]',
         'editor':
         'Naimi, Elmehdi [orcid:0000-0002-4126-8519 meta:ra/0602]'
     }, {
         'id': 'issn:1524-4539 issn:0009-7322 meta:br/0602',
         'title': 'Circulation',
         'author': '',
         'pub_date': '',
         'venue': '',
         'volume': '',
         'issue': '',
         'page': '',
         'type': 'journal',
         'publisher': '',
         'editor': ''
     }, {
         'id':
         'doi:10.9799/ksfan.2012.25.1.069 meta:br/0605',
         'title':
         'Nonthermal Sterilization And Shelf-life Extension Of Seafood Products By Intense Pulsed Light Treatment',
         'author':
         'Cheigh, Chan-Ick [orcid:0000-0003-2542-5788 meta:ra/0603]; Mun, Ji-Hye [meta:ra/0604]; Chung, Myong-Soo [meta:ra/0605]',
         'pub_date':
         '2012-03-31',
         'venue':
         'The Korean Journal Of Food And Nutrition [issn:1225-4339 meta:br/0603]',
         'volume':
         '25',
         'issue':
         '1',
         'page':
         '69-76',
         'type':
         'journal article',
         'publisher':
         'The Korean Society Of Food And Nutrition [crossref:4768 meta:ra/0606]',
         'editor':
         'Chung, Myong-Soo [orcid:0000-0002-9666-2513 meta:ra/0607]'
     }, {
         'id': 'doi:10.9799/ksfan.2012.25.1.077 meta:br/0606',
         'title': 'Properties Of Immature Green Cherry Tomato Pickles',
         'author':
         'Koh, Jong-Ho [meta:ra/0608]; Shin, Hae-Hun [meta:ra/0609]; Kim, Young-Shik [orcid:0000-0001-5673-6314 meta:ra/06010]; Kook, Moo-Chang [meta:ra/06011]',
         'pub_date': '2012-03-31',
         'venue':
         'The Korean Journal Of Food And Nutrition [issn:1225-4339 meta:br/0603]',
         'volume': '',
         'issue': '2',
         'page': '77-82',
         'type': 'journal article',
         'publisher':
         'The Korean Society Of Food And Nutrition [crossref:4768 meta:ra/0606]',
         'editor': ''
     }]
     output = sorted(sorted(d.items()) for d in output)
     expected_output = sorted(sorted(d.items()) for d in expected_output)
     shutil.rmtree(output_folder)
     self.assertEqual(output, expected_output)
Example #9
 def test_get_data(self):
     filepath = os.path.join(BASE_DIR, 'long_field.csv')
     data = get_data(filepath)
     field_size = sys.getsizeof(data[0]['author'])
     self.assertEqual(field_size, 137622)