Example #2
 def test_normalize_id_with_cache(self):
     identifiers = ['doi:10.1123/ijatt']
     output_data = list()
     csv_manager = CSVManager()
     csv_manager.data = {'10.1123/ijatt.2015-0070': {'v'}}
     for id in identifiers:
         output_data.append(
             Cleaner(id).normalize_id(valid_dois_cache=csv_manager))
     expected_data = [None]
     expected_cache = {
         '10.1123/ijatt.2015-0070': {'v'},
         '10.1123/ijatt': {'i'}
     }
     output = (csv_manager.data, output_data)
     expected_output = (expected_cache, expected_data)
     self.assertEqual(output, expected_output)
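 # Note on the cache convention above: judging by the expected cache, a value
 # of {'v'} marks an identifier already validated, while {'i'} marks one
 # already found invalid, so later look-ups can skip revalidation.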
 def test_get_agents_strings_list_overlapping_surnames(self):
     # The surname of one author is included in the surname of another.
     authors_list = [
         {
             "given": "Puvaneswari",
             "family": "Paravamsivam",
             "sequence": "first",
             "affiliation": []
         },
         {
             "given": "Chua Kek",
             "family": "Heng",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Sri Nurestri Abdul",
             "family": "Malek",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Vikineswary",
             "family": "Sabaratnam",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Ravishankar Ram",
             "family": "M",
             "sequence": "additional",
             "affiliation": []
         },
         {
             "given": "Umah Rani",
             "family": "Kuppusamy",
             "sequence": "additional",
             "affiliation": []
         }
     ]
     crossref_processor = CrossrefProcessing(None, None)
     csv_manager = CSVManager()
     csv_manager.data = {'10.9799/ksfan.2012.25.1.105': {'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'}}
     crossref_processor.orcid_index = csv_manager
     authors_strings_list = crossref_processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', authors_list)
     expected_authors_list = ['Paravamsivam, Puvaneswari', 'Heng, Chua Kek', 'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]', 'Sabaratnam, Vikineswary', 'M, Ravishankar Ram', 'Kuppusamy, Umah Rani']
     self.assertEqual(authors_strings_list, expected_authors_list)
Example #5
    def __init__(self, valid_doi=None, use_api_service=True):
        if valid_doi is None:
            valid_doi = CSVManager()

        self.api = "https://doi.org/api/handles/"
        self.valid_doi = valid_doi
        self.use_api_service = use_api_service
        self.p = "doi:"
        super(DOIManager, self).__init__()
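
A minimal usage sketch of the constructor above. Judging by the callers in the examples below, normalise() returns the bare normalised DOI (no 'doi:' prefix), or None when the string cannot be recognised as a DOI:

valid_dois = CSVManager()
doi_manager = DOIManager(valid_doi=valid_dois, use_api_service=False)
print(doi_manager.normalise('10.1123/ijatt.2015-0070'))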
Example #6
 def test_normalize_id(self):
     identifiers = [
         'doi:10.1123/ijatt.2015-0070', 'doi:1',
         'orcid:0000-0003-0530-4305', 'orcid:0000-0000', 'issn:1479-6708',
         'issn:0000-0000', 'isbn:9783319403120', 'isbn:0000-0000'
     ]
     output = list()
     csv_manager = CSVManager()
     for id in identifiers:
         output.append(
             Cleaner(id).normalize_id(valid_dois_cache=csv_manager))
     expected_output = [
         'doi:10.1123/ijatt.2015-0070', None, 'orcid:0000-0003-0530-4305',
         None, 'issn:1479-6708', None, 'isbn:9783319403120', None
     ]
     self.assertEqual(output, expected_output)
Example #7
 def __init__(self, config: str):
     with open(config, encoding='utf-8') as file:
         settings = yaml.full_load(file)
     # Mandatory settings
     self.triplestore_url = settings['triplestore_url']
     self.input_csv_dir = normalize_path(settings['input_csv_dir'])
     self.base_output_dir = normalize_path(settings['base_output_dir'])
     self.resp_agent = settings['resp_agent']
     self.info_dir = os.path.join(self.base_output_dir, 'info_dir')
     self.output_csv_dir = os.path.join(self.base_output_dir, 'csv')
     self.output_rdf_dir = os.path.join(self.base_output_dir,
                                        f'rdf{os.sep}')
     self.indexes_dir = os.path.join(self.base_output_dir, 'indexes')
     self.cache_path = os.path.join(self.base_output_dir, 'cache.txt')
     self.errors_path = os.path.join(self.base_output_dir, 'errors.txt')
     # Optional settings
     self.base_iri = settings['base_iri']
     self.context_path = settings['context_path']
     self.dir_split_number = settings['dir_split_number']
     self.items_per_file = settings['items_per_file']
     self.default_dir = settings['default_dir']
     self.rdf_output_in_chunks = settings['rdf_output_in_chunks']
     self.source = settings['source']
     self.valid_dois_cache = CSVManager() if settings['use_doi_api_service'] else None
     self.workers_number = int(settings['workers_number'])
     supplier_prefix: str = settings['supplier_prefix']
     self.supplier_prefix = supplier_prefix[:-1] if supplier_prefix.endswith(
         '0') else supplier_prefix
     self.verbose = settings['verbose']
     # Time-Agnostic_library integration
     self.time_agnostic_library_config = os.path.join(
         os.path.dirname(config), 'time_agnostic_library_config.json')
     if not os.path.exists(self.time_agnostic_library_config):
         generate_config_file(
             config_path=self.time_agnostic_library_config,
             dataset_urls=[self.triplestore_url],
             dataset_dirs=list(),
             provenance_urls=settings['provenance_endpoints'],
             provenance_dirs=list(),
             blazegraph_full_text_search=settings[
                 'blazegraph_full_text_search'],
             graphdb_connector_name=settings['graphdb_connector_name'],
             cache_endpoint=settings['cache_endpoint'],
             cache_update_endpoint=settings['cache_update_endpoint'])
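
For reference, a minimal sketch of the YAML settings file this constructor reads. The keys are exactly those accessed above; every value is a placeholder, not a tested configuration:

# Mandatory settings
triplestore_url: http://localhost:9999/sparql
input_csv_dir: ./input_csv
base_output_dir: ./output
resp_agent: https://orcid.org/0000-0000-0000-0000
# Optional settings
base_iri: https://w3id.org/oc/meta/
context_path: null
dir_split_number: 10000
items_per_file: 1000
default_dir: _
rdf_output_in_chunks: false
source: null
use_doi_api_service: false
workers_number: 1
supplier_prefix: '060'
verbose: true
# Time-Agnostic_library integration
provenance_endpoints: []
blazegraph_full_text_search: false
graphdb_connector_name: null
cache_endpoint: null
cache_update_endpoint: null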
class Index_orcid_doi:
    def __init__(self,
                 output_path: str,
                 threshold: int = 10000,
                 low_memory: bool = False,
                 verbose: bool = False):
        self.file_counter = 0
        self.threshold = 10000 if not threshold else int(threshold)
        self.verbose = verbose
        if self.verbose:
            print("[INFO: CSVManager] Loading existing csv file")
        self.doimanager = DOIManager(use_api_service=False)
        self.csvstorage = CSVManager(output_path=output_path,
                                     line_threshold=threshold,
                                     low_memory=low_memory)
        # ORCID iDs already present in an existing CSV are cached, so the
        # corresponding summary files can be skipped on the next run.
        self.cache = set(
            el.split("[")[1][:-1].strip()
            for _, v in self.csvstorage.data.items() for el in v)

    def explorer(self, summaries_path: str) -> None:
        if self.verbose:
            print("[INFO: Index_orcid_doi] Counting files to process")
        files_to_process = [
            os.path.join(fold, file)
            for fold, _, files in os.walk(summaries_path) for file in files
            if file.replace('.xml', '') not in self.cache
        ]
        processed_files = len(self.cache)
        del self.cache
        if self.verbose:
            pbar = tqdm(total=len(files_to_process))
        for file in files_to_process:
            self.finder(file)
            self.file_counter += 1
            cur_file = self.file_counter + processed_files
            if self.file_counter % self.threshold == 0:
                self.csvstorage.dump_data(
                    f'{cur_file-self.threshold+1}-{cur_file}.csv')
            if self.verbose:
                pbar.update(1)
        cur_file = self.file_counter + processed_files
        self.csvstorage.dump_data(
            f'{cur_file + 1 - (cur_file % self.threshold)}-{cur_file}.csv')
        if self.verbose:
            pbar.close()

    def finder(self, file: str):
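        # The ORCID iD is the last 19 characters of the file name,
        # e.g. '0000-0001-6278-8559.xml'.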
        orcid = file.replace('.xml', '')[-19:]
        valid_doi = False
        if file.endswith('.xml'):
            with open(file, 'r', encoding='utf-8') as xml_file:
                xml_soup = BeautifulSoup(xml_file, 'xml')
                ids = xml_soup.findAll('common:external-id')
                if ids:
                    for el in ids:
                        id_type = el.find('common:external-id-type')
                        rel = el.find('common:external-id-relationship')
                        if id_type and rel:
                            if (id_type.get_text().lower() == 'doi'
                                    and rel.get_text().lower() == 'self'):
                                doi = el.find(
                                    'common:external-id-value').get_text()
                                doi = self.doimanager.normalise(doi)
                                if doi:
                                    g_name = xml_soup.find(
                                        'personal-details:given-names')
                                    f_name = xml_soup.find(
                                        'personal-details:family-name')
                                    if f_name:
                                        f_name = f_name.get_text()
                                        if g_name:
                                            g_name = g_name.get_text()
                                            name = f_name + ', ' + g_name
                                        else:
                                            name = f_name
                                        auto = name + ' [' + orcid + ']'
                                        valid_doi = True
                                        self.csvstorage.add_value(doi, auto)
        if not valid_doi:
            # Save file names where nothing was found, to skip them during the next run
            self.csvstorage.add_value('None', f'[{orcid}]')
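
A minimal usage sketch of the class above (both paths are hypothetical):

indexer = Index_orcid_doi(output_path='orcid_doi_index', verbose=True)
indexer.explorer('orcid_summaries')  # walk the ORCID summaries folder and dump CSV chunks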
class CrossrefProcessing:
    def __init__(self,
                 orcid_index: str = None,
                 doi_csv: str = None,
                 publishers_filepath: str = None):
        self.doi_set = CSVManager.load_csv_column_as_set(
            doi_csv, 'doi') if doi_csv else None
        self.publishers_mapping = self.load_publishers_mapping(
            publishers_filepath) if publishers_filepath else None
        self.orcid_index = CSVManager(orcid_index if orcid_index else None)

    def csv_creator(self, data: dict) -> list:
        data = data['items']
        output = list()
        for x in data:
            if 'DOI' not in x:
                continue
            if isinstance(x['DOI'], list):
                doi = DOIManager().normalise(str(x['DOI'][0]))
            else:
                doi = DOIManager().normalise(str(x['DOI']))
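            # Keep the item when no DOI filter was provided or its DOI is in the filter set.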
            if doi and (not self.doi_set or doi in self.doi_set):
                row = dict()

                # create empty row
                keys = [
                    'id', 'title', 'author', 'pub_date', 'venue', 'volume',
                    'issue', 'page', 'type', 'publisher', 'editor'
                ]
                for k in keys:
                    row[k] = ''

                if 'type' in x:
                    if x['type']:
                        row['type'] = x['type'].replace('-', ' ')

                # row['id']
                idlist = list()
                idlist.append(str('doi:' + doi))

                if 'ISBN' in x:
                    if row['type'] in {
                            'book', 'dissertation', 'edited book', 'monograph',
                            'reference book', 'report', 'standard'
                    }:
                        self.id_worker(x['ISBN'], idlist, self.isbn_worker)

                if 'ISSN' in x:
                    if row['type'] in {
                            'book series', 'book set', 'journal',
                            'proceedings series', 'series', 'standard series'
                    }:
                        self.id_worker(x['ISSN'], idlist, self.issn_worker)
                    elif row['type'] == 'report series':
                        br_id = True
                        if 'container-title' in x:
                            if x['container-title']:
                                br_id = False
                        if br_id:
                            self.id_worker(x['ISSN'], idlist, self.issn_worker)
                row['id'] = ' '.join(idlist)

                # row['title']
                if 'title' in x:
                    if x['title']:
                        if isinstance(x['title'], list):
                            text_title = x['title'][0]
                        else:
                            text_title = x['title']
                        soup = BeautifulSoup(text_title, 'html.parser')
                        title_soup = soup.get_text().replace('\n', '')
                        title = html.unescape(title_soup)
                        row['title'] = title

                # row['author']
                if 'author' in x:
                    autlist = self.get_agents_strings_list(doi, x['author'])
                    row['author'] = '; '.join(autlist)

                # row['pub_date']
                if 'issued' in x:
                    if x['issued']['date-parts'][0][0]:
                        row['pub_date'] = '-'.join(
                            [str(y) for y in x['issued']['date-parts'][0]])
                    else:
                        row['pub_date'] = ''

                # row['venue']
                row['venue'] = self.get_venue_name(x, row)

                if 'volume' in x:
                    row['volume'] = x['volume']
                if 'issue' in x:
                    row['issue'] = x['issue']
                if 'page' in x:
                    row['page'] = self.get_pages(x)

                row['publisher'] = self.get_publisher_name(doi, x)

                if 'editor' in x:
                    editlist = self.get_agents_strings_list(doi, x['editor'])
                    row['editor'] = '; '.join(editlist)
                output.append(row)
        return output

    def orcid_finder(self, doi: str) -> dict:
        found = dict()
        doi = doi.lower()
        people: List[str] = self.orcid_index.get_value(doi)
        if people:
            for person in people:
                orcid = re.search(orcid_pattern, person).group(0)
                name: str = person[:person.find(orcid) - 1]
                found[orcid] = name.strip().lower()
        return found

    def get_pages(self, item: dict) -> str:
        '''
        This function returns the page interval.

        :param item: the item's dictionary
        :type item: dict
        :returns: str -- The output is a string in the format 'START-END', for example, '583-584'. If there are no pages, the output is an empty string.
        '''
        roman_letters = {'I', 'V', 'X', 'L', 'C', 'D', 'M'}
        pages_list = re.split(pages_separator, item['page'])
        clean_pages_list = list()
        for page in pages_list:
            # e.g. 583-584
            if all(c.isdigit() for c in page):
                clean_pages_list.append(page)
            # e.g. G27. It is a born digital document. PeerJ uses this approach, where G27 identifies the whole document, since it has no pages.
            elif len(pages_list) == 1:
                clean_pages_list.append(page)
            # e.g. iv-vii. This syntax is used in the prefaces.
            elif all(c.upper() in roman_letters for c in page):
                clean_pages_list.append(page)
            # 583b-584. It is an error. The b must be removed.
            elif any(c.isdigit() for c in page):
                page_without_letters = ''.join(
                    [c for c in page if c.isdigit()])
                clean_pages_list.append(page_without_letters)
        pages = '-'.join(clean_pages_list)
        return pages

    def get_publisher_name(self, doi: str, item: dict) -> str:
        '''
        This function returns the publisher's name and id. If a mapping was provided,
        it is used to find the publisher's standardized name from its id or DOI prefix.

        :param doi: the item's DOI
        :type doi: str
        :param item: the item's dictionary
        :type item: dict
        :returns: str -- The output is a string in the format 'NAME [SCHEMA:ID]', for example, 'American Medical Association (AMA) [crossref:10]'. If the id does not exist, the output is only the name. Finally, if there is no publisher, the output is an empty string.
        '''
        data = {'publisher': '', 'member': None, 'prefix': doi.split('/')[0]}
        for field in {'publisher', 'member', 'prefix'}:
            if field in item:
                if item[field]:
                    data[field] = item[field]
        publisher = data['publisher']
        member = data['member']
        prefix = data['prefix']
        relevant_member = False
        if self.publishers_mapping and member:
            if member in self.publishers_mapping:
                relevant_member = True
        if self.publishers_mapping:
            if relevant_member:
                name = self.publishers_mapping[member]['name']
                name_and_id = f'{name} [crossref:{member}]'
            else:
                member_dict = next(
                    ({member: data}
                     for member, data in self.publishers_mapping.items()
                     if prefix in data['prefixes']), None)
                if member_dict:
                    member = list(member_dict.keys())[0]
                    name_and_id = f"{member_dict[member]['name']} [crossref:{member}]"
                else:
                    name_and_id = publisher
        else:
            name_and_id = f'{publisher} [crossref:{member}]' if member else publisher
        return name_and_id

    def get_venue_name(self, item: dict, row: dict) -> str:
        '''
        This method generates the venue's name, followed by its IDs in square brackets, separated by spaces.
        HTML tags are removed and HTML entities unescaped. In addition, any ISBN and ISSN are validated.
        Finally, square brackets in the venue name are replaced by round brackets to avoid conflicts with the identifiers' enclosures.

        :param item: the item's dictionary
        :type item: dict
        :param row: a CSV row
        :type row: dict
        :returns: str -- The output is a string in the format 'NAME [SCHEMA:ID]', for example, 'Nutrition & Food Science [issn:0034-6659]'. If the id does not exist, the output is only the name. Finally, if there is no venue, the output is an empty string.
        '''
        name_and_id = ''
        if 'container-title' in item:
            if item['container-title']:
                if isinstance(item['container-title'], list):
                    ventit = str(item['container-title'][0]).replace('\n', '')
                else:
                    ventit = str(item['container-title']).replace('\n', '')
                ven_soup = BeautifulSoup(ventit, 'html.parser')
                ventit = html.unescape(ven_soup.get_text())
                ambiguous_brackets = re.search(ids_inside_square_brackets,
                                               ventit)
                if ambiguous_brackets:
                    match = ambiguous_brackets.group(1)
                    open_bracket = ventit.find(match) - 1
                    close_bracket = ventit.find(match) + len(match)
                    ventit = ventit[:open_bracket] + '(' + ventit[
                        open_bracket + 1:]
                    ventit = ventit[:close_bracket] + ')' + ventit[
                        close_bracket + 1:]
                venidlist = list()
                if 'ISBN' in item:
                    if row['type'] in {
                            'book chapter', 'book part', 'book section',
                            'book track', 'reference entry'
                    }:
                        self.id_worker(item['ISBN'], venidlist,
                                       self.isbn_worker)

                if 'ISSN' in item:
                    if row['type'] in {
                            'book', 'data file', 'dataset', 'edited book',
                            'journal article', 'journal volume',
                            'journal issue', 'monograph', 'proceedings',
                            'peer review', 'reference book', 'reference entry',
                            'report'
                    }:
                        self.id_worker(item['ISSN'], venidlist,
                                       self.issn_worker)
                    elif row['type'] == 'report series':
                        if 'container-title' in item:
                            if item['container-title']:
                                self.id_worker(item['ISSN'], venidlist,
                                               self.issn_worker)
                if venidlist:
                    name_and_id = ventit + ' [' + ' '.join(venidlist) + ']'
                else:
                    name_and_id = ventit
        return name_and_id

    def get_agents_strings_list(self, doi: str,
                                agents_list: List[dict]) -> list:
        agents_strings_list = list()
        dict_orcid = None
        if not all('ORCID' in agent for agent in agents_list):
            dict_orcid = self.orcid_finder(doi)
        agents_list = [{
            k: Cleaner(v).remove_unwanted_characters()
            if k in {'family', 'given', 'name'} else v
            for k, v in agent_dict.items()
        } for agent_dict in agents_list]
        for agent in agents_list:
            f_name = None
            g_name = None
            agent_string = None
            if 'family' in agent:
                f_name = agent['family']
                if 'given' in agent:
                    g_name = agent['given']
                    agent_string = f_name + ', ' + g_name
                else:
                    agent_string = f_name + ', '
            elif 'name' in agent:
                agent_string = agent['name']
                f_name = agent_string.split()[-1] if ' ' in agent_string else None
            elif 'given' in agent and 'family' not in agent:
                agent_string = ', ' + agent['given']
            orcid = None
            if 'ORCID' in agent:
                if isinstance(agent['ORCID'], list):
                    orcid = str(agent['ORCID'][0])
                else:
                    orcid = str(agent['ORCID'])
                orcid = ORCIDManager().normalise(
                    orcid) if ORCIDManager().is_valid(orcid) else None
            elif dict_orcid and f_name:
                for ori in dict_orcid:
                    orc_n: List[str] = dict_orcid[ori].split(', ')
                    orc_f = orc_n[0].lower()
                    orc_g = orc_n[1] if len(orc_n) == 2 else None
                    if f_name.lower() in orc_f or orc_f in f_name.lower():
                        # If there are several authors with the same surname
                        if len([
                                person for person in agents_list
                                if 'family' in person if person['family']
                                if person['family'].lower() in orc_f.lower()
                                or orc_f.lower() in person['family'].lower()
                        ]) > 1 and g_name and orc_g:
                            # If there are several authors with the same surname and the same given names' initials
                            if len([
                                    person for person in agents_list
                                    if 'given' in person if person['given']
                                    if person['given'][0].lower() ==
                                    orc_g[0].lower()
                            ]) > 1:
                                # If there are no homonyms
                                if not len([
                                        person for person in agents_list
                                        if 'given' in person if person['given']
                                        if person['given'].lower() ==
                                        orc_g.lower()
                                ]) > 1:
                                    if orc_g.lower() == g_name.lower():
                                        orcid = ori
                            elif orc_g[0].lower() == g_name[0].lower():
                                orcid = ori
                        else:
                            orcid = ori
            if agent_string and orcid:
                agent_string += ' [' + 'orcid:' + str(orcid) + ']'
            if agent_string:
                agents_strings_list.append(agent_string)
        return agents_strings_list
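    # For instance, with the orcid index row used in the overlapping-surnames
    # test above, only 'Malek, Sri Nurestri Abdul' gains the suffix
    # ' [orcid:0000-0001-6278-8559]'; every other agent is returned as a plain
    # 'Family, Given' string.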

    @staticmethod
    def id_worker(field, idlist: list, func) -> None:
        if isinstance(field, list):
            for i in field:
                func(str(i), idlist)
        else:
            id = str(field)
            func(id, idlist)

    @staticmethod
    def issn_worker(issnid, idlist):
        if ISSNManager().is_valid(issnid):
            issnid = ISSNManager().normalise(issnid, include_prefix=True)
            idlist.append(issnid)

    @staticmethod
    def isbn_worker(isbnid, idlist):
        if ISBNManager().is_valid(isbnid):
            isbnid = ISBNManager().normalise(isbnid, include_prefix=True)
            idlist.append(isbnid)

    @staticmethod
    def load_publishers_mapping(publishers_filepath: str) -> dict:
        publishers_mapping: Dict[str, Dict[str, set]] = dict()
        with open(publishers_filepath, 'r', encoding='utf-8') as f:
            data = DictReader(f)
            for row in data:
                id = row['id']
                publishers_mapping.setdefault(id, dict())
                publishers_mapping[id]['name'] = row['name']
                publishers_mapping[id].setdefault('prefixes',
                                                  set()).add(row['prefix'])
        return publishers_mapping
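
A minimal end-to-end sketch of csv_creator. The input mirrors the Crossref API shape handled above (a dict with an 'items' list); the single item is hypothetical, and no author is included so no ORCID index is needed:

crossref_processor = CrossrefProcessing(None, None)
data = {'items': [{
    'DOI': '10.9799/ksfan.2012.25.1.105',
    'type': 'journal-article',
    'title': ['A Hypothetical Title'],
    'issued': {'date-parts': [[2012, 2]]},
    'volume': '25',
    'issue': '1',
    'page': '105-131'
}]}
for row in crossref_processor.csv_creator(data):
    print(row)  # {'id': 'doi:10.9799/ksfan.2012.25.1.105', 'title': 'A Hypothetical Title', ...}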