def get_article_xml(article_file, tag_path_elements=None):
    """
    Parse a local article file and return the nodes at an xpath location.

    If parsing `article_file` raises OSError, exactly one alternative path is
    derived from it and parsed instead:

    * a value accepted by `validate_doi` is converted with `doi_to_path`;
    * a name ending in 'xml' or 'XML' has the case of those three letters
      flipped;
    * any other name gets '.xml' appended (or just 'xml' when it already
      ends with a dot).

    An error raised by the second parse attempt is not caught here.

    Defaults to reading the tree location for uncorrected proofs/versions of
    record.

    :param article_file: the xml file for a single article (or a DOI)
    :param tag_path_elements: xpath location in the XML tree of the article
        file, as a sequence of steps that are joined with '/'
    :return: content of article file at that xpath location
    """
    if tag_path_elements is None:
        tag_path_elements = ('/',
                             'article',
                             'front',
                             'article-meta',
                             'custom-meta-group',
                             'custom-meta',
                             'meta-value')

    try:
        article_tree = et.parse(article_file)
    except OSError:
        if validate_doi(article_file):
            article_file = doi_to_path(article_file)
        elif article_file.endswith('xml'):
            # Any name ending in 'nxml' also ends in 'xml', so it is handled
            # by this branch as well; no separate 'nxml' case is needed.
            article_file = article_file[:-3] + 'XML'
        elif article_file.endswith('XML'):
            article_file = article_file[:-3] + 'xml'
        elif not article_file.endswith('.'):
            article_file = article_file + '.xml'
        else:
            article_file = article_file + 'xml'
        # Single retry with the derived path.
        article_tree = et.parse(article_file)

    articleXML = article_tree.getroot()
    # With the default elements this yields '//article/front/...'.
    tag_location = '/'.join(tag_path_elements)
    return articleXML.xpath(tag_location)
def doi(self, d):
    """
    Store a new DOI on the article after checking its format.

    The value is checked with `validate_doi` before anything on the object
    is touched. Memoized attributes are reset first, then the DOI is stored.

    :param d: DOI to assign to this article
    :raises Exception: if `validate_doi(d)` returns False
    """
    is_valid = validate_doi(d)
    if is_valid is False:
        raise Exception("Invalid format for PLOS DOI")

    # Values cached for the previous DOI are cleared before the new one is set.
    self.reset_memoized_attrs()
    self._doi = d
def validate_corpus(corpusdir=corpusdir):
    """
    Validate DOIs, their URLs, and the local article files, in that order.

    Checks every DOI returned by `get_all_plos_dois()`, the URL derived from
    each valid DOI, and every file listed in `corpusdir`.
    Stops at the first check that fails and prints what was wrong.

    :param corpusdir: directory containing the local article files
    :return: boolean of whether corpus passed validity checks
    """
    # check DOIs
    all_dois = get_all_plos_dois()
    valid_dois = [d for d in all_dois if validate_doi(d)]
    if set(all_dois) != set(valid_dois):
        print("Invalid DOIs: {}".format(set(all_dois) - set(valid_dois)))
        return False

    # check urls
    all_urls = [doi_to_url(d) for d in valid_dois]
    valid_urls = [u for u in all_urls if validate_url(u)]
    if set(all_urls) != set(valid_urls) or len(valid_urls) != len(valid_dois):
        print("Invalid URLs: {}".format(set(all_urls) - set(valid_urls)))
        return False

    # check files and filenames
    corpus_files = listdir_nohidden(corpusdir)
    if not corpus_files:
        print("Corpus directory empty. Re-download by running create_local_plos_corpus()")
        return False

    valid_names = [f for f in corpus_files if validate_file(f)]
    if len(valid_dois) != len(valid_names):
        # NOTE(review): this reports DOIs minus file names, which looks
        # unintended for an "invalid filenames" message -- confirm.
        print("Invalid filenames: {}".format(set(valid_dois) - set(valid_names)))
        return False

    existing = [f for f in valid_names if os.path.isfile(f)]
    if set(valid_names) == set(existing):
        return True

    not_files = set(valid_names) - set(existing)
    if len(not_files) > max_invalid_files_to_print:
        print("Too many invalid files to print: {}".format(len(not_files)))
    else:
        print("Invalid files: {}".format(not_files))
    return False
def doi_to_path(doi, directory=corpusdir):
    """
    For a given PLOS DOI, return the path to that local article file.

    A valid DOI that starts with `annotation_doi` maps to
    'plos.correction.' + <last '/'-separated segment of the DOI> +
    `suffix_lower`. Any other valid DOI maps to the DOI with `prefix`
    removed, plus `suffix_lower`. Both are joined onto `directory`.
    An input that is not a valid DOI but passes `validate_filename` is
    returned unchanged.

    Example (as documented for the default corpus directory):
    doi_to_path('10.1371/journal.pone.1000001') = 'allofplos_xml/journal.pone.1000001.xml'

    :param doi: full unique identifier for a PLOS article
    :param directory: defaults to corpusdir, containing article files
    :return: relative path to local XML file
    """
    if doi.startswith(annotation_doi) and validate_doi(doi):
        article_file = os.path.join(
            directory,
            "plos.correction." + doi.split('/')[-1] + suffix_lower)
    elif validate_doi(doi):
        # Remove the prefix as a whole substring. str.lstrip(prefix) treats
        # its argument as a set of characters and would keep stripping any
        # leading character that happens to occur in the prefix.
        if doi.startswith(prefix):
            article_name = doi[len(prefix):]
        else:
            article_name = doi
        article_file = os.path.join(directory, article_name + suffix_lower)
    # NOTE: The following check is weird, a DOI should never validate as a file name.
    elif validate_filename(doi):
        article_file = doi
    # NOTE(review): if no branch matched, article_file was never bound and this
    # return raises UnboundLocalError; kept so callers see the same exception.
    return article_file
def check_if_doi_resolves(doi, plos_valid=True):
    """
    Report whether a DOI resolves through dx.doi.org to the same DOI.

    When `plos_valid` is true the DOI is first checked with `validate_doi`.
    If the dx.doi.org link works, the citation metadata is requested as
    CSL JSON and its 'DOI' field is compared with `doi`.

    :param doi: DOI to look up
    :param plos_valid: whether to require a valid PLOS DOI structure first
    :return: "Not valid PLOS DOI structure" if the structure check fails,
        'works' if the metadata DOI equals `doi`, the metadata DOI if it
        differs, or "doesn't work" if the link check fails
    """
    if plos_valid and validate_doi(doi) is False:
        return "Not valid PLOS DOI structure"

    url = "http://dx.doi.org/" + doi
    if not check_if_link_works(url):
        return "doesn't work"

    # Ask the resolver for citation metadata rather than the landing page.
    response = requests.get(
        url, headers={"accept": "application/vnd.citationstyles.csl+json"})
    resolved_doi = response.json()['DOI']
    return 'works' if resolved_doi == doi else resolved_doi
def filename_to_doi(filename):
    """
    For a local XML file, transform its file name to the article's DOI.

    A valid file name containing `correction` maps to
    `prefix` + 'annotation/' + <third '.'-separated field of the base name>.
    Any other valid file name maps to `prefix` + the base name without its
    extension. An input that is not a valid file name but passes
    `validate_doi` is returned unchanged.

    Example (as documented for the default corpus):
    filename_to_doi('journal.pone.1000001.xml') = '10.1371/journal.pone.1000001'

    :param filename: file name or relative path of a local article XML file
    :return: full unique identifier for a PLOS article
    """
    if correction in filename and validate_filename(filename):
        # Split the base name, not the whole path: a '.' in a directory
        # component (e.g. './corpus/...') would otherwise shift the fields.
        base_name = os.path.basename(filename)
        article = 'annotation/' + base_name.split('.', 4)[2]
        doi = prefix + article
    elif validate_filename(filename):
        doi = prefix + os.path.splitext(os.path.basename(filename))[0]
    # NOTE: A filename should never validate as a DOI, so the next elif is wrong.
    elif validate_doi(filename):
        doi = filename
    # NOTE(review): if no branch matched, doi was never bound and this return
    # raises UnboundLocalError; kept so callers see the same exception.
    return doi
def check_if_doi_resolves(self, plos_valid=True):
    """
    Report whether this article's DOI resolves through dx.doi.org to itself.

    When `plos_valid` is true, `self.doi` is first checked with
    `validate_doi`. If `self.check_if_link_works()` returns True, the
    citation metadata is requested as CSL JSON and its 'DOI' field is
    compared with `self.doi`.

    :param plos_valid: whether to require a valid PLOS DOI structure first
    :return: "Not valid PLOS DOI structure" if the structure check fails,
        "works" if the metadata DOI equals self.doi, the metadata DOI if it
        differs, or "doesn't work" if the link check does not return True
    """
    if plos_valid and validate_doi(self.doi) is False:
        return "Not valid PLOS DOI structure"

    url = "http://dx.doi.org/" + self.doi
    # Only an exact True counts as a working link.
    if self.check_if_link_works() is not True:
        return "doesn't work"

    # Ask the resolver for citation metadata rather than the landing page.
    response = requests.get(
        url, headers={"accept": "application/vnd.citationstyles.csl+json"})
    resolved_doi = response.json()['DOI']
    if resolved_doi == self.doi:
        return "works"
    return resolved_doi