def check_for_uncorrected_proofs(directory=newarticledir, text_list=uncorrected_proofs_text_list):
    """
    For a list of articles, check whether they are the 'uncorrected proof' type.
    One of the checks on newly downloaded articles before they're added to corpusdir.
    :param directory: directory containing the article files
    :param text_list: path to the text file that stores the running list of uncorrected-proof DOIs
    :return: list of DOIs of all uncorrected proofs, including those from the main article directory
    """
    # Read in uncorrected proofs from uncorrected_proofs_text_list txt file
    # If uncorrected_proofs_list txt file doesn't exist, build that list from scratch from main article directory
    uncorrected_proofs_list = get_uncorrected_proofs_list()

    # Check directory for uncorrected proofs
    # Append uncorrected proofs to running list
    articles = listdir_nohidden(directory)
    new_proofs = 0
    for article_file in articles:
        if check_if_uncorrected_proof(article_file):
            uncorrected_proofs_list.append(filename_to_doi(article_file))
            new_proofs += 1

    # Copy all uncorrected proofs from list to clean text file
    with open(text_list, 'w') as file:
        for item in sorted(set(uncorrected_proofs_list)):
            file.write("%s\n" % item)

    if uncorrected_proofs_list:
        print("{} uncorrected proofs found. {} total in list.".format(
            new_proofs, len(uncorrected_proofs_list)))
    else:
        print("No uncorrected proofs found in folder or in existing list.")
    return uncorrected_proofs_list
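# Illustrative usage sketch (added for clarity, not part of the original module):
# refreshes the running list of uncorrected proofs for the staging directory and
# reports its size. Assumes check_for_uncorrected_proofs and newarticledir are
# defined as above.
def _example_check_for_uncorrected_proofs():
    proofs = check_for_uncorrected_proofs(directory=newarticledir)
    print('{} DOIs currently flagged as uncorrected proofs.'.format(len(proofs)))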
def get_article_abstract(article_file):
    """
    For an individual article in the PLOS corpus, return the plain text of its abstract.
    :param article_file: individual local PLOS XML article
    :return: plain-text string of content in abstract
    """
    abstract = get_article_xml(article_file,
                               tag_path_elements=["/",
                                                  "article",
                                                  "front",
                                                  "article-meta",
                                                  "abstract"])
    try:
        abstract_text = et.tostring(abstract[0], encoding='unicode', method='text')
    except IndexError:
        # research articles are expected to have an abstract
        if check_article_type(article_file) == 'research-article' and \
                get_plos_article_type(article_file) == 'Research Article':
            print('No abstract found for research article {}'.format(
                filename_to_doi(article_file)))
        abstract_text = ''

    # clean up text: remove excess whitespace, newline marks, and blank lines
    abstract_text = abstract_text.strip().replace('  ', '')
    abstract_text = os.linesep.join([s for s in abstract_text.splitlines() if s])

    return abstract_text
def get_retracted_doi_list(article_list=None, directory=corpusdir):
    """
    Scans through articles in a directory to see if they are retraction notifications,
    then scans articles of that type to find the DOIs of the retracted articles.
    :return: tuple of lists of DOIs for retraction articles, and retracted articles
    """
    retractions_doi_list = []
    retracted_doi_list = []
    if article_list is None:
        article_list = listdir_nohidden(directory)
    for article_file in article_list:
        if check_if_retraction_article(article_file):
            retractions_doi_list.append(filename_to_doi(article_file))
            # Look in those articles to find the actual articles that are retracted
            retracted_doi = get_related_retraction_article(article_file)[0]
            retracted_doi_list.append(retracted_doi)
            # check linked DOI for accuracy
            if make_regex_bool(full_doi_regex_match.search(retracted_doi)) is False:
                print("{} has incorrect linked DOI field: '{}'".format(article_file, retracted_doi))
    if len(retractions_doi_list) == len(retracted_doi_list):
        print(len(retracted_doi_list), 'retracted articles found.')
    else:
        print('Number of retraction articles and retracted articles are different: ',
              '{} vs. {}'.format(len(retractions_doi_list), len(retracted_doi_list)))
    return retractions_doi_list, retracted_doi_list
def get_uncorrected_proofs_list():
    """
    Loads the uncorrected proofs txt file.
    Failing that, creates a new txt file from scratch using corpusdir.
    :return: list of DOIs of uncorrected proofs from text list
    """
    try:
        with open(uncorrected_proofs_text_list) as file:
            uncorrected_proofs_list = file.read().splitlines()
    except FileNotFoundError:
        print("Creating new text list of uncorrected proofs from scratch.")
        article_files = listdir_nohidden(corpusdir)
        uncorrected_proofs_list = []
        max_value = len(article_files)
        bar = progressbar.ProgressBar(redirect_stdout=True, max_value=max_value)
        for i, article_file in enumerate(article_files):
            bar.update(i + 1)
            if check_if_uncorrected_proof(article_file):
                uncorrected_proofs_list.append(filename_to_doi(article_file))
        bar.finish()
        print("Saving uncorrected proofs.")
        with open(uncorrected_proofs_text_list, 'w') as file:
            max_value = len(uncorrected_proofs_list)
            bar = progressbar.ProgressBar(redirect_stdout=True, max_value=max_value)
            for i, item in enumerate(sorted(uncorrected_proofs_list)):
                file.write("%s\n" % item)
                bar.update(i + 1)
            bar.finish()
    return uncorrected_proofs_list
def filename(self, value):
    """Sets an article object using a local filename.

    Converts a filename to DOI using an existing function.
    :param value: filename
    :type value: string
    """
    self.doi = filename_to_doi(value)
def download_updated_xml(article_file, tempdir=newarticledir, vor_check=False):
    """
    For an article file, compare local XML to remote XML.
    If they're different, download the new version of the article.
    :param article_file: the filename for a single article
    :param tempdir: directory where files are downloaded to
    :param vor_check: whether checking to see if uncorrected proof is updated
    :return: boolean for whether update was available & downloaded
    """
    doi = filename_to_doi(article_file)
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    url = URL_TMP.format(doi)
    articletree_remote = et.parse(url)
    articleXML_remote = et.tostring(articletree_remote, method='xml', encoding='unicode')

    if not article_file.endswith('.xml'):
        article_file += '.xml'
    try:
        articletree_local = et.parse(os.path.join(corpusdir, os.path.basename(article_file)))
    except OSError:
        article_file_alt = os.path.join(tempdir, os.path.basename(doi_to_path(article_file)))
        articletree_local = et.parse(article_file_alt)
    articleXML_local = et.tostring(articletree_local, method='xml', encoding='unicode')

    # default to no update so the return value is defined on every path
    updated = False
    if articleXML_remote == articleXML_local:
        get_new = False
    else:
        get_new = True
        if vor_check:
            # make sure that update is to a VOR for uncorrected proof
            get_new = False
            path_parts = ['/', 'article', 'front', 'article-meta', 'custom-meta-group',
                          'custom-meta', 'meta-value']
            r = articletree_remote.xpath("/".join(path_parts))
            for x in r:
                if x.text == 'vor-update-to-uncorrected-proof':
                    get_new = True
                    break
        if get_new:
            article_path = os.path.join(tempdir, os.path.basename(article_file))
            with open(article_path, 'w') as file:
                file.write(articleXML_remote)
            updated = True
    return updated
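# Illustrative usage sketch (added for clarity, not part of the original module):
# checks whether a locally stored uncorrected proof now has a Version of Record
# available remotely. The example filename is hypothetical.
def _example_download_updated_xml():
    example_proof = 'journal.pone.0000001.xml'  # hypothetical local article file
    updated = download_updated_xml(example_proof, tempdir=newarticledir, vor_check=True)
    if updated:
        print('VOR update downloaded for {}.'.format(example_proof))
    else:
        print('No VOR update available yet for {}.'.format(example_proof))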
def test_file_conversions(self):
    """
    Tests that the filename-to-DOI and filename-to-URL conversion functions
    return the expected values for the example articles.
    """
    self.assertEqual(example_doi, filename_to_doi(example_file),
                     "{0} does not transform to {1}".format(example_file, example_doi))
    self.assertEqual(example_doi2, filename_to_doi(example_file2),
                     "{0} does not transform to {1}".format(example_file2, example_doi2))
    self.assertEqual(example_url, filename_to_url(example_file),
                     "{0} does not transform to {1}".format(example_file, example_url))
    self.assertEqual(example_url2, filename_to_url(example_file2),
                     "{0} does not transform to {1}".format(example_file2, example_url2))
    self.assertEqual(example_url_int, filename_to_url(example_file, plos_network=True),
                     "{0} does not transform to {1}".format(example_file, example_url_int))
    self.assertEqual(example_url2_int, filename_to_url(example_file2, plos_network=True),
                     "{0} does not transform to {1}".format(example_file2, example_url2_int))
def get_article_dates(article_file, string_=False):
    """
    For an individual article, get all of its dates.
    :param article_file: file path/DOI of the article
    :return: tuple of dict of date types mapped to datetime objects for that article,
             and a dict of date strings if the dates are in the wrong order
    """
    dates = {}

    tag_path_1 = ["/", "article", "front", "article-meta", "pub-date"]
    raw_xml_1 = get_article_xml(article_file=article_file,
                                tag_path_elements=tag_path_1)
    for element in raw_xml_1:
        pub_type = element.get('pub-type')
        try:
            date = parse_article_date(element)
        except ValueError:
            print('Error getting pubdate for {}'.format(article_file))
            date = ''
        dates[pub_type] = date

    tag_path_2 = ["/", "article", "front", "article-meta", "history"]
    raw_xml_2 = get_article_xml(article_file=article_file,
                                tag_path_elements=tag_path_2)
    for element in raw_xml_2:
        for part in element:
            date_type = part.get('date-type')
            try:
                date = parse_article_date(part)
            except ValueError:
                print('Error getting history dates for {}'.format(article_file))
                date = ''
            dates[date_type] = date

    # if both history dates are present, check that the dates are in the expected order
    if dates.get('received', '') and dates.get('accepted', ''):
        if not dates['received'] <= dates['accepted'] <= dates['epub']:
            wrong_date_strings = {date_type: date.strftime('%Y-%m-%d')
                                  for date_type, date in dates.items()}
            wrong_date_strings['doi'] = filename_to_doi(article_file)
            # print('Dates not in correct order: {}'.format(wrong_date_strings))
        else:
            wrong_date_strings = ''
    else:
        wrong_date_strings = ''

    if string_:
        for key, value in dates.items():
            if value:
                dates[key] = value.strftime('%Y-%m-%d')

    return dates, wrong_date_strings
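# Illustrative usage sketch (added for clarity, not part of the original module):
# retrieves the date dictionary for one article and reports any out-of-order
# dates. The example filename is hypothetical.
def _example_get_article_dates():
    example_article = 'journal.pone.0000001.xml'  # hypothetical local article file
    dates, wrong_date_strings = get_article_dates(example_article, string_=True)
    print('epub date:', dates.get('epub', 'not found'))
    if wrong_date_strings:
        print('Dates out of order for:', wrong_date_strings.get('doi'))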
def get_random_list_of_dois(directory=corpusdir, count=100):
    '''
    Gets a list of random DOIs. Tries first to construct from local files in
    corpusdir, otherwise tries Solr DOI list as backup.
    :param directory: defaults to searching corpusdir
    :param count: specify how many DOIs are to be returned
    :return: a list of random DOIs for analysis
    '''
    try:
        article_list = listdir_nohidden(directory)
        sample_file_list = random.sample(article_list, count)
        sample_doi_list = [filename_to_doi(file) for file in sample_file_list]
    except OSError:
        doi_list = get_all_solr_dois()
        sample_doi_list = random.sample(doi_list, count)
    return sample_doi_list
def update_corpus_metadata_csv(csv_file='allofplos_metadata.csv', comparison_dois=None):
    """
    Incrementally update the metadata of PLOS articles in the csv file.
    :param csv_file: csv file of data, defaults to 'allofplos_metadata.csv'
    :param comparison_dois: list of DOIs to check whether their metadata is included
    :return: updated corpus metadata
    """
    # Step 1: get metadata and DOI list from existing csv file
    try:
        corpus_metadata = read_corpus_metadata_from_csv(csv_file)
        csv_doi_list = [row[0] for row in corpus_metadata]
    except FileNotFoundError:
        corpus_metadata = []
        csv_doi_list = []
    # Step 2: compare DOI list with master list
    if comparison_dois is None:
        comparison_dois = get_all_solr_dois()
    dois_needed_list = list(set(comparison_dois) - set(csv_doi_list))
    # Step 3: compare to local file list
    local_doi_list = [filename_to_doi(article_file)
                      for article_file in listdir_nohidden(corpusdir)]
    files_needed_list = list(set(dois_needed_list) - set(local_doi_list))
    if files_needed_list:
        print('Local corpus must be updated before .csv metadata can be updated.\n'
              'Updating local corpus now')
        download_check_and_move(files_needed_list,
                                uncorrected_proofs_text_list,
                                tempdir=newarticledir,
                                destination=corpusdir)
    # Step 4: append new data to existing list
    new_corpus_metadata, wrong_dates = get_corpus_metadata(article_list=dois_needed_list)
    corpus_metadata.extend(new_corpus_metadata)
    # Step 5: write new dataset to .csv
    corpus_metadata_to_csv(corpus_metadata=corpus_metadata,
                           csv_file='allofplos_metadata_updated.csv')
    return corpus_metadata
def article_doi_sanity_check(directory=corpusdir, article_list=None, source='solr'):
    """
    For every article in a directory, make sure that the DOI field is both valid and
    matches the file name, if applicable. Prints invalid DOIs that don't match the regex.
    :return: list of articles where the filename does not match the linked DOI
    """
    messed_up_articles = []
    if article_list is None:
        if source == 'PMC':
            article_list = listdir_nohidden(pmcdir, extension='.nxml')
        elif source == 'solr':
            article_list = listdir_nohidden(corpusdir)
    doifile_dict = {get_article_doi(article_file=article_file): article_file
                    for article_file in article_list}
    doi_list = list(doifile_dict.keys())
    # check for PLOS regular regex
    bad_doi_list = [doi for doi in full_doi_filter(doi_list) if doi is not False]
    # check for Currents regex if PMC
    if bad_doi_list:
        if directory == pmcdir or source == 'PMC':
            bad_doi_list = currents_doi_filter(bad_doi_list)
    for doi in bad_doi_list:
        print("{} has invalid DOI field: '{}'".format(doifile_dict[doi], doi))
    if directory == corpusdir or source == 'solr':
        messed_up_articles = [doifile_dict[doi] for doi in doi_list
                              if filename_to_doi(doifile_dict[doi]) != doi]
        if len(messed_up_articles) == 0:
            print('All article file names match DOIs.')
        else:
            print(len(messed_up_articles), 'article files have DOI errors.')
        return messed_up_articles
    return bad_doi_list
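# Illustrative usage sketch (added for clarity, not part of the original module):
# runs the DOI sanity check over the main corpus directory and inspects any
# files whose names disagree with their DOI fields.
def _example_article_doi_sanity_check():
    mismatched_files = article_doi_sanity_check(directory=corpusdir, source='solr')
    for article_file in mismatched_files:
        print('Filename/DOI mismatch:', article_file)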
def get_dois_needed_list(comparison_list=None, directory=corpusdir):
    """
    Takes the result of a query from get_all_solr_dois and compares it to the local
    article directory.
    :param comparison_list: list of DOIs to compare against; defaults to the full list of DOIs from Solr
    :param directory: directory of local article files, defaults to corpusdir
    :return: A list of DOIs for articles that are not in the local article directory.
    """
    if comparison_list is None:
        comparison_list = get_all_solr_dois()

    # Transform local files to DOIs
    local_article_list = [filename_to_doi(article)
                          for article in listdir_nohidden(directory, '.xml')]

    dois_needed_list = list(set(comparison_list) - set(local_article_list))
    if dois_needed_list:
        print(len(dois_needed_list), "new articles to download.")
    else:
        print("No new articles found to add to Corpus folder.")
    return dois_needed_list
def repo_download(dois, tempdir, ignore_existing=True, plos_network=False):
    """
    Downloads a list of articles by DOI from PLOS's content-repo (crepo) to a temporary directory.
    Use in conjunction with get_dois_needed_list.
    :param dois: Iterable with DOIs for articles to obtain
    :param tempdir: Temporary directory where files are copied to
    :param ignore_existing: Don't re-download to tempdir if already downloaded
    :param plos_network: if True, skip the one-second pause between downloads
    """
    # make temporary directory, if needed
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    if ignore_existing:
        existing_articles = [filename_to_doi(file) for file in listdir_nohidden(tempdir)]
        dois = set(dois) - set(existing_articles)
    max_value = len(dois)
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=max_value)
    for i, doi in enumerate(sorted(dois)):
        url = URL_TMP.format(doi)
        articleXML = et.parse(url)
        article_path = doi_to_path(doi, directory=tempdir)
        # create new local XML files
        if ignore_existing is False or (ignore_existing and os.path.isfile(article_path) is False):
            with open(article_path, 'w') as file:
                file.write(et.tostring(articleXML, method='xml', encoding='unicode'))
            if not plos_network:
                time.sleep(1)
        bar.update(i + 1)
    bar.finish()
    print(len(listdir_nohidden(tempdir)), "new articles downloaded.")
    logging.info(len(listdir_nohidden(tempdir)))
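# Illustrative usage sketch (added for clarity, not part of the original module):
# the download pipeline implied by the docstrings above, pairing
# get_dois_needed_list with repo_download to fetch only the missing articles.
def _example_download_missing_articles():
    dois_needed = get_dois_needed_list(directory=corpusdir)
    if dois_needed:
        repo_download(dois_needed, tempdir=newarticledir, ignore_existing=True)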
def from_filename(cls, filename):
    """Initiate an article object using a local XML file."""
    return cls(filename_to_doi(filename))
def get_article_metadata(article_file, size='small'):
    """
    For an individual article in the PLOS corpus, create a tuple of a set of metadata fields
    about that article. Make it small, medium, or large depending on number of fields desired.
    :param article_file: individual local PLOS XML article
    :param size: small, medium or large, aka how many fields to return for each article
    :return: tuple of metadata fields tuple, wrong_date_strings dict
    """
    doi = filename_to_doi(article_file)
    filename = os.path.basename(doi_to_path(article_file)).rstrip('.xml')
    title = get_article_title(article_file)
    journal = get_plos_journal(article_file)
    jats_article_type = check_article_type(article_file)
    plos_article_type = get_plos_article_type(article_file)
    dtd_version = get_article_dtd(article_file)
    dates, wrong_date_strings = get_article_dates(article_file, string_=True)
    (pubdate, collection, received, accepted) = ('', '', '', '')
    pubdate = dates['epub']
    counts = get_article_counts(article_file)
    (fig_count, table_count, page_count) = ('', '', '')
    body_word_count = get_article_body_word_count(article_file)
    if jats_article_type == 'correction':
        related_article = get_related_article_doi(article_file, corrected=True)[0]
    elif jats_article_type == 'retraction':
        related_article = get_related_retraction_article(article_file)[0]
    else:
        related_article = ''
    abstract = get_article_abstract(article_file)
    try:
        collection = dates['collection']
    except KeyError:
        pass
    try:
        received = dates['received']
    except KeyError:
        pass
    try:
        accepted = dates['accepted']
    except KeyError:
        pass
    try:
        fig_count = counts['fig-count']
    except KeyError:
        pass
    try:
        table_count = counts['table-count']
    except KeyError:
        pass
    try:
        page_count = counts['page-count']
    except KeyError:
        pass
    metadata = [doi, filename, title, journal, jats_article_type, plos_article_type,
                dtd_version, pubdate, received, accepted, collection, fig_count,
                table_count, page_count, body_word_count, related_article, abstract]
    metadata = tuple(metadata)
    if len(metadata) == 17:
        return metadata, wrong_date_strings
    else:
        print('Error in {}: {} items'.format(article_file, len(metadata)))
        return False
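# Illustrative usage sketch (added for clarity, not part of the original module):
# pulls the metadata tuple for one article and unpacks a few fields by position,
# matching the 17-item order constructed above. The example filename is hypothetical.
def _example_get_article_metadata():
    example_article = 'journal.pone.0000001.xml'  # hypothetical local article file
    result = get_article_metadata(example_article)
    if result:
        metadata, wrong_date_strings = result
        doi, filename, title, journal = metadata[:4]
        print('{} ({}): {}'.format(doi, journal, title))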
def get_all_local_dois(corpusdir=corpusdir):
    local_dois = [filename_to_doi(article_file)
                  for article_file in listdir_nohidden(corpusdir)]
    return local_dois