Example #1
def check_for_uncorrected_proofs(directory=newarticledir,
                                 text_list=uncorrected_proofs_text_list):
    """
    For a list of articles, check whether they are the 'uncorrected proof' type
    One of the checks on newly downloaded articles before they're added to corpusdir
    :param text_list: List of DOIs
    :param directory: Directory containing the article files
    :return: all articles that are uncorrected proofs, including from main article directory
    """

    # Read in uncorrected proofs from uncorrected_proofs_text_list txt file
    # If uncorrected_proofs_list txt file doesn't exist, build that list from scratch from main article directory
    uncorrected_proofs_list = get_uncorrected_proofs_list()

    # Check directory for uncorrected proofs
    # Append uncorrected proofs to running list
    articles = listdir_nohidden(directory)
    new_proofs = 0
    for article_file in articles:
        if check_if_uncorrected_proof(article_file):
            uncorrected_proofs_list.append(filename_to_doi(article_file))
            new_proofs += 1
    # Copy all uncorrected proofs from list to clean text file
    with open(text_list, 'w') as file:
        for item in sorted(set(uncorrected_proofs_list)):
            file.write("%s\n" % item)
    if uncorrected_proofs_list:
        print("{} uncorrected proofs found. {} total in list.".format(
            new_proofs, len(uncorrected_proofs_list)))
    else:
        print("No uncorrected proofs found in folder or in existing list.")
    return uncorrected_proofs_list
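
Every example on this page leans on filename_to_doi. As a rough illustration only (not the library's actual implementation, which handles more filename variants), the conversion for a typical PLOS file might look like this:

import os

PLOS_PREFIX = '10.1371/'  # PLOS DOI prefix

def filename_to_doi_sketch(filename):
    """Hypothetical sketch: 'journal.pone.0012345.xml' -> '10.1371/journal.pone.0012345'."""
    base = os.path.splitext(os.path.basename(filename))[0]
    return PLOS_PREFIX + base

print(filename_to_doi_sketch('journal.pone.0012345.xml'))
# 10.1371/journal.pone.0012345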
Example #2
def get_article_abstract(article_file):
    """
    For an individual article in the PLOS corpus, create a tuple of a set of metadata fields sbout that corpus.
    Make it small, medium, or large depending on number of fields desired.
    :param article_file: individual local PLOS XML article
    :return: plain-text string of content in abstract
    """
    abstract = get_article_xml(article_file,
                               tag_path_elements=[
                                   "/", "article", "front", "article-meta",
                                   "abstract"
                               ])
    try:
        abstract_text = et.tostring(abstract[0],
                                    encoding='unicode',
                                    method='text')
    except IndexError:
        if check_article_type(article_file) == 'research-article' and \
          get_plos_article_type(article_file) == 'Research Article':
            print('No abstract found for research article {}'.format(
                filename_to_doi(article_file)))

        abstract_text = ''

    # clean up text: strip leading/trailing whitespace, remove double spaces and blank lines
    abstract_text = abstract_text.strip().replace('  ', '')
    abstract_text = os.linesep.join(
        [s for s in abstract_text.splitlines() if s])

    return abstract_text
Example #3
def get_retracted_doi_list(article_list=None, directory=corpusdir):
    """
    Scans through articles in a directory to see if they are retraction notifications,
    scans articles that are that type to find DOIs of retracted articles
    :return: tuple of lists of DOIs for retractions articles, and retracted articles
    """
    retractions_doi_list = []
    retracted_doi_list = []
    if article_list is None:
        article_list = listdir_nohidden(directory)
    for article_file in article_list:
        if check_if_retraction_article(article_file):
            retractions_doi_list.append(filename_to_doi(article_file))
            # Look in those articles to find actual articles that are retracted
            retracted_doi = get_related_retraction_article(article_file)[0]
            retracted_doi_list.append(retracted_doi)
            # check linked DOI for accuracy
            if make_regex_bool(
                    full_doi_regex_match.search(retracted_doi)) is False:
                print("{} has incorrect linked DOI field: '{}'".format(
                    article_file, retracted_doi))
    if len(retractions_doi_list) == len(retracted_doi_list):
        print(len(retracted_doi_list), 'retracted articles found.')
    else:
        print(
            'Number of retraction articles and retracted articles are different: ',
            '{} vs. {}'.format(len(retractions_doi_list),
                               len(retracted_doi_list)))
    return retractions_doi_list, retracted_doi_list
Example #4
def get_uncorrected_proofs_list():
    """
    Loads the uncorrected proofs txt file.
    Failing that, creates new txt file from scratch using corpusdir.
    :return: list of DOIs of uncorrected proofs from text list
    """
    try:
        with open(uncorrected_proofs_text_list) as file:
            uncorrected_proofs_list = file.read().splitlines()
    except FileNotFoundError:
        print("Creating new text list of uncorrected proofs from scratch.")
        article_files = listdir_nohidden(corpusdir)
        uncorrected_proofs_list = []
        max_value = len(article_files)
        bar = progressbar.ProgressBar(redirect_stdout=True,
                                      max_value=max_value)
        for i, article_file in enumerate(article_files):
            bar.update(i + 1)
            if check_if_uncorrected_proof(article_file):
                uncorrected_proofs_list.append(filename_to_doi(article_file))
        bar.finish()
        print("Saving uncorrected proofs.")
        with open(uncorrected_proofs_text_list, 'w') as file:
            max_value = len(uncorrected_proofs_list)
            bar = progressbar.ProgressBar(redirect_stdout=True,
                                          max_value=max_value)
            for i, item in enumerate(sorted(uncorrected_proofs_list)):
                file.write("%s\n" % item)
                bar.update(i + 1)
            bar.finish()
    return uncorrected_proofs_list
Example #5
    def filename(self, value):
        """Sets an article object using a local filename.

        Converts a filename to DOI using an existing function.
        :param value: filename
        :type value: string
        """
        self.doi = filename_to_doi(value)
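
The setter above is shown without its class context. A minimal sketch of how it might sit inside the Article class (the @property pairing and the doi_to_path inverse are assumptions, not the library's confirmed layout):

class Article:
    def __init__(self, doi):
        self.doi = doi

    @property
    def filename(self):
        # Assumed inverse helper; maps the DOI back to a local path.
        return doi_to_path(self.doi)

    @filename.setter
    def filename(self, value):
        self.doi = filename_to_doi(value)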
Example #6
def download_updated_xml(article_file, tempdir=newarticledir, vor_check=False):
    """
    For an article file, compare local XML to remote XML
    If they're different, download new version of article
    :param article_file: the filename for a single article
    :param tempdir: directory where files are downloaded to
    :param vor_check: whether checking to see if uncorrected proof is updated
    :return: boolean for whether update was available & downloaded
    """
    doi = filename_to_doi(article_file)
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass
    url = URL_TMP.format(doi)
    articletree_remote = et.parse(url)
    articleXML_remote = et.tostring(articletree_remote,
                                    method='xml',
                                    encoding='unicode')
    if not article_file.endswith('.xml'):
        article_file += '.xml'
    try:
        articletree_local = et.parse(
            os.path.join(corpusdir, os.path.basename(article_file)))
    except OSError:
        article_file_alt = os.path.join(
            tempdir, os.path.basename(doi_to_path(article_file)))
        articletree_local = et.parse(article_file_alt)
    articleXML_local = et.tostring(articletree_local,
                                   method='xml',
                                   encoding='unicode')

    if articleXML_remote == articleXML_local:
        updated = False
        get_new = False
    else:
        updated = False  # remains False unless a newer version is actually downloaded
        get_new = True
        if vor_check:
            # make sure that update is to a VOR for uncorrected proof
            get_new = False
            path_parts = [
                '/', 'article', 'front', 'article-meta', 'custom-meta-group',
                'custom-meta', 'meta-value'
            ]
            r = articletree_remote.xpath("/".join(path_parts))
            for x in r:
                if x.text == 'vor-update-to-uncorrected-proof':
                    get_new = True
                    break
        if get_new:
            article_path = os.path.join(tempdir,
                                        os.path.basename(article_file))
            with open(article_path, 'w') as file:
                file.write(articleXML_remote)
            updated = True
    return updated
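
A hedged usage sketch, assuming doi_to_path with its default directory maps a DOI back to its local filename: re-check every known uncorrected proof for a published VOR.

for doi in get_uncorrected_proofs_list():
    if download_updated_xml(doi_to_path(doi), vor_check=True):
        print('VOR update downloaded for', doi)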
Example #7
    def test_file_conversions(self):
        """
        Tests that example filenames correctly transform to DOIs and URLs,
        both externally and on the PLOS internal network.
        """
        self.assertEqual(example_doi, filename_to_doi(example_file),
                         "{0} does not transform to {1}".format(example_file, example_doi))
        self.assertEqual(example_doi2, filename_to_doi(example_file2),
                         "{0} does not transform to {1}".format(example_file2, example_doi2))
        self.assertEqual(example_url, filename_to_url(example_file),
                         "{0} does not transform to {1}".format(example_file, example_url))
        self.assertEqual(example_url2, filename_to_url(example_file2),
                         "{0} does not transform to {1}".format(example_file2, example_url2))
        self.assertEqual(example_url_int, filename_to_url(example_file,
                         plos_network=True),
                         "{0} does not transform to {1}".format(example_file,
                         example_url_int))
        self.assertEqual(example_url2_int, filename_to_url(example_file2,
                         plos_network=True),
                         "{0} does not transform to {1}".format(example_file2,
                         example_url2_int))
Example #8
def get_article_dates(article_file, string_=False):
    """
    For an individual article, get all of its dates
    :param article_file: file path/DOI of the article
    :return: tuple of dict of date types mapped to datetime objects for that article, dict for date strings if wrong order
    """
    dates = {}

    tag_path_1 = ["/", "article", "front", "article-meta", "pub-date"]
    raw_xml_1 = get_article_xml(article_file=article_file,
                                tag_path_elements=tag_path_1)
    for element in raw_xml_1:
        pub_type = element.get('pub-type')
        try:
            date = parse_article_date(element)
        except ValueError:
            print('Error getting pubdate for {}'.format(article_file))
            date = ''
        dates[pub_type] = date

    tag_path_2 = ["/", "article", "front", "article-meta", "history"]
    raw_xml_2 = get_article_xml(article_file=article_file,
                                tag_path_elements=tag_path_2)
    for element in raw_xml_2:
        for part in element:
            date_type = part.get('date-type')
            try:
                date = parse_article_date(part)
            except ValueError:
                print(
                    'Error getting history dates for {}'.format(article_file))
                date = ''
            dates[date_type] = date
    if dates.get('received') and dates.get('accepted') and dates.get('epub'):
        if not dates['received'] <= dates['accepted'] <= dates['epub']:
            wrong_date_strings = {
                date_type: date.strftime('%Y-%m-%d')
                for date_type, date in dates.items()
            }
            wrong_date_strings['doi'] = filename_to_doi(article_file)
            # print('Dates not in correct order: {}'.format(wrong_date_strings))
        else:
            wrong_date_strings = ''
    else:
        wrong_date_strings = ''

    if string_:
        for key, value in dates.items():
            if value:
                dates[key] = value.strftime('%Y-%m-%d')

    return dates, wrong_date_strings
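
A quick usage sketch with a hypothetical filename, showing both return values:

dates, wrong = get_article_dates('journal.pone.0012345.xml', string_=True)
print(dates.get('received'), '->', dates.get('accepted'), '->', dates.get('epub'))
if wrong:
    print('Out-of-order dates for', wrong['doi'])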
Example #9
def get_random_list_of_dois(directory=corpusdir, count=100):
    '''
    Gets a list of random DOIs. Tries first to construct from local files in
    corpusdir, otherwise tries Solr DOI list as backup.
    :param directory: defaults to searching corpusdir
    :param count: specify how many DOIs are to be returned
    :return: a list of random DOIs for analysis
    '''
    try:
        article_list = listdir_nohidden(directory)
        sample_file_list = random.sample(article_list, count)
        sample_doi_list = [filename_to_doi(file) for file in sample_file_list]
    except OSError:
        doi_list = get_all_solr_dois()
        sample_doi_list = random.sample(doi_list, count)
    return sample_doi_list
Example #10
def update_corpus_metadata_csv(csv_file='allofplos_metadata.csv',
                               comparison_dois=None):
    """
    Incrementally update the metadata of PLOS articles in the csv file
    :param csv_file: csv file of data, defaults to 'allofplos_metadata.csv'
    :comparison_dois: list of DOIs to check whether their metadats is included
    return updated corpus metadata
    """
    # Step 1: get metadata and DOI list from existing csv file
    try:
        corpus_metadata = read_corpus_metadata_from_csv(csv_file)
        csv_doi_list = [row[0] for row in corpus_metadata]
    except FileNotFoundError:
        corpus_metadata = []
        csv_doi_list = []
    # Step 2: compare DOI list with master list
    if comparison_dois is None:
        comparison_dois = get_all_solr_dois()
    dois_needed_list = list(set(comparison_dois) - set(csv_doi_list))
    # Step 3: compare to local file list
    local_doi_list = [
        filename_to_doi(article_file)
        for article_file in listdir_nohidden(corpusdir)
    ]
    files_needed_list = list(set(dois_needed_list) - set(local_doi_list))
    if files_needed_list:
        print(
            'Local corpus must be updated before .csv metadata can be updated.\nUpdating local corpus now'
        )
        download_check_and_move(files_needed_list,
                                uncorrected_proofs_text_list,
                                tempdir=newarticledir,
                                destination=corpusdir)

    # Step 4: append new data to existing list
    new_corpus_metadata, wrong_dates = get_corpus_metadata(
        article_list=dois_needed_list)
    corpus_metadata.extend(new_corpus_metadata)
    # Step 5: write new dataset to .csv
    corpus_metadata_to_csv(corpus_metadata=corpus_metadata,
                           csv_file='allofplos_metadata_updated.csv')
    return corpus_metadata
Example #11
def article_doi_sanity_check(directory=corpusdir,
                             article_list=None,
                             source='solr'):
    """
    For every article in a directory, make sure that the DOI field is both valid and matches
    the file name, if applicable. Prints invalid DOIs that don't match regex.
    :return: list of articles where the filename does not match the linked DOI
    """
    messed_up_articles = []
    if article_list is None:
        if source == 'PMC':
            article_list = listdir_nohidden(pmcdir, extension='.nxml')
        elif source == 'solr':
            article_list = listdir_nohidden(corpusdir)
    doifile_dict = {
        get_article_doi(article_file=article_file): article_file
        for article_file in article_list
    }
    doi_list = list(doifile_dict.keys())
    # check for PLOS regular regex
    bad_doi_list = [
        doi for doi in full_doi_filter(doi_list) if doi is not False
    ]
    # check for Currents regex if PMC
    if bad_doi_list:
        if directory == pmcdir or source == 'PMC':
            bad_doi_list = currents_doi_filter(bad_doi_list)
    for doi in bad_doi_list:
        print("{} has invalid DOI field: '{}'".format(doifile_dict[doi], doi))
    if directory == corpusdir or source == 'solr':
        messed_up_articles = [
            doifile_dict[doi] for doi in doi_list
            if filename_to_doi(doifile_dict[doi]) != doi
        ]
        if len(messed_up_articles) == 0:
            print('All article file names match DOIs.')
        else:
            print(len(messed_up_articles), 'article files have DOI errors.')
        return messed_up_articles
    return bad_doi_list
Example #12
def get_dois_needed_list(comparison_list=None, directory=corpusdir):
    """
    Takes result of query from get_all_solr_dois and compares to local article directory.
    :param comparison_list: Defaults to creating a full list of local article files.
    :param directory: An int value indicating the first row of results to return
    :return: A list of DOIs for articles that are not in the local article directory.
    """
    if comparison_list is None:
        comparison_list = get_all_solr_dois()

    # Transform local files to DOIs
    local_article_list = [
        filename_to_doi(article)
        for article in listdir_nohidden(directory, '.xml')
    ]

    dois_needed_list = list(set(comparison_list) - set(local_article_list))
    if dois_needed_list:
        print(len(dois_needed_list), "new articles to download.")
    else:
        print("No new articles found to add to Corpus folder.")
    return dois_needed_list
Example #13
def repo_download(dois, tempdir, ignore_existing=True, plos_network=False):
    """
    Downloads a list of articles by DOI from PLOS's content-repo (crepo) to a temporary directory
    Use in conjunction with get_dois_needed_list
    :param dois: Iterable with DOIs for articles to obtain
    :param tempdir: Temporary directory where files are copied to
    :param ignore_existing: Don't re-download to tempdir if already downloaded
    """
    # make temporary directory, if needed
    try:
        os.mkdir(tempdir)
    except FileExistsError:
        pass

    if ignore_existing:
        existing_articles = [
            filename_to_doi(file) for file in listdir_nohidden(tempdir)
        ]
        dois = set(dois) - set(existing_articles)

    max_value = len(dois)
    bar = progressbar.ProgressBar(redirect_stdout=True, max_value=max_value)
    for i, doi in enumerate(sorted(dois)):
        url = URL_TMP.format(doi)
        articleXML = et.parse(url)
        article_path = doi_to_path(doi, directory=tempdir)
        # create new local XML files
        if not ignore_existing or not os.path.isfile(article_path):
            with open(article_path, 'w') as file:
                file.write(
                    et.tostring(articleXML, method='xml', encoding='unicode'))
            if not plos_network:
                time.sleep(1)
        bar.update(i + 1)
    bar.finish()
    print(len(listdir_nohidden(tempdir)), "new articles downloaded.")
    logging.info(len(listdir_nohidden(tempdir)))
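
The docstring above suggests pairing repo_download with get_dois_needed_list (Example #12); a minimal sketch of that pipeline:

dois_needed = get_dois_needed_list()       # DOIs in Solr but not downloaded
repo_download(dois_needed, newarticledir)  # fetch the missing articles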
Example #14
    @classmethod
    def from_filename(cls, filename):
        """Initiate an article object using a local XML file."""
        return cls(filename_to_doi(filename))
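
Hedged usage, with a hypothetical filename:

article = Article.from_filename('journal.pone.0012345.xml')
print(article.doi)  # 10.1371/journal.pone.0012345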
Example #15
def get_article_metadata(article_file, size='small'):
    """
    For an individual article in the PLOS corpus, create a tuple of a set of metadata fields sbout that corpus.
    Make it small, medium, or large depending on number of fields desired.
    :param article_file: individual local PLOS XML article
    :param size: small, medium or large, aka how many fields to return for each article
    :return: tuple of metadata fields tuple, wrong_date_strings dict
    """
    doi = filename_to_doi(article_file)
    filename = os.path.basename(doi_to_path(article_file)).rstrip('.xml')
    title = get_article_title(article_file)
    journal = get_plos_journal(article_file)
    jats_article_type = check_article_type(article_file)
    plos_article_type = get_plos_article_type(article_file)
    dtd_version = get_article_dtd(article_file)
    dates, wrong_date_strings = get_article_dates(article_file, string_=True)
    pubdate = dates['epub']
    collection = dates.get('collection', '')
    received = dates.get('received', '')
    accepted = dates.get('accepted', '')
    counts = get_article_counts(article_file)
    fig_count = counts.get('fig-count', '')
    table_count = counts.get('table-count', '')
    page_count = counts.get('page-count', '')
    body_word_count = get_article_body_word_count(article_file)
    if jats_article_type == 'correction':
        related_article = get_related_article_doi(article_file,
                                                  corrected=True)[0]
    elif jats_article_type == 'retraction':
        related_article = get_related_retraction_article(article_file)[0]
    else:
        related_article = ''
    abstract = get_article_abstract(article_file)
    metadata = [
        doi, filename, title, journal, jats_article_type, plos_article_type,
        dtd_version, pubdate, received, accepted, collection, fig_count,
        table_count, page_count, body_word_count, related_article, abstract
    ]
    metadata = tuple(metadata)
    if len(metadata) == 17:
        return metadata, wrong_date_strings
    else:
        print('Error in {}: {} items'.format(article_file, len(metadata)))
        return False
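
Because the function returns False on a field-count mismatch, callers should check the result before unpacking. A sketch with a hypothetical filename:

result = get_article_metadata('journal.pone.0012345.xml')
if result:
    metadata, wrong_date_strings = result
    doi, filename, title = metadata[:3]
    print(doi, title)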
Example #16
def get_all_local_dois(corpusdir=corpusdir):
    """
    Converts every local article file in corpusdir to its DOI.
    :return: list of DOIs of all articles in the local corpus directory
    """
    local_dois = [
        filename_to_doi(article_file)
        for article_file in listdir_nohidden(corpusdir)
    ]
    return local_dois
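
A short sketch combining this helper with get_all_solr_dois (used in Examples #10 and #12) to see what is missing locally:

missing = set(get_all_solr_dois()) - set(get_all_local_dois())
print(len(missing), 'DOIs not yet downloaded')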