def get_plos_journal(article_file, caps_fixed=True):
    """
    For an individual PLOS article, get the journal it was published in.

    :param article_file: individual local PLOS XML article
    :param caps_fixed: whether to render the journal name correctly or as-is
    :return: PLOS journal at specified xpath location
    """
    try:
        journal = get_article_xml(
            article_file=article_file,
            tag_path_elements=["/", "article", "front", "journal-meta",
                               "journal-title-group", "journal-title"])
        journal = journal[0].text
    except IndexError:
        # Some articles lack <journal-title>; fall back to the NLM
        # abbreviation inside <journal-meta>.
        # BUG FIX: previously this queried a hard-coded file
        # ('allofplos_xml/journal.pone.0047704.xml') instead of the
        # article actually being processed.
        journal_meta = get_article_xml(
            article_file=article_file,
            tag_path_elements=["/", "article", "front", "journal-meta"])
        for journal_child in journal_meta[0]:
            # .get() instead of ['journal-id-type']: children without the
            # attribute no longer raise KeyError.
            if journal_child.attrib.get('journal-id-type') == 'nlm-ta':
                journal = journal_child.text
                break
    if caps_fixed:
        # Normalize the brand capitalization ('Plos'/'plos' -> 'PLOS').
        words = journal.split()
        if words and words[0].lower() == 'plos':
            words[0] = "PLOS"
        journal = ' '.join(words)
    return journal
def get_article_dates(article_file, string_=False):
    """
    For an individual article, get all of its dates.

    :param article_file: file path/DOI of the article
    :param string_: if True, convert the returned dates to 'YYYY-MM-DD' strings
    :return: tuple of dict of date types mapped to datetime objects for that
             article, dict of date strings if dates are in the wrong order
             ('' when the order is fine or cannot be checked)
    """
    dates = {}

    # Publication dates live in <pub-date> elements, keyed by pub-type.
    tag_path_1 = ["/", "article", "front", "article-meta", "pub-date"]
    raw_xml_1 = get_article_xml(article_file=article_file,
                                tag_path_elements=tag_path_1)
    for element in raw_xml_1:
        pub_type = element.get('pub-type')
        try:
            date = parse_article_date(element)
        except ValueError:
            print('Error getting pubdate for {}'.format(article_file))
            date = ''
        dates[pub_type] = date

    # Editorial history dates (received/accepted/...) live in <history>.
    tag_path_2 = ["/", "article", "front", "article-meta", "history"]
    raw_xml_2 = get_article_xml(article_file=article_file,
                                tag_path_elements=tag_path_2)
    for element in raw_xml_2:
        for part in element:
            date_type = part.get('date-type')
            try:
                date = parse_article_date(part)
            except ValueError:
                print(
                    'Error getting history dates for {}'.format(article_file))
                date = ''
            dates[date_type] = date

    # BUG FIX: the original condition was
    #   dates.get('received', '') and dates.get('accepted', '') in dates
    # which tested whether the *accepted datetime* was a key of the dict --
    # always False -- so the chronology check never ran. Validate only when
    # all three dates were parsed successfully.
    wrong_date_strings = ''
    if (dates.get('received', '') and dates.get('accepted', '')
            and dates.get('epub', '')):
        if not dates['received'] <= dates['accepted'] <= dates['epub']:
            wrong_date_strings = {
                date_type: date.strftime('%Y-%m-%d')
                for date_type, date in dates.items()
            }
            wrong_date_strings['doi'] = filename_to_doi(article_file)

    if string_:
        for key, value in dates.items():
            if value:
                dates[key] = value.strftime('%Y-%m-%d')
    return dates, wrong_date_strings
def get_article_abstract(article_file):
    """
    For an individual article in the PLOS corpus, get the plain text of its
    abstract.

    :param article_file: individual local PLOS XML article
    :return: plain-text string of content in abstract ('' when absent)
    """
    abstract = get_article_xml(article_file,
                               tag_path_elements=["/", "article", "front",
                                                  "article-meta", "abstract"])
    try:
        abstract_text = et.tostring(abstract[0], encoding='unicode',
                                    method='text')
    except IndexError:
        # Research articles are expected to carry an abstract; warn when
        # one is missing.
        if check_article_type(article_file) == 'research-article' and \
                get_plos_article_type(article_file) == 'Research Article':
            print('No abstract found for research article {}'.format(
                filename_to_doi(article_file)))
        # Assigned unconditionally so non-research articles without an
        # abstract cannot hit an UnboundLocalError below.
        abstract_text = ''
    # clean up text: rem white space, new line marks, blank lines
    # NOTE(review): replace(' ', '') strips *every* space from the abstract;
    # presumably the intent was to collapse extra whitespace -- confirm
    # against the upstream source before changing.
    abstract_text = abstract_text.strip().replace(' ', '')
    abstract_text = os.linesep.join(
        [s for s in abstract_text.splitlines() if s])
    return abstract_text
def get_article_doi(article_file):
    """
    For an individual PLOS article, extract its DOI from the <article-id>
    elements.

    :param article_file: individual local PLOS XML article
    :return: DOI string, or None if no DOI-typed article-id is found
    """
    raw_xml = get_article_xml(
        article_file=article_file,
        tag_path_elements=["/", "article", "front", "article-meta",
                           "article-id"])
    # BUG FIX: initialize so the function returns None instead of raising
    # NameError when no element carries a 'doi' attribute value.
    doi = None
    for element in raw_xml:
        # The original matched any attribute whose *value* is 'doi'
        # (i.e. pub-id-type="doi"); keep that behavior.
        if 'doi' in element.attrib.values():
            doi = element.text
            # BUG FIX: the original break only exited the inner attribute
            # loop, so a later element could silently overwrite the DOI.
            break
    return doi
def get_article_title(article_file):
    """
    For an individual PLOS article, get its title.

    :param article_file: individual local PLOS XML article
    :return: string of article title at specified xpath location
    """
    xpath = ["/", "article", "front", "article-meta", "title-group",
             "article-title"]
    matches = get_article_xml(article_file=article_file,
                              tag_path_elements=xpath)
    # Serialize the element as plain text, discarding inline markup.
    return et.tostring(matches[0], encoding='unicode', method='text')
def get_article_body_word_count(article_file):
    """
    For an article, get how many words are in the body.

    :param article_file: individual local PLOS XML article
    :return: count of words in the body of the PLOS article (0 if the body
             cannot be parsed)
    """
    body = get_article_xml(article_file,
                           tag_path_elements=["/", "article", "body"])
    try:
        body_text = et.tostring(body[0], encoding='unicode', method='text')
        # BUG FIX: split() on any whitespace run. The original
        # split(" ") under-counted words separated by newlines/tabs and
        # counted consecutive spaces as extra "words".
        body_word_count = len(body_text.split())
    except IndexError:
        print("Error parsing article body: {}".format(article_file))
        body_word_count = 0
    return body_word_count
def get_article_dtd(article_file):
    """
    Get the human-readable DTD tagset label for an article.

    For more information on these DTD tagsets, see
    https://jats.nlm.nih.gov/1.1d3/ and https://dtd.nlm.nih.gov/3.0/
    """
    # Known dtd-version values mapped to display labels; any other
    # version string passes through unchanged.
    labels = {'3.0': 'NLM 3.0', '1.1d3': 'JATS 1.1d3'}
    try:
        root = get_article_xml(article_file=article_file,
                               tag_path_elements=["/", "article"])
        version = root[0].attrib['dtd-version']
        dtd = labels.get(str(version), version)
    except KeyError:
        # <article> carried no dtd-version attribute.
        print('Error parsing DTD from', article_file)
        dtd = 'N/A'
    return dtd
def get_plos_article_type(article_file):
    """
    For an individual PLOS article, get its PLOS-specific article type
    (the text of the subject group whose subj-group-type is 'heading',
    e.g. 'Research Article').

    :param article_file: individual local PLOS XML article
    :return: heading subject text, or None when no heading subject exists
    """
    article_categories = get_article_xml(
        article_file=article_file,
        tag_path_elements=["/", "article", "front", "article-meta",
                           "article-categories"])
    # BUG FIX: initialize so articles with no 'heading' subject group return
    # None instead of raising NameError.
    PLOS_article_type = None
    # list(element) instead of the deprecated Element.getchildren()
    # (removed from xml.etree in Python 3.9).
    for subject in list(article_categories[0]):
        if subject.get('subj-group-type') == "heading":
            # Join all text inside the group's first child; if several
            # heading groups exist, the last one wins (original behavior).
            PLOS_article_type = ''.join(subject[0].itertext())
    return PLOS_article_type
def get_article_counts(article_file):
    """
    For a single article, return a dictionary of the several counts functions
    that are available (figures: fig-count, pages: page-count, tables:
    table-count)

    :param article_file: file path/DOI of the article
    :return: counts dictionary
    """
    counts = {}
    xpath = ["/", "article", "front", "article-meta", "counts"]
    for counts_element in get_article_xml(article_file=article_file,
                                          tag_path_elements=xpath):
        # Each child is e.g. <fig-count count="5"/>; key by tag name.
        for item in counts_element:
            counts[item.tag] = item.get('count')
    if len(counts) > 3:
        print(counts)
    return counts