Python MarkupFormatter Exemples, anthology.formatter.MarkupFormatter Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : add_dois.py Projet : sashank06/acl-anthology

def process_volume(anthology_volume):
    """Add DOIs to every paper of one volume and rewrite its collection XML.

    The collection file is located relative to the running script
    (``<script dir>/../data/xml/<collection_id>.xml``), parsed, updated
    in place through ``add_doi`` and written back with an XML declaration.
    Progress and errors are reported on stderr.

    Args:
        anthology_volume: full volume identifier; it is split into a
            collection ID and a volume ID by ``deconstruct_anthology_id``.

    Exits the process with status 1 when the volume is not present in
    the collection file.
    """
    collection_id, volume_id, _ = deconstruct_anthology_id(anthology_volume)

    print(f'Attempting to add DOIs for {anthology_volume}', file=sys.stderr)

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
    tree = ET.parse(xml_file)

    formatter = MarkupFormatter()

    volume = tree.getroot().find(f"./volume[@id='{volume_id}']")
    if volume is None:
        # Report the identifier that was requested; ``volume`` itself is
        # None on this branch and would only print "None".
        print(f'-> FATAL: volume {anthology_volume} not found in the Anthology', file=sys.stderr)
        sys.exit(1)

    volume_booktitle = volume.find('./meta/booktitle')
    volume_title = formatter.as_text(volume_booktitle)
    print(f'-> found existing volume "{volume_title}"', file=sys.stderr)

    # A volume without <frontmatter> contributes no nodes instead of
    # making chain() fail on None.
    # NOTE(review): iterating the <frontmatter> element yields its child
    # elements, not the element itself -- confirm this is what add_doi
    # expects.
    frontmatter = volume.find('frontmatter')
    frontmatter_nodes = frontmatter if frontmatter is not None else ()

    num_added = 0

    # Iterate through all papers
    for paper in chain(frontmatter_nodes, volume.findall('paper')):
        if add_doi(paper, collection_id, volume_id, force=args.force):
            num_added += 1
            # Pause for a second after each successful addition
            # (presumably to throttle lookups done by add_doi -- confirm).
            sleep(1)

    indent(tree.getroot())

    tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
    print(f'-> added {num_added} DOIs to the XML for collection {collection_id}', file=sys.stderr)

Exemple #2

0

Afficher le fichier

Fichier : generate_crossref_doi_metadata.py Projet : strubell/acl-anthology

def _parse_month_range(month):
    """Return the ``(start, end)`` ``MONTH_HASH`` values for a month string.

    ``month`` is either a single month or a range joined by a hyphen or
    an en dash; a single month is used as both start and end. Any lookup
    or split failure propagates to the caller.
    """
    parts = re.split("[-–]", month)
    start = MONTH_HASH[parts[0]]
    end = MONTH_HASH[parts[1]] if len(parts) > 1 else start
    return start, end


def _append_person_name(contribs, person, role, index):
    """Append a ``person_name`` element for ``person`` under ``contribs``.

    The first contributor (``index == 0``) gets sequence ``first``, all
    others ``additional``. ``first``/``last`` children of ``person`` become
    ``given_name``/``surname``; other children are ignored.
    """
    person_name = make_simple_element(
        "person_name",
        parent=contribs,
        attrib={
            "contributor_role": role,
            "sequence": "first" if index == 0 else "additional",
        },
    )
    for name_part in person:
        if name_part.tag == "first":
            make_simple_element("given_name",
                                parent=person_name,
                                text=name_part.text)
        elif name_part.tag == "last":
            make_simple_element("surname",
                                text=name_part.text,
                                parent=person_name)


def main(volumes):
    """Print a Crossref ``doi_batch`` (schema 4.4.1) document to stdout.

    For every volume ID in ``volumes`` (processed in sorted order) the
    collection XML is read from ``<script dir>/../data/xml/`` and turned
    into one ``conference`` entry holding the editors, event metadata,
    proceedings metadata and one ``conference_paper`` per ``<paper>``.

    Args:
        volumes: iterable of full volume IDs accepted by
            ``deconstruct_anthology_id``.

    Volumes that cannot be found are reported on stderr and skipped. A
    ``<month>`` value that cannot be parsed is fatal: the problem is
    reported on stderr and the process exits with status 1.
    """
    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation":
            "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)
    root = new_volume.getroot()

    ## Assemble head
    head = make_simple_element("head", parent=root)
    # Batch id and timestamp are both the current Unix time in seconds.
    make_simple_element("doi_batch_id",
                        text=str(int(time.time())),
                        parent=head)
    make_simple_element("timestamp",
                        text=str(int(time.time())),
                        parent=head)

    depositor = make_simple_element("depositor", parent=head)
    make_simple_element("depositor_name",
                        text=DEPOSITOR_NAME,
                        parent=depositor)
    make_simple_element("email_address",
                        text=EMAIL_ADDRESS,
                        parent=depositor)

    make_simple_element("registrant", text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element("body", parent=root)

    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                       "data", "xml", f"{collection_id}.xml")
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0

        # NOTE(review): url, booktitle, address and publisher are only
        # bound when <meta> has the matching child; otherwise a value from
        # a previously processed volume (or none at all) is used below.
        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                try:
                    start_month, end_month = _parse_month_range(month)
                except Exception:
                    # Covers single months as well: an unknown single month
                    # used to escape this handler as an unreported KeyError.
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                _append_person_name(contribs, tag, "chair", editor_index)
                editor_index += 1

        # Assemble Event Metadata
        em = make_simple_element("event_metadata", parent=c)
        make_simple_element("conference_name", parent=em, text=booktitle)
        make_simple_element("conference_location", parent=em, text=address)
        make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata",
                                 parent=c,
                                 attrib={"language": "en"})
        make_simple_element("proceedings_title", parent=pm, text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        make_simple_element("publisher_name", parent=p, text=publisher)
        make_simple_element("publisher_place", parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        make_simple_element("year", parent=pd, text=year)
        make_simple_element("noisbn",
                            parent=pm,
                            attrib={"reason": "simple_series"})

        # DOI assignation data
        dd = make_simple_element("doi_data", parent=pm)
        make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        make_simple_element("resource",
                            parent=dd,
                            text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            # The paper number is zero-padded to 2 digits for a 6-character
            # volume url and to 3 digits for a 5-character one; any other
            # length leaves the suffix empty.
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            elif len(url) == 5:
                aa_id = "{:03d}".format(int(paper.attrib["id"]))
            else:
                aa_id = ""

            cp = make_simple_element("conference_paper", parent=c)

            # contributors
            contribs = make_simple_element("contributors", parent=cp)
            for author_index, author in enumerate(paper.findall("./author")):
                _append_person_name(contribs, author, "author", author_index)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                make_simple_element("title",
                                    parent=o_titles,
                                    text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                page_parts = re.split("[-–]", pages.text)
                fp.text = page_parts[0]
                # only one page: it is both the first and the last page
                lp.text = page_parts[1] if len(page_parts) > 1 else pages.text

            # DOI assignation data
            dd = make_simple_element("doi_data", parent=cp)
            make_simple_element("doi",
                                parent=dd,
                                text=DOI_PREFIX + url + aa_id)
            make_simple_element("resource",
                                parent=dd,
                                text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))

Exemple #3

0

Afficher le fichier

def main(volumes):
    """Write a Crossref ``doi_batch`` (schema 4.4.1) XML document to stdout.

    Each volume ID in ``volumes`` is handled in sorted order: its
    collection file under ``<script dir>/../data/xml/`` is parsed and a
    ``conference`` element is emitted with the editors, the event and
    proceedings metadata, and a ``conference_paper`` for every ``<paper>``.

    Args:
        volumes: iterable of full volume IDs understood by
            ``deconstruct_anthology_id``.

    A volume missing from its collection file is reported on stderr and
    skipped. An unparseable ``<month>`` is reported on stderr and ends
    the process with status 1.
    """
    formatter = MarkupFormatter()

    ## Assemble container
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation':
            'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head
    head = make_simple_element('head', parent=new_volume.getroot())
    # Both the batch id and the timestamp are the Unix time in seconds.
    make_simple_element('doi_batch_id',
                        text=str(int(time.time())),
                        parent=head)
    make_simple_element('timestamp',
                        text=str(int(time.time())),
                        parent=head)

    depositor = make_simple_element('depositor', parent=head)
    make_simple_element('depositor_name',
                        text=DEPOSITOR_NAME,
                        parent=depositor)
    make_simple_element('email_address',
                        text=EMAIL_ADDRESS,
                        parent=depositor)

    make_simple_element('registrant', text=REGISTRANT, parent=head)

    ## Assemble body
    body = make_simple_element('body', parent=new_volume.getroot())

    year = ""
    start_month = ""
    end_month = ""

    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)

        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..',
                                       'data', 'xml', f'{collection_id}.xml')
        tree = etree.parse(collection_file)

        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)
        editor_index = 0

        # NOTE(review): url, booktitle, address and publisher are bound only
        # if <meta> contains the corresponding child element.
        meta = v.find('./meta')
        for tag in meta:
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                month = tag.text
                try:
                    month_parts = re.split('[-–]', month)
                    start_month = MONTH_HASH[month_parts[0]]
                    if len(month_parts) > 1:
                        end_month = MONTH_HASH[month_parts[1]]
                    else:  # only one month
                        end_month = start_month
                except Exception:
                    # Report the offending value instead of dying with a
                    # bare KeyError/TypeError traceback.
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr)
                    sys.exit(1)
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                # tag.text alone stops at the first child element, so a
                # title containing inline markup would be cut short.
                booktitle = formatter.as_text(tag)
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'chair',
                        'sequence':
                        'first' if editor_index == 0 else 'additional'
                    })
                editor_index += 1

                for name_part in tag:
                    if name_part.tag == 'first':
                        make_simple_element('given_name',
                                            parent=pn,
                                            text=name_part.text)
                    elif name_part.tag == 'last':
                        make_simple_element('surname',
                                            text=name_part.text,
                                            parent=pn)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        make_simple_element('conference_name', parent=em, text=booktitle)
        make_simple_element('conference_location', parent=em, text=address)
        make_simple_element('conference_date',
                            parent=em,
                            attrib={
                                'start_year': year,
                                'end_year': year,
                                'start_month': start_month,
                                'end_month': end_month
                            })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata',
                                 parent=c,
                                 attrib={'language': 'en'})
        make_simple_element('proceedings_title', parent=pm, text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        make_simple_element('publisher_name', parent=p, text=publisher)
        make_simple_element('publisher_place', parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        make_simple_element('year', parent=pd, text=year)
        make_simple_element('noisbn',
                            parent=pm,
                            attrib={'reason': 'simple_series'})

        # DOI assignation data
        dd = make_simple_element('doi_data', parent=pm)
        make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        make_simple_element('resource',
                            parent=dd,
                            text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data

            # TODO: this is not future-proof, should use anthology.util library functions
            # Zero-pad the paper number: 2 digits when the volume url has 6
            # characters, 3 digits when it has 5, no suffix otherwise.
            aa_id = ""
            if len(url) == 6:
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            elif len(url) == 5:
                aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors
            contribs = make_simple_element('contributors', parent=cp)
            for author_index, author in enumerate(paper.findall('./author')):
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'author',
                        'sequence':
                        'first' if author_index == 0 else 'additional'
                    })

                for name_part in author:
                    if name_part.tag == 'first':
                        make_simple_element('given_name',
                                            parent=pn,
                                            text=name_part.text)
                    elif name_part.tag == 'last':
                        make_simple_element('surname',
                                            text=name_part.text,
                                            parent=pn)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                make_simple_element('title',
                                    parent=o_titles,
                                    text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                page_parts = re.split('[-–]', pages.text)
                fp.text = page_parts[0]
                if len(page_parts) > 1:
                    lp.text = page_parts[1]
                else:  # only one page
                    lp.text = pages.text

            # DOI assignation data
            dd = make_simple_element('doi_data', parent=cp)
            make_simple_element('doi',
                                parent=dd,
                                text=DOI_PREFIX + url + aa_id)
            make_simple_element('resource',
                                parent=dd,
                                text=ANTHOLOGY_URL.format(url + aa_id))

    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))