def main(volumes):
    """Build and print a Crossref 4.4.1 DOI deposit batch for the given volumes.

    For each anthology volume id in *volumes*, reads the collection XML from
    ../data/xml/, assembles <conference>/<conference_paper> deposit records,
    and prints the whole <doi_batch> document to stdout as pretty-printed XML.
    Volumes that cannot be found are reported on stderr and skipped.
    """
    formatter = MarkupFormatter()

    ## Assemble container: the <doi_batch> root with the Crossref 4.4.1 schema
    doi_batch = make_simple_element(
        "doi_batch",
        attrib={
            "xmlns": "http://www.crossref.org/schema/4.4.1",
            "{http://www.w3.org/2001/XMLSchema-instance}schemaLocation": "http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd",
            "version": "4.4.1",
        },
        namespaces={"xsi": "http://www.w3.org/2001/XMLSchema-instance"},
    )
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head: batch id and timestamp are both the current Unix time
    head = make_simple_element("head", parent=new_volume.getroot())
    dbi = make_simple_element("doi_batch_id", text=str(int(time.time())), parent=head)
    timestamp = make_simple_element("timestamp", text=str(int(time.time())), parent=head)
    depositor = make_simple_element("depositor", parent=head)
    depositor_name = make_simple_element("depositor_name", text=DEPOSITOR_NAME, parent=depositor)
    email_address = make_simple_element("email_address", text=EMAIL_ADDRESS, parent=depositor)
    registrant = make_simple_element("registrant", text=REGISTRANT, parent=head)

    ## Assemble body: one <conference> record per requested volume
    body = make_simple_element("body", parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""
    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)
        # Collection XML lives relative to the script's own location
        collection_file = os.path.join(os.path.dirname(sys.argv[0]), "..", "data", "xml", f"{collection_id}.xml")
        tree = etree.parse(collection_file)
        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter: volume-level metadata from <meta>
        c = make_simple_element("conference", parent=body)
        contribs = make_simple_element("contributors", parent=c)
        editor_index = 0
        meta = v.find("./meta")
        for tag in meta:
            if tag.tag == "year":
                year = tag.text
            elif tag.tag == "month":
                month = tag.text
                # A month may be a range like "July-August" (ASCII or en dash)
                try:
                    start_month = MONTH_HASH[re.split("[-–]", month)[0]]
                    end_month = MONTH_HASH[re.split("[-–]", month)[1]]
                except IndexError as e:
                    # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
                except Exception as e:
                    # e.g. a month name missing from MONTH_HASH is fatal
                    print(
                        f"FATAL: can't parse month {month} in {full_volume_id}",
                        file=sys.stderr,
                    )
                    sys.exit(1)
            elif tag.tag == "url":
                url = tag.text
            elif tag.tag == "booktitle":
                # Strip markup from the title via the formatter
                booktitle = formatter.as_text(tag)
            elif tag.tag == "address":
                address = tag.text
            elif tag.tag == "publisher":
                publisher = tag.text
            elif tag.tag == "editor":
                # Editors become "chair" contributors; only the first editor
                # gets sequence="first"
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "chair",
                        "sequence": "first" if editor_index == 0 else "additional",
                    },
                )
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

        # Assemble Event Metadata
        # NOTE(review): year/booktitle/address/publisher/url are assumed to be
        # present in every volume's <meta>; a missing tag would raise
        # NameError (or reuse the previous volume's value) here — TODO confirm
        em = make_simple_element("event_metadata", parent=c)
        cn = make_simple_element("conference_name", parent=em, text=booktitle)
        cl = make_simple_element("conference_location", parent=em, text=address)
        cd = make_simple_element(
            "conference_date",
            parent=em,
            attrib={
                "start_year": year,
                "end_year": year,
                "start_month": start_month,
                "end_month": end_month,
            },
        )

        # Assemble Proceedings Metadata
        pm = make_simple_element("proceedings_metadata", parent=c, attrib={"language": "en"})
        pt = make_simple_element("proceedings_title", parent=pm, text=booktitle)
        p = make_simple_element("publisher", parent=pm)
        pn = make_simple_element("publisher_name", parent=p, text=publisher)
        pp = make_simple_element("publisher_place", parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element("publication_date", parent=pm)
        y = make_simple_element("year", parent=pd, text=year)
        noisbn = make_simple_element("noisbn", parent=pm, attrib={"reason": "simple_series"})

        # DOI assignation data for the proceedings volume itself
        dd = make_simple_element("doi_data", parent=pm)
        doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall("./paper"):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            # Paper id is zero-padded based on the volume URL length
            # (6 chars -> 2-digit ids, 5 chars -> 3-digit ids)
            aa_id = ""
            if len(url) == 6:
                aa_id = "{:02d}".format(int(paper.attrib["id"]))
            else:
                if len(url) == 5:
                    aa_id = "{:03d}".format(int(paper.attrib["id"]))

            cp = make_simple_element("conference_paper", parent=c)

            # contributors: authors, first author gets sequence="first"
            contribs = make_simple_element("contributors", parent=cp)
            author_index = 0
            for author in paper.findall("./author"):
                pn = make_simple_element(
                    "person_name",
                    parent=contribs,
                    attrib={
                        "contributor_role": "author",
                        "sequence": "first" if author_index == 0 else "additional",
                    },
                )
                author_index += 1
                for name_part in author:
                    if name_part.tag == "first":
                        gn = make_simple_element("given_name", parent=pn, text=name_part.text)
                    elif name_part.tag == "last":
                        sn = make_simple_element("surname", text=name_part.text, parent=pn)

            for title in paper.iter(tag="title"):
                o_titles = make_simple_element("titles", parent=cp)
                o_title = make_simple_element("title", parent=o_titles, text=formatter.as_text(title))

            pd = make_simple_element("publication_date", parent=cp)
            o_year = make_simple_element("year", parent=pd)
            o_year.text = year

            for pages in paper.iter(tag="pages"):
                o_pages = make_simple_element("pages", parent=cp)
                fp = make_simple_element("first_page", parent=o_pages)
                lp = make_simple_element("last_page", parent=o_pages)
                # Page ranges use ASCII hyphen or en dash
                try:
                    fp.text = re.split("[-–]", pages.text)[0]
                    lp.text = re.split("[-–]", pages.text)[1]
                except IndexError as e:
                    # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data for the individual paper
            dd = make_simple_element("doi_data", parent=cp)
            doi = make_simple_element("doi", parent=dd, text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element("resource", parent=dd, text=ANTHOLOGY_URL.format(url + aa_id))

    # Emit the whole batch to stdout as pretty-printed XML
    print(
        etree.tostring(
            new_volume,
            pretty_print=True,
            encoding="UTF-8",
            xml_declaration=True,
            with_tail=True,
        ).decode("utf-8"))
def main(volumes):
    """Build and print a Crossref 4.4.1 DOI deposit batch for the given volumes.

    NOTE(review): this is a near-duplicate of another main(volumes) in this
    file; this variant uses the raw tag text for the booktitle and has no
    fatal handler for unparseable months — presumably an older copy. TODO:
    confirm which variant is current and remove the other.
    """
    formatter = MarkupFormatter()

    ## Assemble container: the <doi_batch> root with the Crossref 4.4.1 schema
    doi_batch = make_simple_element(
        'doi_batch',
        attrib={
            'xmlns': 'http://www.crossref.org/schema/4.4.1',
            '{http://www.w3.org/2001/XMLSchema-instance}schemaLocation': 'http://www.crossref.org/schema/4.4.1 http://www.crossref.org/schema/deposit/crossref4.4.1.xsd',
            'version': '4.4.1'
        },
        namespaces={'xsi': 'http://www.w3.org/2001/XMLSchema-instance'})
    new_volume = etree.ElementTree(doi_batch)

    ## Assemble head: batch id and timestamp are both the current Unix time
    head = make_simple_element('head', parent=new_volume.getroot())
    dbi = make_simple_element('doi_batch_id', text=str(int(time.time())), parent=head)
    timestamp = make_simple_element('timestamp', text=str(int(time.time())), parent=head)
    depositor = make_simple_element('depositor', parent=head)
    depositor_name = make_simple_element('depositor_name', text=DEPOSITOR_NAME, parent=depositor)
    email_address = make_simple_element('email_address', text=EMAIL_ADDRESS, parent=depositor)
    registrant = make_simple_element('registrant', text=REGISTRANT, parent=head)

    ## Assemble body: one <conference> record per requested volume
    body = make_simple_element('body', parent=new_volume.getroot())
    year = ""
    start_month = ""
    end_month = ""
    for full_volume_id in sorted(volumes):
        collection_id, volume_id, _ = deconstruct_anthology_id(full_volume_id)
        # Collection XML lives relative to the script's own location
        collection_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml', f'{collection_id}.xml')
        tree = etree.parse(collection_file)
        v = tree.getroot().find(f"./volume[@id='{volume_id}']")
        if v is None:
            print(f"* Can't find volume {full_volume_id}", file=sys.stderr)
            continue

        ## Assemble frontmatter: volume-level metadata from <meta>
        c = make_simple_element('conference', parent=body)
        contribs = make_simple_element('contributors', parent=c)
        editor_index = 0
        meta = v.find('./meta')
        for tag in meta:
            if tag.tag == 'year':
                year = tag.text
            elif tag.tag == 'month':
                month = tag.text
                # A month may be a range like "July-August" (ASCII or en dash)
                # NOTE(review): a month name missing from MONTH_HASH raises
                # KeyError here, uncaught in this variant — TODO confirm
                try:
                    start_month = MONTH_HASH[re.split('[-–]', month)[0]]
                    end_month = MONTH_HASH[re.split('[-–]', month)[1]]
                except IndexError as e:
                    # only one month
                    start_month = MONTH_HASH[month]
                    end_month = MONTH_HASH[month]
            elif tag.tag == 'url':
                url = tag.text
            elif tag.tag == 'booktitle':
                # NOTE(review): unlike the sibling variant, this takes raw
                # .text and drops any inline markup in the booktitle
                booktitle = tag.text
            elif tag.tag == 'address':
                address = tag.text
            elif tag.tag == 'publisher':
                publisher = tag.text
            elif tag.tag == 'editor':
                # Editors become "chair" contributors; only the first editor
                # gets sequence="first"
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'chair',
                        'sequence': 'first' if editor_index == 0 else 'additional'
                    })
                editor_index += 1
                for name_part in tag:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn, text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text, parent=pn)

        # Assemble Event Metadata
        em = make_simple_element('event_metadata', parent=c)
        cn = make_simple_element('conference_name', parent=em, text=booktitle)
        cl = make_simple_element('conference_location', parent=em, text=address)
        cd = make_simple_element('conference_date',
                                 parent=em,
                                 attrib={
                                     'start_year': year,
                                     'end_year': year,
                                     'start_month': start_month,
                                     'end_month': end_month
                                 })

        # Assemble Proceedings Metadata
        pm = make_simple_element('proceedings_metadata', parent=c, attrib={'language': 'en'})
        pt = make_simple_element('proceedings_title', parent=pm, text=booktitle)
        p = make_simple_element('publisher', parent=pm)
        pn = make_simple_element('publisher_name', parent=p, text=publisher)
        pp = make_simple_element('publisher_place', parent=p, text=PUBLISHER_PLACE)
        pd = make_simple_element('publication_date', parent=pm)
        y = make_simple_element('year', parent=pd, text=year)
        noisbn = make_simple_element('noisbn', parent=pm, attrib={'reason': 'simple_series'})

        # DOI assignation data for the proceedings volume itself
        dd = make_simple_element('doi_data', parent=pm)
        doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url)
        resource = make_simple_element('resource', parent=dd, text=ANTHOLOGY_URL.format(url))

        for paper in v.findall('./paper'):
            ## Individual Paper Data
            # TODO: this is not future-proof, should use anthology.util library functions
            # Paper id is zero-padded based on the volume URL length
            # (6 chars -> 2-digit ids, 5 chars -> 3-digit ids)
            aa_id = ""
            if (len(url) == 6):
                aa_id = '{:02d}'.format(int(paper.attrib['id']))
            else:
                if (len(url) == 5):
                    aa_id = '{:03d}'.format(int(paper.attrib['id']))

            cp = make_simple_element('conference_paper', parent=c)

            # contributors: authors, first author gets sequence="first"
            contribs = make_simple_element('contributors', parent=cp)
            author_index = 0
            for author in paper.findall('./author'):
                pn = make_simple_element(
                    'person_name',
                    parent=contribs,
                    attrib={
                        'contributor_role': 'author',
                        'sequence': 'first' if author_index == 0 else 'additional'
                    })
                author_index += 1
                for name_part in author:
                    if name_part.tag == 'first':
                        gn = make_simple_element('given_name', parent=pn, text=name_part.text)
                    elif name_part.tag == 'last':
                        sn = make_simple_element('surname', text=name_part.text, parent=pn)

            for title in paper.iter(tag='title'):
                o_titles = make_simple_element('titles', parent=cp)
                o_title = make_simple_element('title', parent=o_titles, text=formatter.as_text(title))

            pd = make_simple_element('publication_date', parent=cp)
            o_year = make_simple_element('year', parent=pd)
            o_year.text = year

            for pages in paper.iter(tag='pages'):
                o_pages = make_simple_element('pages', parent=cp)
                fp = make_simple_element('first_page', parent=o_pages)
                lp = make_simple_element('last_page', parent=o_pages)
                # Page ranges use ASCII hyphen or en dash
                try:
                    fp.text = re.split('[-–]', pages.text)[0]
                    lp.text = re.split('[-–]', pages.text)[1]
                except IndexError as e:
                    # only one page
                    fp.text = pages.text
                    lp.text = pages.text

            # DOI assignation data for the individual paper
            dd = make_simple_element('doi_data', parent=cp)
            doi = make_simple_element('doi', parent=dd, text=DOI_PREFIX + url + aa_id)
            resource = make_simple_element('resource', parent=dd, text=ANTHOLOGY_URL.format(url + aa_id))

    # Emit the whole batch to stdout as pretty-printed XML
    print(
        etree.tostring(new_volume,
                       pretty_print=True,
                       encoding='UTF-8',
                       xml_declaration=True,
                       with_tail=True).decode('utf-8'))
def main(args):
    """Summarize newly added attachments, revisions, and errata.

    Reads a diff of the Anthology XML from stdin, considers only added lines
    (prefixed with "+"), and prints a human-readable summary grouped by
    change kind. Lines that fail to parse are reported on stderr and skipped.
    """
    scriptdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'data')
    anthology = Anthology(importdir=scriptdir)

    attachments = defaultdict(list)  # attachment type -> [(title, url), ...]
    revisions = []  # (title, url, explanation)
    errata = []  # (title, url)
    for line in sys.stdin:
        # Only lines the diff added are of interest
        if not line.startswith("+"):
            continue
        line = line[1:].strip()

        if line.startswith("<attachment"):
            match_str = rf'<attachment type="(\w+)">({ANTHOLOGY_ID_REGEX}).*'
            match = re.match(match_str, line)
            if match is None:
                # BUG FIX: the old bare except printed a warning but then fell
                # through and used a stale/undefined anthology_id; skip instead.
                print(f"* Couldn't match '{match_str}' to '{line}'", file=sys.stderr)
                continue
            attach_type, anthology_id = match.groups()
            attachments[attach_type].append((
                anthology.papers[anthology_id].get_title('plain'),
                ANTHOLOGY_URL.format(anthology_id),
            ))
        elif line.startswith("<revision"):
            match_str = rf'<revision.*href="({ANTHOLOGY_ID_REGEX}).*>.*'
            match = re.match(match_str, line)
            if match is None:
                print(f"* Couldn't match '{match_str}' to '{line}'", file=sys.stderr)
                continue
            anthology_id = match.group(1)
            paper = anthology.papers[anthology_id]
            # The newest revision entry carries the explanation text
            explanation = paper.attrib["revision"][-1]["explanation"]
            revisions.append((
                paper.get_title("plain"),
                ANTHOLOGY_URL.format(anthology_id),
                explanation,
            ))
        elif line.startswith("<errat"):
            match_str = rf"<errat.*?>({ANTHOLOGY_ID_REGEX}).*"
            match = re.match(match_str, line)
            if match is None:
                print(f"* Couldn't match '{match_str}' to '{line}'", file=sys.stderr)
                continue
            anthology_id = match.group(1)
            errata.append((
                anthology.papers[anthology_id].get_title('plain'),
                ANTHOLOGY_URL.format(anthology_id),
            ))

    inflector = inflect.engine()
    # BUG FIX: the loop variable no longer shadows the `attachments` dict
    for attach_type, attached in attachments.items():
        phrase = inflector.a(attach_type)  # e.g. "a poster", "an erratum"
        print(f"\nAdded {phrase}:")
        for title, url in attached:
            print("-", title, "\n ", url, "\n")

    if revisions:
        print("\nRevisions:")
        for title, url, explanation in revisions:
            print("-", title, "\n ", url, "\n ", explanation, "\n")

    if errata:
        print("\nErrata:")
        for title, url in errata:
            print("-", title, "\n ", url, "\n")
def main(args):
    """Register a revision or erratum for an Anthology paper.

    Adds a <revision>/<erratum> node to the collection XML, backs up the
    original PDF as v1 when this is the first revision, and installs the new
    file both under its versioned name and as the canonical PDF. All
    destructive steps are gated on args.do (dry run otherwise).
    """
    change_type = 'erratum' if args.erratum else 'revision'
    change_letter = 'e' if args.erratum else 'v'

    print(f'Processing {change_type} to {args.anthology_id}...')

    # TODO: make sure path exists, or download URL to temp file
    if args.path.startswith('http'):
        _, input_file_path = tempfile.mkstemp()
        try:
            print(f'-> Downloading file from {args.path}', file=sys.stderr)
            with urllib.request.urlopen(args.path) as url, open(
                    input_file_path, mode='wb') as input_file_fh:
                input_file_fh.write(url.read())
        except ssl.SSLError:
            print('An SSL error was encountered in downloading the files.',
                  file=sys.stderr)
            sys.exit(1)
    else:
        input_file_path = args.path

    collection_id, volume_id, paper_id = deconstruct_anthology_id(
        args.anthology_id)

    # The new version number; stays None only if the paper lookup fails
    revno = None

    # Update XML
    xml_file = os.path.join(os.path.dirname(sys.argv[0]), '..', 'data', 'xml',
                            f'{collection_id}.xml')
    tree = ET.parse(xml_file)
    paper = tree.getroot().find(
        f"./volume[@id='{volume_id}']/paper[@id='{paper_id}']")
    if paper is None:
        print(
            f'-> FATAL: paper ID {args.anthology_id} not found in the Anthology',
            file=sys.stderr)
        sys.exit(1)

    # Errata start at 1; revisions start at 2 (the original counts as v1).
    # Existing nodes bump the number past the highest recorded id.
    revno = 1 if args.erratum else 2
    for revision in paper.findall(change_type):
        revno = int(revision.attrib['id']) + 1

    if args.do:
        revision = ET.Element(change_type)
        revision.attrib['id'] = str(revno)
        revision.attrib[
            'href'] = f'{args.anthology_id}{change_letter}{revno}'
        revision.text = args.explanation
        # Set tails to maintain proper indentation
        # BUG FIX: guard against a None tail (TypeError on `None += str`)
        if len(paper):
            paper[-1].tail = (paper[-1].tail or '') + '  '
        revision.tail = '\n    '  # newline and two levels of indent
        paper.append(revision)
        indent(tree.getroot())
        tree.write(xml_file, encoding="UTF-8", xml_declaration=True)
        print(f'-> Added {change_type} node "{revision.text}" to XML',
              file=sys.stderr)

    output_dir = os.path.join(args.anthology_dir, 'pdf', collection_id[0],
                              collection_id)

    # Make sure directory exists
    if not os.path.exists(output_dir):
        print(f'-> Creating directory {output_dir}', file=sys.stderr)
        os.makedirs(output_dir)

    canonical_path = os.path.join(output_dir, f'{args.anthology_id}.pdf')

    if not args.erratum and revno == 2:
        # There are no versioned files the first time around, so create the first one
        # (essentially backing up the original version)
        revised_file_v1_path = os.path.join(
            output_dir, f'{args.anthology_id}{change_letter}1.pdf')
        current_version = ANTHOLOGY_URL.format(args.anthology_id)
        if args.do:
            try:
                # BUG FIX: the backup is fetched from the live Anthology URL,
                # not args.path — the message now says so
                print(
                    f'-> Downloading file from {current_version} to {revised_file_v1_path}',
                    file=sys.stderr)
                with urllib.request.urlopen(current_version) as url, open(
                        revised_file_v1_path, mode='wb') as fh:
                    fh.write(url.read())
            except ssl.SSLError:
                print(
                    f'-> FATAL: An SSL error was encountered in downloading {current_version}.',
                    file=sys.stderr)
                sys.exit(1)
        else:
            # BUG FIX: "Downlading" typo, and same source-URL correction
            print(
                f'-> DRY RUN: Downloading file from {current_version} to {revised_file_v1_path}',
                file=sys.stderr)

    revised_file_versioned_path = os.path.join(
        output_dir, f'{args.anthology_id}{change_letter}{revno}.pdf')

    maybe_copy(input_file_path, revised_file_versioned_path, args.do)
    maybe_copy(input_file_path, canonical_path, args.do)

    # Clean up the temp file created for a downloaded input
    if args.path.startswith('http'):
        os.remove(input_file_path)