Python VenueIndex.VenueIndex Examples

Programming Language: Python

Namespace/Package Name: anthology.venues

Class/Type: VenueIndex

Method/Function: VenueIndex

Examples at hotexamples.com: 3

Python VenueIndex.VenueIndex - 3 examples found. These are the top rated real world Python examples of anthology.venues.VenueIndex.VenueIndex extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

VenueIndex(3)

add_venue(3)

dump(3)

get_slug(2)

items(2)

Example #1

Show file

def main(args):
    datadir = os.path.join(os.path.dirname(sys.argv[0]), "..", "data")
    venues = VenueIndex(srcdir=datadir)

    print(f"Adding '{args.acronym}' ({args.name})")
    venues.add_venue(args.acronym, args.name, is_acl=args.acl, url=args.url)

    venues.dump(datadir)

Example #2

Show file

File: ingest.py Project: namrathaurs/acl-anthology

def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_keys = [
        venue["slug"].lower()
        for _, venue in VenueIndex(srcdir=anthology_datadir).items()
    ]

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_name = meta["abbrev"].lower()

        if venue_name not in venue_keys:
            unseen_venues.append(meta["abbrev"])

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = (meta["year"] + "." +
                                                 meta["abbrev"].lower())
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        print("FATAL: The following venue(s) don't exist in venues.yaml")
        for venue in unseen_venues:
            print(f"- {venue}")
        print("Please create entries for them and re-ingest.")
        sys.exit(1)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = meta["abbrev"] + "-" + year
        book_src_path = os.path.join(root_path, book_src_filename) + ".pdf"
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run and not os.path.exists(book_dest_path):
                log(f"Copying {book_src_path} -> {book_dest_path}",
                    args.dry_run)
                shutil.copyfile(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run and not os.path.exists(pdf_dest_path):
                    log(f"Copying {pdf_src_path} -> {pdf_dest_path}",
                        args.dry_run)
                    shutil.copyfile(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)
                # print(etree.tostring(paper_node, pretty_print=True))

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)

Example #3

Show file

def main(args):
    collections = defaultdict(OrderedDict)
    volumes = {}

    anthology_datadir = os.path.join(os.path.dirname(sys.argv[0]), "..",
                                     "data")
    venue_index = VenueIndex(srcdir=anthology_datadir)
    venue_keys = [venue["slug"].lower() for _, venue in venue_index.items()]

    sig_index = SIGIndex(srcdir=anthology_datadir)

    # Build list of volumes, confirm uniqueness
    unseen_venues = []
    for proceedings in args.proceedings:
        meta = read_meta(os.path.join(proceedings, "meta"))

        venue_abbrev = meta["abbrev"]
        venue_slug = venue_index.get_slug(venue_abbrev)

        if str(datetime.now().year) in venue_abbrev:
            print(
                f"Fatal: Venue assembler put year in acronym: '{venue_abbrev}'"
            )
            sys.exit(1)

        if venue_slug not in venue_keys:
            unseen_venues.append((venue_slug, venue_abbrev, meta["title"]))

        meta["path"] = proceedings

        meta["collection_id"] = collection_id = meta["year"] + "." + venue_slug
        volume_name = meta["volume"].lower()
        volume_full_id = f"{collection_id}-{volume_name}"

        if volume_full_id in volumes:
            print("Error: ")

        collections[collection_id][volume_name] = {}
        volumes[volume_full_id] = meta

        if "sig" in meta:
            print(
                f"Add this line to {anthology_datadir}/sigs/{meta['sig'].lower()}.yaml:"
            )
            print(f"  - {meta['year']}:")
            print(f"    - {volume_full_id} # {meta['booktitle']}")

    # Make sure all venues exist
    if len(unseen_venues) > 0:
        for venue in unseen_venues:
            slug, abbrev, title = venue
            print(f"Creating venue '{abbrev}' ({title})")
            venue_index.add_venue(abbrev, title)
        venue_index.dump(directory=anthology_datadir)

    # Copy over the PDFs and attachments
    for volume, meta in volumes.items():
        root_path = os.path.join(meta["path"], "cdrom")
        collection_id = meta["collection_id"]
        venue_name = meta["abbrev"].lower()
        volume_name = meta["volume"].lower()
        year = meta["year"]

        pdfs_dest_dir = os.path.join(args.pdfs_dir, venue_name)
        if not os.path.exists(pdfs_dest_dir):
            os.makedirs(pdfs_dest_dir)

        # copy the book
        book_src_filename = f'{year}.{meta["abbrev"]}-{volume_name}.pdf'
        book_src_path = os.path.join(root_path, book_src_filename)
        book_dest_path = None
        if os.path.exists(book_src_path):
            book_dest_path = (
                os.path.join(pdfs_dest_dir, f"{collection_id}-{volume_name}") +
                ".pdf")

            if not args.dry_run:
                maybe_copy(book_src_path, book_dest_path)

        # copy the paper PDFs
        pdf_src_dir = os.path.join(root_path, "pdf")
        for pdf_file in os.listdir(pdf_src_dir):
            # Skip . files
            if os.path.basename(pdf_file).startswith("."):
                continue

            # names are {abbrev}{number}.pdf
            match = re.match(rf".*\.(\d+)\.pdf", pdf_file)

            if match is not None:
                paper_num = int(match[1])
                paper_id_full = f"{collection_id}-{volume_name}.{paper_num}"

                bib_path = os.path.join(
                    root_path,
                    "bib",
                    pdf_file.replace("/pdf", "/bib/").replace(".pdf", ".bib"),
                )

                pdf_src_path = os.path.join(pdf_src_dir, pdf_file)
                pdf_dest_path = os.path.join(
                    pdfs_dest_dir,
                    f"{collection_id}-{volume_name}.{paper_num}.pdf")
                if not args.dry_run:
                    maybe_copy(pdf_src_path, pdf_dest_path)

                collections[collection_id][volume_name][paper_num] = {
                    "anthology_id": paper_id_full,
                    "bib": bib_path,
                    "pdf": pdf_dest_path,
                    "attachments": [],
                }

        # copy the attachments
        if os.path.exists(os.path.join(root_path, "additional")):
            attachments_dest_dir = os.path.join(args.attachments_dir,
                                                venue_name)
            if not os.path.exists(attachments_dest_dir):
                os.makedirs(attachments_dest_dir)
            for attachment_file in os.listdir(
                    os.path.join(root_path, "additional")):
                if os.path.basename(attachment_file).startswith("."):
                    continue
                attachment_file_path = os.path.join(root_path, "additional",
                                                    attachment_file)
                match = re.match(
                    rf"{year}\.{venue_name}-\w+\.(\d+)_?(\w+)\.(\w+)$",
                    attachment_file)
                if match is None:
                    print(
                        f"* Warning: no attachment match for {attachment_file}",
                        file=sys.stderr,
                    )
                    sys.exit(2)

                paper_num, type_, ext = match.groups()
                paper_num = int(paper_num)

                file_name = f"{collection_id}-{volume_name}.{paper_num}.{type_}.{ext}"
                dest_path = os.path.join(attachments_dest_dir, file_name)
                if not args.dry_run and not os.path.exists(dest_path):
                    log(f"Copying {attachment_file} -> {dest_path}",
                        args.dry_run)
                    shutil.copyfile(attachment_file_path, dest_path)

                collections[collection_id][volume_name][paper_num][
                    "attachments"].append((dest_path, type_))

    people = AnthologyIndex(None, srcdir=anthology_datadir)

    def correct_caps(person, name_node, anth_id):
        """
        Many people submit their names in "ALL CAPS" or "all lowercase".
        Correct this with heuristics.
        """
        name = name_node.text
        if name.islower() or name.isupper():
            # capitalize all parts
            corrected = " ".join(
                list(map(lambda x: x.capitalize(), name.split())))
            print(
                f"-> Correcting capitalization of '{name}' to '{corrected}'",
                file=sys.stderr,
            )
            name_node.text = corrected

    def disambiguate_name(node, anth_id):
        name = PersonName.from_element(node)
        ids = people.get_ids(name)

        if len(ids) > 1:
            choice = -1
            while choice < 0 or choice >= len(ids):
                print(
                    f"({anth_id}): ambiguous author {name}; Please choose from the following:"
                )
                for i, id_ in enumerate(ids):
                    print(f"[{i}] {id_} ({people.get_comment(id_)})")
                choice = int(input("--> "))

            node.attrib["id"] = ids[choice]

    for collection_id, collection in collections.items():
        # Newly added volumes, so we can normalize and name-disambig later
        newly_added_volumes = []

        collection_file = os.path.join(args.anthology_dir, "data", "xml",
                                       f"{collection_id}.xml")
        if os.path.exists(collection_file):
            root_node = etree.parse(collection_file).getroot()
        else:
            root_node = make_simple_element("collection",
                                            attrib={"id": collection_id})

        for volume_id, volume in collection.items():
            volume_node = make_simple_element(
                "volume",
                attrib={
                    "id": volume_id,
                    "ingest-date": args.ingest_date
                },
            )

            # Replace the existing one if present
            existing_volume_node = root_node.find(
                f"./volume[@id='{volume_id}']")
            for i, child in enumerate(root_node):
                if child.attrib["id"] == volume_id:
                    root_node[i] = volume_node
                    break
            else:
                root_node.append(volume_node)

            meta_node = None

            for paper_num, paper in sorted(volume.items()):
                paper_id_full = paper["anthology_id"]
                bibfile = paper["bib"]
                paper_node = bib2xml(bibfile, paper_id_full)

                if paper_node.attrib["id"] == "0":
                    # create metadata subtree
                    meta_node = make_simple_element("meta", parent=volume_node)
                    title_node = paper_node.find("title")
                    title_node.tag = "booktitle"
                    meta_node.append(title_node)
                    for author_or_editor in chain(
                            paper_node.findall("./author"),
                            paper_node.findall("./editor")):
                        meta_node.append(author_or_editor)
                        author_or_editor.tag = "editor"
                    meta_node.append(paper_node.find("publisher"))
                    meta_node.append(paper_node.find("address"))
                    meta_node.append(paper_node.find("month"))
                    meta_node.append(paper_node.find("year"))
                    if book_dest_path is not None:
                        make_simple_element(
                            "url",
                            text=f"{collection_id}-{volume_name}",
                            attrib={
                                "hash": compute_hash_from_file(book_dest_path)
                            },
                            parent=meta_node,
                        )

                    # modify frontmatter tag
                    paper_node.tag = "frontmatter"
                    del paper_node.attrib["id"]
                else:
                    # remove unneeded fields
                    for child in paper_node:
                        if child.tag in [
                                "editor",
                                "address",
                                "booktitle",
                                "publisher",
                                "year",
                                "month",
                        ]:
                            paper_node.remove(child)

                url = paper_node.find("./url")
                if url is not None:
                    url.attrib["hash"] = compute_hash_from_file(paper["pdf"])

                for path, type_ in paper["attachments"]:
                    make_simple_element(
                        "attachment",
                        text=os.path.basename(path),
                        attrib={
                            "type": type_,
                            "hash": compute_hash_from_file(path),
                        },
                        parent=paper_node,
                    )

                if len(paper_node) > 0:
                    volume_node.append(paper_node)

                # Normalize
                for oldnode in paper_node:
                    normalize(oldnode, informat="latex")

                # Adjust the language tag
                language_node = paper_node.find("./language")
                if language_node is not None:
                    try:
                        lang = iso639.languages.get(name=language_node.text)
                    except KeyError:
                        raise Exception(
                            f"Can't find language '{language_node.text}'")
                    language_node.text = lang.part3
                    print(language_node.text)

                # Fix author names
                for name_node in chain(paper_node.findall("./author"),
                                       paper_node.findall("./editor")):
                    disambiguate_name(name_node, paper_id_full)
                    person = PersonName.from_element(name_node)
                    for name_part in name_node:
                        correct_caps(person, name_part, paper_id_full)

        # Other data from the meta file
        if "isbn" in meta:
            make_simple_element("isbn", meta["isbn"], parent=meta_node)

        indent(root_node)
        tree = etree.ElementTree(root_node)
        tree.write(collection_file,
                   encoding="UTF-8",
                   xml_declaration=True,
                   with_tail=True)