Ejemplo n.º 1
0
def import_cpdl_work_wikitext(work_wikitext):
    """Import one CPDL work from its wikitext.

    Ensures the work's composer exists in the CE (importing it if needed),
    creates the MusicComposition, and links every media object: each xml
    file plus any pdf that was rendered from it.
    """
    composition = cpdl.composition_wikitext_to_music_composition(work_wikitext)
    composer = composition['composer']
    if composer is None:
        return

    composer_source = f'https://cpdl.org/wiki/index.php/{composer.replace(" ", "_")}'
    composer_ceid = get_existing_person_by_source(composer_source)
    if not composer_ceid:
        composer_ceid = import_cpdl_composer(composer)
    if not composer_ceid:
        logger.info(" - missing composer?")
        return

    musiccomp_ceid = get_or_create_musiccomposition(composition['work'])
    link_musiccomposition_and_composers(musiccomp_ceid, [composer_ceid])

    for mediaobject in cpdl.composition_wikitext_to_mediaobjects(work_wikitext):
        xml_ceid = get_or_create_mediaobject(mediaobject["xml"])
        link_musiccomposition_and_mediaobject(composition_id=musiccomp_ceid,
                                              mediaobject_id=xml_ceid)
        pdf = mediaobject.get("pdf")
        if pdf is not None:
            pdf_ceid = get_or_create_mediaobject(pdf)
            link_musiccomposition_and_mediaobject(composition_id=musiccomp_ceid,
                                                  mediaobject_id=pdf_ceid)
            # In CPDL, we know that PDFs are generated from the source xml file
            # TODO: Are there any situations where this isn't the case?
            link_mediaobject_was_derived_from(source_id=xml_ceid,
                                              derived_id=pdf_ceid)
Ejemplo n.º 2
0
def load_musiccomposition_from_imslp_by_file(reverselookup):
    """Using an IMSLP Special:ReverseLookup url, find the composition and import
       - composer
       - work
       - file

    Arguments:
        reverselookup: an https://imslp.org/wiki/Special:ReverseLookup/... url

    Raises:
        ValueError: if the url is not a Special:ReverseLookup url
    """
    if not reverselookup.startswith(
            "https://imslp.org/wiki/Special:ReverseLookup/"):
        raise ValueError("Should be a Special:ReverseLookup url")

    composition, filename = imslp.get_composition_and_filename_from_permalink(
        reverselookup)
    # The composition url will end with a #anchor, remove it
    if "#" in composition:
        composition = composition[:composition.index("#")]
    load_musiccomposition_from_imslp_name(composition, load_files=False)

    # Once we loaded the composition, we look it up again to get the id
    url = "https://imslp.org/wiki/" + composition.replace(" ", "_")
    composition_id = get_existing_musiccomposition_by_source(url)
    # Was a stray debug print(); use the module logger like the rest of the file
    logger.info("got composition id %s", composition_id)

    if composition_id:
        # Only fetch the page content when the composition was actually found
        wikitext = imslp.get_wiki_content_for_pages([composition])
        if wikitext:
            file = imslp.get_mediaobject_for_filename(
                wikitext[0], filename.replace(" ", "_"))
            if file:
                mediaobject_ceid = get_or_create_imslp_mediaobject(file)
                link_musiccomposition_and_mediaobject(
                    composition_id=composition_id,
                    mediaobject_id=mediaobject_ceid)
    else:
        logger.info(" - cannot find composition after importing it once")
Ejemplo n.º 3
0
def load_musiccomposition_from_musicbrainz(work_mbid):
    """Import a MusicBrainz work, its composer, and all of its parts.

    Returns a dict with the CE ids that were created or found:
    {"musiccomposition_id": ..., "part_ids": [...], "person_ids": [...]}
    """
    logger.info("Importing musicbrainz work %s", work_mbid)
    meta = musicbrainz.load_work_from_musicbrainz(work_mbid)

    # Create the composition, or fetch its id if it already exists
    work_ceid = get_or_create_musiccomposition(meta['work'])

    # Import the work's composer if it doesn't exist. This hits MB for the
    # artist lookup, but won't write to the CE if the composer already exists.
    # create_persons_and_link returns ids of all exactMatches for the composer.
    composer_persons = load_artist_from_musicbrainz(meta['composer_mbid'])
    composer_ids = create_persons_and_link(composer_persons)

    # Import every part, then link all parts to the main work
    part_ids = [get_or_create_musiccomposition(part) for part in meta['parts']]
    link_musiccomposition_and_parts(work_ceid, part_ids)

    # The composer is linked to the main work and to each part
    link_musiccomposition_and_composers(work_ceid, composer_ids)
    for part_ceid in part_ids:
        link_musiccomposition_and_composers(part_ceid, composer_ids)

    return {
        "musiccomposition_id": work_ceid,
        "part_ids": part_ids,
        "person_ids": composer_ids
    }
Ejemplo n.º 4
0
def import_cpdl_composer(composer_name):
    """Import a single composer.

    Returns the CE id of the imported Person, or None if no wikitext
    could be fetched for the given name.
    """
    wikitext = cpdl.get_wikitext_for_titles([composer_name])
    if not wikitext:
        return None

    composer = wikitext[0]
    logger.info("Importing CPDL composer %s", composer['title'])
    import_cpdl_composer_wikitext(composer)
    source = f'https://cpdl.org/wiki/index.php/{composer["title"].replace(" ", "_")}'
    return get_existing_person_by_source(source)
Ejemplo n.º 5
0
def import_cpdl_works_for_category(cpdl_category):
    """Given a category in CPDL, find all of its works. Then, filter to only
    include works with a musicxml file. Import each of these works and the
    xml files.
    This assumes that import_cpdl_composers_for_category has been run first
    and that Person objects exist in the CE for each Composer."""

    category_titles = cpdl.get_titles_in_category(cpdl_category)
    all_wikitext = cpdl.get_wikitext_for_titles(category_titles)
    works_with_xml = cpdl.get_works_with_xml(all_wikitext)

    num_works = len(works_with_xml)
    for position, work in enumerate(works_with_xml, 1):
        logger.info("Importing CPDL work %s/%s %s", position, num_works,
                    work['title'])
        import_cpdl_work_wikitext(work)
Ejemplo n.º 6
0
def import_cpdl_composers_for_category(cpdl_category):
    """Given a category in CPDL, find all of its works. Then, filter to only
    include works with a musicxml file and get a unique list of composers for
    these works. For each composer, import it along with links to imslp and
    wikipedia if they exist."""

    category_titles = cpdl.get_titles_in_category(cpdl_category)
    all_wikitext = cpdl.get_wikitext_for_titles(category_titles)
    works_with_xml = cpdl.get_works_with_xml(all_wikitext)
    composer_names = cpdl.get_composers_for_works(works_with_xml)
    composer_pages = cpdl.get_wikitext_for_titles(composer_names)

    num_composers = len(composer_pages)
    for position, composer in enumerate(composer_pages, 1):
        logger.info("Importing CPDL composer %s/%s %s", position,
                    num_composers, composer['title'])
        import_cpdl_composer_wikitext(composer)
Ejemplo n.º 7
0
def load_artist_from_musicbrainz(artist_mbid):
    """Load a Person from MusicBrainz together with Persons from every
    external authority that the MB artist links to (viaf, imslp, worldcat,
    loc, isni, wikidata/wikipedia).

    Returns a list of person dicts, de-duplicated by their 'source' field.
    """
    logger.info("Importing musicbrainz artist %s", artist_mbid)
    persons = [musicbrainz.load_person_from_musicbrainz(artist_mbid)]

    rels = musicbrainz.load_person_relations_from_musicbrainz(artist_mbid)
    if 'viaf' in rels:
        persons.append(viaf.load_person_from_viaf(rels['viaf']))
    if 'imslp' in rels:
        # TODO: If there are more rels in imslp that aren't in MB we could use them here
        imslp_name = rels['imslp'].replace("https://imslp.org/wiki/",
                                           "").replace("_", " ")
        persons.append(imslp.api_composer(imslp_name))
    if 'worldcat' in rels:
        persons.append(worldcat.load_person_from_worldcat(rels['worldcat']))
    if 'loc' in rels:
        persons.append(loc.load_person_from_loc(rels['loc']))
    if 'isni' in rels:
        persons.append(
            isni.load_person_from_isni(f"https://isni.org/isni/{rels['isni']}"))
    if 'wikidata' in rels:
        # Both the wikidata entity and its english wikipedia page, when present
        wd_person = wikidata.load_person_from_wikidata_url(rels['wikidata'])
        if wd_person:
            persons.append(wd_person)
        wp_person = wikidata.load_person_from_wikipedia_wikidata_url(
            rels['wikidata'], 'en')
        if wp_person:
            persons.append(wp_person)

    # Keep only the first person seen for each distinct 'source'
    deduped = []
    seen_sources = set()
    for person in persons:
        if 'source' in person and person['source'] not in seen_sources:
            seen_sources.add(person['source'])
            deduped.append(person)
    return deduped
Ejemplo n.º 8
0
def files_for_work(work_wikitext):
    """Get MediaObject information for files relevant to the work

    If the work has an xml file, get the xml and the pdf associated with it

    Arguments:
        work_wikitext: the result of get_wiki_content_for_pages of a work

    Returns:
        a list of MediaObject dicts; {} when the page cannot be parsed
        (NOTE(review): the failure sentinel {} differs from the success type,
        callers should rely on truthiness only)
    """
    parsed = mwph.parse(work_wikitext["content"])

    # A page should have one node, the #fte:imslppage template.
    # BUG FIX: check for an empty node list *before* indexing nodes[0];
    # previously an empty page raised IndexError here.
    nodes = parsed.nodes
    if not nodes or not isinstance(nodes[0], mwph.nodes.template.Template):
        logger.info("First node doesn't appear to be a template, skipping")
        return {}
    if str(nodes[0].name).strip() != "#fte:imslppage":
        logger.info("Cannot find #fte:imslppage node, skipping")
        return {}
    node = nodes[0]

    # One of the parameters in this template is ' *****FILES***** '
    files_param = None
    for param in node.params:
        if param.name == ' *****FILES***** ':
            files_param = param
            break

    # the .value of this parameter is another Wikicode
    last_node = None
    xml_node = None
    if files_param:
        files = files_param.value
        # There are a number of nodes in this Wikicode.
        # Some are text, and some are #fte:imslpfile templates.
        # We go looking for the #fte:imslpfile template that has an xml file
        # in it, and keep track of the previous node, which should be the title.
        for node in files.nodes:
            is_xml_node = False
            if hasattr(node, 'name') and node.name.strip() == "#fte:imslpfile":
                for fileparam in node.params:
                    if "File Description" in fileparam.name and "XML" in fileparam.value:
                        is_xml_node = True
                        break
                if is_xml_node:
                    xml_node = node
                    break
            last_node = node

    mediaobjects = []
    if xml_node:
        # Map "File Name n" / "File Description n" / "Copyright" ... to values
        node_to_dict = {
            str(n.name): str(n.value).strip()
            for n in xml_node.params
        }
        # Count the "File Name n" parameters to know how many files exist
        num_files = len([
            n.name for n in xml_node.params if str(n).startswith("File Name")
        ])

        # The node just before the template should contain a =====title=====
        desc_match = re.search("=====(.*)=====", str(last_node))
        if desc_match:
            desc_match = desc_match.group(1)

        # Renamed from `license` to avoid shadowing the builtin
        license_ = node_to_dict.get("Copyright")
        title = work_wikitext["title"].replace(" ", "_")
        url = "http://imslp.org/wiki/" + title

        for i in range(1, num_files + 1):
            this_file = node_to_dict[f"File Name {i}"]
            this_desc = node_to_dict[f"File Description {i}"]
            if desc_match:
                this_desc = desc_match + ", " + this_desc

            this_file = "File:" + this_file
            # TODO: This isn't a great way of going back and forth between filenames
            permalink = get_permalink_from_filename(
                title, this_file.replace("_", " "))
            file_url = "http://imslp.org/wiki/" + this_file
            file_title = get_page_title(file_url)

            # TODO: Person who published, transcribed work. Date of publication on imslp?
            file_dict = {
                'title': file_title,
                'name': this_file,
                'contributor': 'https://imslp.org',
                'source': url,
                'url': permalink,
                'format_': 'text/html',
                'language': 'en',
                'license': license_,
                'description': this_desc,
            }
            mediaobjects.append(file_dict)

    return mediaobjects
Ejemplo n.º 9
0
def get_mediaobject_for_filename(work_wikitext, filename):
    """
    If we have a specific file that we want to import (looked up from a Special:ReverseLookup)
    then find that file in the provided wikitext and return information to create a MediaObject
    TODO: This shares a lot of common code with `files_for_work`

    Arguments:
        work_wikitext: the result of get_wiki_content_for_pages of a work
        filename: the file to find (with or without the File: prefix)

    Returns:
        a dict describing the MediaObject, or {} if the file isn't found
    """
    # Filename doesn't include File: prefix in the template
    if filename.startswith("File:"):
        filename = filename.replace("File:", "")

    parsed = mwph.parse(work_wikitext["content"])
    # A page should have one node, the #fte:imslppage template.
    # Also guard against an empty page or a leading non-template node before
    # reading nodes[0].name (consistent with files_for_work).
    nodes = parsed.nodes
    if (not nodes or not isinstance(nodes[0], mwph.nodes.template.Template)
            or str(nodes[0].name).strip() != "#fte:imslppage"):
        logger.info("Cannot find #fte:imslppage node, skipping")
        return {}
    node = nodes[0]

    # One of the parameters in this template is ' *****FILES***** '
    files_param = None
    for param in node.params:
        if param.name == ' *****FILES***** ':
            files_param = param
            break

    if files_param:
        files = files_param.value
        file_node = None
        for node in files.nodes:
            is_file_node = False
            if hasattr(node, 'name') and node.name.strip() == "#fte:imslpfile":
                for fileparam in node.params:
                    if "File Name" in fileparam.name and str(
                            fileparam.value).strip() == filename.strip():
                        file_node = node
                        is_file_node = True
                        # BUG FIX: this break was previously outside the if,
                        # so only the *first* parameter of each imslpfile
                        # template was ever inspected and files listed as
                        # "File Name 2"+ could never be matched.
                        break
            if is_file_node:
                break
        if file_node:
            # Map "File Name n" / "File Description n" / ... params to values
            node_to_dict = {
                str(n.name): str(n.value).strip()
                for n in file_node.params
            }
            # Find which index n our filename is stored under
            chosen_file = [n for n, v in node_to_dict.items() if v == filename]
            file_index = chosen_file[0].replace("File Name ", "")

            license = node_to_dict.get("Copyright")
            title = work_wikitext["title"].replace(" ", "_")
            url = "http://imslp.org/wiki/" + title

            this_file = node_to_dict[f"File Name {file_index}"]
            this_desc = node_to_dict[f"File Description {file_index}"]

            this_file = "File:" + this_file
            # TODO: This isn't a great way of going back and forth between filenames
            permalink = get_permalink_from_filename(
                title, this_file.replace("_", " "))
            file_url = "http://imslp.org/wiki/" + this_file
            file_title = get_page_title(file_url)

            # TODO: Person who published, transcribed work. Date of publication on imslp?
            file_dict = {
                'title': file_title,
                'name': this_file,
                'contributor': 'https://imslp.org',
                'source': url,
                'url': permalink,
                'format_': 'text/html',
                'language': 'en',
                'license': license,
                'description': this_desc,
            }
            return file_dict
    return {}
Ejemplo n.º 10
0
def import_cpdl_work(work_names):
    """Import the CPDL works with the given titles."""
    for work in cpdl.get_wikitext_for_titles(work_names):
        logger.info("Importing CPDL work %s", work['title'])
        import_cpdl_work_wikitext(work)
Ejemplo n.º 11
0
def load_musiccomposition_from_imslp_name(imslp_name, load_files=True):
    """Load a MusicComposition from a single page on IMSLP,
    and also load any musicxml files as MediaObjects and any related PDFs

    Arguments:
        imslp_name: the IMSLP page name of the work
        load_files: if False, import only the composition/composer metadata
    """
    logger.info("Importing imslp work %s", imslp_name)
    work = imslp.api_work(imslp_name)
    musiccomposition = work["work"]
    composer = work["composer"]
    musicbrainz_work_id = work["musicbrainz_work_id"]

    if not composer:
        logger.info(" - No composer??, skipping")
        return

    composition_id = get_or_create_musiccomposition(musiccomposition)

    composer_source = f'https://imslp.org/wiki/{composer.replace(" ", "_")}'
    existing_composer_ceid = get_existing_person_by_source(composer_source)
    if not existing_composer_ceid:
        persons = load_artist_from_imslp(composer)
        create_persons_and_link(persons)
        existing_composer_ceid = get_existing_person_by_source(composer_source)

    # BUG FIX: previously we linked unconditionally, so a composer import
    # failure meant linking [None] as the composer list.
    if existing_composer_ceid:
        link_musiccomposition_and_composers(composition_id,
                                            [existing_composer_ceid])
    else:
        logger.info(" - cannot find composer after importing, not linking")

    if musicbrainz_work_id:
        mb_work = load_musiccomposition_from_musicbrainz(musicbrainz_work_id)
        mb_work_ceid = mb_work["musiccomposition_id"]
        link_musiccomposition_exactmatch([composition_id, mb_work_ceid])

    if not load_files:
        return

    wikitext = imslp.get_wiki_content_for_pages([imslp_name])
    files = imslp.files_for_work(wikitext[0])
    # We expect to see just one xml file, and maybe one pdf
    # TODO, there could be more than one, we need to support this too
    if len(files) == 0:
        # BUG FIX: this used to fall through to the multi-file else branch,
        # which logged a second misleading message and printed an empty list.
        logger.info(" - expected at least one file but got none")
    elif len(files) == 1:
        file = files[0]
        if "XML" not in file["description"]:
            logger.info(
                " - Only got one file but it's not an xml, not sure what to do"
            )
        else:
            xmlmediaobject_ceid = get_or_create_imslp_mediaobject(file)
            link_musiccomposition_and_mediaobject(
                composition_id=composition_id,
                mediaobject_id=xmlmediaobject_ceid)
    else:
        xmlfiles = [f for f in files if "XML" in f["description"]]
        pdffiles = [f for f in files if f["name"].endswith("pdf")]
        if not xmlfiles or not pdffiles:
            logger.info(
                " - expected one xml and some pdfs, but this isn't the case"
            )
            # Was a bare print(); route the diagnostic through the logger
            logger.info("files: %s", files)
        else:
            xmlfile = xmlfiles[0]
            xmlmediaobject_ceid = get_or_create_imslp_mediaobject(xmlfile)
            link_musiccomposition_and_mediaobject(
                composition_id=composition_id,
                mediaobject_id=xmlmediaobject_ceid)

            logger.info(" - got %s pdf files, importing each of them",
                        len(pdffiles))
            for pdffile in pdffiles:
                pdfmediaobject_ceid = get_or_create_imslp_mediaobject(pdffile)
                link_musiccomposition_and_mediaobject(
                    composition_id=composition_id,
                    mediaobject_id=pdfmediaobject_ceid)

                # In IMSLP, a PDF that comes linked with an XML file is a
                # rendering of that file, so the pdf is derived from the score
                # TODO: We should check if this is the case all the time.
                link_mediaobject_was_derived_from(
                    source_id=xmlmediaobject_ceid,
                    derived_id=pdfmediaobject_ceid)
Ejemplo n.º 12
0
def load_artist_from_imslp(url):
    """Load a Person from an IMSLP Category page, plus Persons from every
    external authority the IMSLP page links to (worldcat, viaf,
    wikidata/wikipedia, musicbrainz, isni, loc).

    Arguments:
        url: an imslp "Category:Composer Name" page name or full url

    Returns:
        a list of person dicts, de-duplicated by their 'source' field

    Raises:
        Exception: if url is not a Category: url
    """
    logger.info("Importing imslp artist %s", url)
    if "Category:" not in url:
        raise Exception("Url should be an imslp Category: url")

    # Reduce a full https://imslp.org/wiki/Category:... url to the page name
    if url.startswith("https://imslp.org"):
        url = "/".join(url.split("/")[4:])

    people = []

    imslp_person = imslp.api_composer(url)
    if imslp_person:
        people.append(imslp_person)

    rels = imslp.api_composer_get_relations(url)
    if 'worldcat' in rels:
        worldcat_person = worldcat.load_person_from_worldcat(rels['worldcat'])
        people.append(worldcat_person)
    if 'viaf' in rels:
        viaf_person = viaf.load_person_from_viaf(rels['viaf'])
        people.append(viaf_person)
    if 'wikipedia' in rels:
        wikidata_id = wikidata.get_wikidata_id_from_wikipedia_url(
            rels['wikipedia'])
        if wikidata_id:
            # BUG FIX: this previously read rels['wikidata'], a key that is
            # never present here (the branch is gated on 'wikipedia'), which
            # raised KeyError and ignored the wikidata_id looked up above.
            # Build the wikidata url from that id instead.
            # TODO(review): confirm load_person_from_wikidata_url accepts a
            # concept url of this form.
            wikidata_url = f"https://www.wikidata.org/wiki/{wikidata_id}"
            wd_person = wikidata.load_person_from_wikidata_url(wikidata_url)
            if wd_person:
                people.append(wd_person)
            wp_person = wikidata.load_person_from_wikipedia_wikidata_url(
                wikidata_url, 'en')
            if wp_person:
                people.append(wp_person)
    if 'musicbrainz' in rels:
        mb_person = musicbrainz.load_person_from_musicbrainz(
            rels['musicbrainz'])
        people.append(mb_person)
    if 'isni' in rels:
        isni_person = isni.load_person_from_isni(rels['isni'])
        people.append(isni_person)
    if 'loc' in rels:
        loc_person = loc.load_person_from_loc(rels['loc'])
        people.append(loc_person)

    # If no link to musicbrainz from imslp, do a reverse lookup in musicbrainz to see if it's there
    if 'musicbrainz' not in rels:
        artist_mbid = musicbrainz.get_artist_mbid_by_imslp_url(url)
        # TODO: If the artist exists in MB, then we should also import all of the other
        #  relationships that exist, by using `load_artist_from_musicbrainz`
        if artist_mbid:
            mb_person = musicbrainz.load_person_from_musicbrainz(artist_mbid)
            people.append(mb_person)

    # dedup by source: keep the first person seen for each distinct source
    ret = []
    seen = set()
    for p in people:
        if 'source' in p:
            if p['source'] not in seen:
                ret.append(p)
                seen.add(p['source'])
    return ret