Beispiel #1
0
def determine_type_from_page(page):
    """Collect entity-type hints ("person" / "group") for a wiki page.

    Evidence is taken from the language-specific "background" infobox field,
    from the presence of a Persondata name, and from the page's categories.

    Returns a ``(types, reason)`` tuple: a set of type names and the
    individual evidence sentences joined with single spaces.
    """
    lang = page.lang
    field_name = infobox_fields["background"][lang]
    value = page.infobox.get(field_name, "")

    def infobox_evidence():
        # Built lazily so the concatenation only happens when a branch needs it.
        return 'Infobox has "' + field_name + " = " + value + '".'

    kinds = set()
    evidence = []

    if value in ("solo_singer", "vocal", "instrumentiste"):
        kinds.add("person")
        evidence.append(infobox_evidence())
    if page.persondata.get("name"):
        kinds.add("person")
        evidence.append('Contains the "Persondata" infobox.')
    if value in ("group_or_band", "groupe"):
        kinds.add("group")
        evidence.append(infobox_evidence())

    # French pages use a prefix test; every other language uses the
    # English-style suffix/prefix test.
    if lang == "fr":
        def names_a_group(category):
            return category.startswith("Groupe")
    else:
        def names_a_group(category):
            return category.endswith("groups") or category.startswith("Musical groups")

    group_categories = [c for c in page.categories if names_a_group(c)]
    if group_categories:
        kinds.add("group")
        evidence.append("Belongs to %s." % join_names("category", group_categories))

    return kinds, " ".join(evidence)
Beispiel #2
0
def determine_type_from_page(page):
    """Collect entity-type hints ('person' / 'group') for a wiki page.

    Looks at the language-specific 'background' infobox field, at whether the
    Persondata block carries a name, and at the page's categories.

    Returns a ``(types, reason)`` tuple: a set of type names and the
    evidence sentences joined with single spaces.
    """
    person_backgrounds = ('solo_singer', 'vocal', 'instrumentiste')
    group_backgrounds = ('group_or_band', 'groupe')

    lang = page.lang
    field_name = infobox_fields['background'][lang]
    value = page.infobox.get(field_name, '')

    def infobox_evidence():
        # Only evaluated inside a matching branch.
        return 'Infobox has "' + field_name + ' = ' + value + '".'

    kinds = set()
    evidence = []

    if value in person_backgrounds:
        kinds.add('person')
        evidence.append(infobox_evidence())
    if page.persondata.get('name'):
        kinds.add('person')
        evidence.append('Contains the "Persondata" infobox.')
    if value in group_backgrounds:
        kinds.add('group')
        evidence.append(infobox_evidence())

    group_categories = []
    for label in page.categories:
        if lang == 'fr':
            is_group = label.startswith('Groupe')
        else:
            is_group = label.endswith('groups') or label.startswith('Musical groups')
        if is_group:
            group_categories.append(label)

    if group_categories:
        kinds.add('group')
        evidence.append('Belongs to %s.' % join_names('category', group_categories))

    return kinds, ' '.join(evidence)
Beispiel #3
0
def determine_country_from_text(page):
    """Look for countries linked from the page abstract.

    Returns a ``(countries, reason)`` tuple: the set handed to
    ``find_countries_in_text`` and a sentence naming the relevant links.
    """
    found = set()
    links = []
    # Both collections are passed in empty and read back afterwards, so the
    # helper is expected to fill them in place.
    find_countries_in_text(found, links, page.abstract, page.lang)
    return found, 'The first paragraph links to %s.' % join_names('', links)
def determine_country_from_infobox(infobox):
    """Look for countries linked from the origin/birth fields of an infobox.

    ``infobox`` is queried with ``.get(field, '')`` for each of the fields
    'origin', 'born' and 'birth_place', in that order.

    Returns a ``(countries, reason)`` tuple.
    """
    found = set()
    links = []
    for key in ('origin', 'born', 'birth_place'):
        # Missing fields are scanned as the empty string.
        find_countries_in_text(found, links, infobox.get(key, ''))
    return found, 'Infobox links to %s.' % join_names('', links)
Beispiel #5
0
def determine_country_from_infobox(page):
    """Look for countries linked from the page's country-related infobox fields.

    The field names come from ``infobox_fields['country']`` for the page's
    language; each one is decoded from UTF-8 before the infobox lookup.

    Returns a ``(countries, reason)`` tuple.
    """
    found = set()
    links = []
    for raw_key in infobox_fields['country'][page.lang]:
        # Missing fields are scanned as the empty string.
        value = page.infobox.get(raw_key.decode('utf8'), '')
        find_countries_in_text(found, links, value, page.lang)
    return found, 'Infobox links to %s.' % join_names('', links)
def determine_gender_from_categories(categories):
    """Derive gender hints from category names.

    A category containing the whole word "male" or "female"
    (case-insensitive) contributes that gender.

    Returns a ``(genders, reason)`` tuple.
    """
    # Applied in this order to every category; a category that matches both
    # rules is recorded once per match.
    rules = (
        ('male', re.compile(r'\bmale\b', re.I)),
        ('female', re.compile(r'\bfemale\b', re.I)),
    )
    found = set()
    matched = []
    for label in categories:
        for gender, pattern in rules:
            if pattern.search(label):
                found.add(gender)
                matched.append(label)
    return found, 'Belongs to %s.' % join_names('category', matched)
def determine_country_from_categories(categories):
    """Derive country codes from category names.

    Underscores in each category are replaced by spaces. A category that
    starts with a key of ``category_countries`` followed by a space yields
    the mapped code; a category ending in "from <name>" for a name in
    ``link_us_states`` yields 'US'.

    Returns ``(countries, reason, match_count)``. The count has one entry
    per individual match, so a category can be counted more than once.
    """
    found = set()
    matched = []
    for raw in categories:
        label = raw.replace('_', ' ')

        codes = [code for prefix, code in category_countries.iteritems()
                 if label.startswith(prefix + ' ')]
        found.update(codes)
        # One entry per matching prefix, mirroring the per-match bookkeeping.
        matched.extend([label] * len(codes))

        state_hits = sum(1 for state in link_us_states
                         if label.endswith('from ' + state))
        if state_hits:
            found.add('US')
            matched.extend([label] * state_hits)

    reason = 'Belongs to %s.' % join_names('category', matched)
    return found, reason, len(matched)
Beispiel #8
0
def determine_country_from_categories(page):
    """Derive country codes from the page's category names.

    Underscores in each category are replaced by spaces. A category that
    contains a (UTF-8 decoded) key of ``demonyms[page.lang]`` yields the
    mapped code; a category ending in "from <name>" for a name in
    ``wp_us_states_links`` yields 'US'.

    Returns ``(countries, reason, match_count)``. The count has one entry
    per individual match, so a category can be counted more than once.
    """
    found = set()
    matched = []
    for raw in page.categories:
        label = raw.replace('_', ' ')

        # The demonym table is looked up per category, so a page without
        # categories never touches it.
        codes = [code for demonym, code in demonyms[page.lang].iteritems()
                 if demonym.decode('utf8') in label]
        found.update(codes)
        matched.extend([label] * len(codes))

        state_hits = sum(1 for state in wp_us_states_links
                         if label.endswith('from ' + state))
        if state_hits:
            found.add('US')
            matched.extend([label] * state_hits)

    reason = 'Belongs to %s.' % join_names('category', matched)
    return found, reason, len(matched)
Beispiel #9
0
def determine_country_from_categories(page):
    """Derive country codes from the page's category names.

    Each category has underscores turned into spaces. It yields a code when
    it contains a (UTF-8 decoded) key of ``demonyms[page.lang]``, and "US"
    when it ends in "from <name>" for a name in ``wp_us_states_links``.

    Returns ``(countries, reason, match_count)``; the count reflects every
    individual match, so one category may contribute several entries.
    """
    found = set()
    matched = []

    def record(code, label):
        found.add(code)
        matched.append(label)

    for raw in page.categories:
        label = raw.replace("_", " ")
        # Looked up per category so pages without categories never touch
        # the demonym table.
        for demonym, code in demonyms[page.lang].iteritems():
            if demonym.decode("utf8") in label:
                record(code, label)
        for state in wp_us_states_links:
            if label.endswith("from " + state):
                record("US", label)

    return found, "Belongs to %s." % join_names("category", matched), len(matched)
Beispiel #10
0
def determine_gender_from_categories(page):
    """Derive gender hints from the page's category names.

    Two English rules look for the whole words "male" / "female" anywhere in
    the category; two French rules look for a gendered occupation word at
    the start of the category. All matching is case-insensitive.

    Returns a ``(genders, reason)`` tuple.
    """
    # Applied in this order to every category; each match records the
    # category again, so the list may contain repeats.
    rules = (
        ("male", re.compile(r"\bmale\b", re.I)),
        ("female", re.compile(r"\bfemale\b", re.I)),
        ("male", re.compile(r"^(Chanteur|Acteur|Animateur)\b", re.I)),
        ("female", re.compile(r"^(Chanteuse|Actrice|Animatrice)\b", re.I)),
    )
    found = set()
    matched = []
    for label in page.categories:
        for gender, pattern in rules:
            if pattern.search(label):
                found.add(gender)
                matched.append(label)
    return found, "Belongs to %s." % join_names("category", matched)
Beispiel #11
0
def determine_gender_from_categories(page):
    """Derive gender hints from the page's category names.

    English rules match the whole words 'male' / 'female' anywhere in the
    category; French rules match a gendered occupation word at the start of
    the category. All matching is case-insensitive.

    Returns a ``(genders, reason)`` tuple.
    """
    # Order matters for the order of the recorded categories: English male,
    # English female, French male, French female.
    rules = [
        (r'\bmale\b', 'male'),
        (r'\bfemale\b', 'female'),
        (r'^(Chanteur|Acteur|Animateur)\b', 'male'),
        (r'^(Chanteuse|Actrice|Animatrice)\b', 'female'),
    ]
    found = set()
    matched = []
    for label in page.categories:
        hits = [gender for regex, gender in rules
                if re.search(regex, label, re.I)]
        found.update(hits)
        # One entry per rule that fired, so repeats are possible.
        matched.extend([label] * len(hits))
    return found, 'Belongs to %s.' % join_names('category', matched)
def determine_type_from_page(page):
    """Collect entity-type hints ('person' / 'group') for a wiki page.

    Evidence comes from the infobox 'background' value, from whether the
    Persondata block carries a name, and from categories that end in
    'groups' or start with 'Musical groups'.

    Returns a ``(types, reason)`` tuple: a set of type names and the
    evidence sentences joined with single spaces.
    """
    kinds = set()
    evidence = []

    def note(kind, sentence):
        kinds.add(kind)
        evidence.append(sentence)

    value = page.infobox.get('background', '')
    if value == 'solo_singer':
        note('person', 'Infobox has "background = solo_singer".')
    if page.persondata.get('name'):
        note('person', 'Contains the "Persondata" infobox.')
    if value == 'group_or_band':
        note('group', 'Infobox has "background = group_or_band".')

    group_categories = [
        label for label in page.categories
        if label.endswith('groups') or label.startswith('Musical groups')
    ]
    if group_categories:
        # A single sentence covers all matching categories.
        note('group', 'Belongs to %s.' % join_names('category', group_categories))

    return kinds, ' '.join(evidence)
        if "disambiguationpages" in page:
            print " * disambiguation or album page, skipping"
            continue
        if "recordlabels" not in page:
            print " * not a record label page, skipping"
            continue
        page_title = pages[0]["title"]
        print ' * trying article "%s"' % (page_title,)
        artists = set([r[0] for r in db.execute(query_label_artists, (id,))])
        if name in artists:
            artists.remove(name)
        if not artists:
            continue
        found_artists = []
        for artist in artists:
            mangled_artist = mangle_name(artist)
            if len(mangled_artist) > 5 and mangled_artist in page:
                found_artists.append(artist)
        ratio = len(found_artists) * 1.0 / len(artists)
        print " * ratio: %s, has artists: %s, found artists: %s" % (ratio, len(artists), len(found_artists))
        if len(found_artists) < 2:
            continue
        url = "https://en.wikipedia.org/wiki/%s" % (quote_page_title(page_title),)
        text = "Matched based on the name. The page mentions %s." % (join_names("artist", found_artists),)
        print " * linking to %s" % (url,)
        print " * edit note: %s" % (text,)
        time.sleep(60)
        mb.add_url("label", gid, 216, url, text)
        break
    db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid,))
        # Examine albums
        found_albums = []
        albums = set([r[0] for r in db.execute(query_artist_albums, (artist['id'],) * 2)])
        albums_to_ignore = set()
        for album in albums:
            if mangle_name(artist['name']) in mangle_name(album):
                albums_to_ignore.add(album)
        albums -= albums_to_ignore
        if not albums:
            continue
        for album in albums:
            mangled_album = mangle_name(album)
            if len(mangled_album) > 6 and mangled_album in page:
                found_albums.append(album)
        if (found_albums):
            reasons.append(join_names('album', found_albums))
            out(' * has albums: %s, found albums: %s' % (len(albums), len(found_albums)))

        # Examine works
        found_works = []
        page = mangle_name(page_orig)
        works = set([r[0] for r in db.execute(query_artist_works, (artist['id'],) * 2)])
        for work in works:
            mangled_work = mangle_name(work)
            if mangled_work in page:
                found_works.append(work)
        if (found_works):
            reasons.append(join_names('work', found_works))
            out(' * has works: %s, found works: %s' % (len(works), len(found_works)))

        # Examine urls
Beispiel #15
0
        albums = set([
            r[0] for r in db.execute(query_artist_albums, (artist['id'], ) * 2)
        ])
        albums_to_ignore = set()
        for album in albums:
            if mangle_name(artist['name']) in mangle_name(album):
                albums_to_ignore.add(album)
        albums -= albums_to_ignore
        if not albums:
            continue
        for album in albums:
            mangled_album = mangle_name(album)
            if len(mangled_album) > 6 and mangled_album in page:
                found_albums.append(album)
        if (found_albums):
            reasons.append(join_names('album', found_albums))
            out(' * has albums: %s, found albums: %s' %
                (len(albums), len(found_albums)))

        # Examine works
        found_works = []
        page = mangle_name(page_orig)
        works = set([
            r[0] for r in db.execute(query_artist_works, (artist['id'], ) * 2)
        ])
        for work in works:
            mangled_work = mangle_name(work)
            if mangled_work in page:
                found_works.append(work)
        if (found_works):
            reasons.append(join_names('work', found_works))
            print ' * disambiguation or album page, skipping'
            continue
        if 'recordlabels' not in page:
            print ' * not a record label page, skipping'
            continue
        page_title = pages[0]['title']
        print ' * trying article "%s"' % (page_title,)
        artists = set([r[0] for r in db.execute(query_label_artists, (id,))])
        if name in artists:
            artists.remove(name)
        if not artists:
            continue
        found_artists = []
        for artist in artists:
            mangled_artist = mangle_name(artist)
            if len(mangled_artist) > 5 and mangled_artist in page:
                found_artists.append(artist)
        ratio = len(found_artists) * 1.0 / len(artists)
        print ' * ratio: %s, has artists: %s, found artists: %s' % (ratio, len(artists), len(found_artists))
        if len(found_artists) < 2:
            continue
        url = 'http://en.wikipedia.org/wiki/%s' % (urllib.quote(page_title.encode('utf8').replace(' ', '_')),)
        text = 'Matched based on the name. The page mentions %s.' % (join_names('artist', found_artists),)
        print ' * linking to %s' % (url,)
        print ' * edit note: %s' % (text,)
        time.sleep(60)
        mb.add_url("label", gid, 216, url, text)
        break
    db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid,))

Beispiel #17
0
        if 'recordlabels' not in page:
            print ' * not a record label page, skipping'
            continue
        page_title = pages[0]['title']
        print ' * trying article "%s"' % (page_title, )
        artists = set([r[0] for r in db.execute(query_label_artists, (id, ))])
        if name in artists:
            artists.remove(name)
        if not artists:
            continue
        found_artists = []
        for artist in artists:
            mangled_artist = mangle_name(artist)
            if len(mangled_artist) > 5 and mangled_artist in page:
                found_artists.append(artist)
        ratio = len(found_artists) * 1.0 / len(artists)
        print ' * ratio: %s, has artists: %s, found artists: %s' % (
            ratio, len(artists), len(found_artists))
        if len(found_artists) < 2:
            continue
        url = 'https://en.wikipedia.org/wiki/%s' % (
            quote_page_title(page_title), )
        text = 'Matched based on the name. The page mentions %s.' % (
            join_names('artist', found_artists), )
        print ' * linking to %s' % (url, )
        print ' * edit note: %s' % (text, )
        time.sleep(60)
        mb.add_url("label", gid, 216, url, text)
        break
    db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid, ))
        page_title = pages[0]["title"]
        found_albums = []
        albums = set([r[0] for r in db.execute(query_artist_albums, (id, id))])
        albums_to_ignore = set()
        for album in albums:
            if mangle_name(name) in mangle_name(album):
                albums_to_ignore.add(album)
        albums -= albums_to_ignore
        if not albums:
            continue
        for album in albums:
            mangled_album = mangle_name(album)
            if len(mangled_album) > 4 and mangled_album in page:
                found_albums.append(album)
        ratio = len(found_albums) * 1.0 / len(albums)
        print " * ratio: %s, has albums: %s, found albums: %s" % (ratio, len(albums), len(found_albums))
        min_ratio = 0.2
        if len(found_albums) < 2:
            continue
        if ratio < min_ratio:
            continue
        url = "http://ja.wikipedia.org/wiki/%s" % (quote_page_title(page_title),)
        text = "Matched based on the name. The page mentions %s." % (join_names("album", found_albums),)
        print " * linking to %s" % (url,)
        print " * edit note: %s" % (text,)
        mb.add_url("artist", gid, 179, url, text)
        break
    db.execute("INSERT INTO bot_wp_artist_ja (gid) VALUES (%s)", (gid,))

print processed, skipped
        tracks = set([r[0] for r in db.execute(query_album_tracks, (rg_id,))])
        tracks_to_ignore = set()
        for track in tracks:
            mangled_track = mangle_name(track)
            if len(mangled_track) <= 4 or mangle_name(rg_name) in mangle_name(track):
                tracks_to_ignore.add(track)
        tracks -= tracks_to_ignore
        if len(tracks) < 5:
            continue
        for track in tracks:
            mangled_track = mangle_name(track)
            if len(mangled_track) > 4 and mangled_track in page:
                found_tracks.append(track)
        ratio = len(found_tracks) * 1.0 / len(tracks)
        out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks)))
        min_ratio = 0.7 if len(rg_name) > 4 else 1.0
        if ratio < min_ratio:
            colored_out(bcolors.WARNING, '  => ratio too low (min = %s)' % min_ratio)
            continue
        auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types))
        text = 'Matched based on the name. The page mentions artist "%s" and %s.' % (ac_name, join_names('track', found_tracks),)
        colored_out(bcolors.OKGREEN, ' * linking to %s' % (url,))
        out(' * edit note: %s' % (text,))
        time.sleep(5)
        mb.add_url("release_group", rg_gid, 89, url, text, auto=auto)
        break
    if processed is None:
        db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang))
    else:
        db.execute("UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
            mangled_track = mangle_name(track)
            if len(mangled_track) > 4 and mangled_track in page:
                found_tracks.append(track)
        ratio = len(found_tracks) * 1.0 / len(tracks)
        out(' * ratio: %s, has tracks: %s, found tracks: %s' %
            (ratio, len(tracks), len(found_tracks)))
        min_ratio = 0.7 if len(rg_name) > 4 else 1.0
        if ratio < min_ratio:
            colored_out(bcolors.WARNING,
                        '  => ratio too low (min = %s)' % min_ratio)
            continue
        auto = ratio > 0.75 and (rg_sec_types is None or
                                 ('Compilation' not in rg_sec_types
                                  and 'Soundtrack' not in rg_sec_types))
        text = 'Matched based on the name. The page mentions artist "%s" and %s.' % (
            ac_name,
            join_names('track', found_tracks),
        )
        colored_out(bcolors.OKGREEN, ' * linking to %s' % (url, ))
        out(' * edit note: %s' % (text, ))
        time.sleep(5)
        mb.add_url("release_group", rg_gid, 89, url, text, auto=auto)
        break
    if processed is None:
        db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)",
                   (rg_gid, wp_lang))
    else:
        db.execute(
            "UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)",
            (rg_gid, wp_lang))
        found_albums = []
        albums = set([r[0] for r in db.execute(query_artist_albums, (id, id))])
        albums_to_ignore = set()
        for album in albums:
            if mangle_name(name) in mangle_name(album):
                albums_to_ignore.add(album)
        albums -= albums_to_ignore
        if not albums:
            continue
        for album in albums:
            mangled_album = mangle_name(album)
            if len(mangled_album) > 4 and mangled_album in page:
                found_albums.append(album)
        ratio = len(found_albums) * 1.0 / len(albums)
        print ' * ratio: %s, has albums: %s, found albums: %s' % (ratio, len(albums), len(found_albums))
        min_ratio = 0.2
        if len(found_albums) < 2:
            continue
        #if ratio < min_ratio:
        #    continue
        url = 'http://ko.wikipedia.org/wiki/%s' % (quote_page_title(page_title),)
        text = 'Matched based on the name. The page mentions %s.' % (join_names('album', found_albums),)
        print ' * linking to %s' % (url,)
        print ' * edit note: %s' % (text,)
        mb.add_url("artist", gid, 179, url, text)
        break
    db.execute("INSERT INTO bot_wp_artist_ko (gid) VALUES (%s)", (gid,))

print processed, skipped

Beispiel #22
0
            print ' * disambiguation or album page, skipping'
            continue
        if 'recordlabels' not in page:
            print ' * not a record label page, skipping'
            continue
        page_title = pages[0]['title']
        print ' * trying article "%s"' % (page_title,)
        artists = set([r[0] for r in db.execute(query_label_artists, (id,))])
        if name in artists:
            artists.remove(name)
        if not artists:
            continue
        found_artists = []
        for artist in artists:
            mangled_artist = mangle_name(artist)
            if len(mangled_artist) > 5 and mangled_artist in page:
                found_artists.append(artist)
        ratio = len(found_artists) * 1.0 / len(artists)
        print ' * ratio: %s, has artists: %s, found artists: %s' % (ratio, len(artists), len(found_artists))
        if len(found_artists) < 2:
            continue
        url = 'http://en.wikipedia.org/wiki/%s' % (quote_page_title(page_title),)
        text = 'Matched based on the name. The page mentions %s.' % (join_names('artist', found_artists),)
        print ' * linking to %s' % (url,)
        print ' * edit note: %s' % (text,)
        time.sleep(60)
        mb.add_url("label", gid, 216, url, text)
        break
    db.execute("INSERT INTO bot_wp_label (gid) VALUES (%s)", (gid,))

        albums_to_ignore = set()
        for album in albums:
            if mangle_name(name) in mangle_name(album):
                albums_to_ignore.add(album)
        albums -= albums_to_ignore
        if not albums:
            continue
        for album in albums:
            mangled_album = mangle_name(album)
            if len(mangled_album) > 4 and mangled_album in page:
                found_albums.append(album)
        ratio = len(found_albums) * 1.0 / len(albums)
        print ' * ratio: %s, has albums: %s, found albums: %s' % (
            ratio, len(albums), len(found_albums))
        min_ratio = 0.2
        if len(found_albums) < 2:
            continue
        #if ratio < min_ratio:
        #    continue
        url = 'https://ko.wikipedia.org/wiki/%s' % (
            quote_page_title(page_title), )
        text = 'Matched based on the name. The page mentions %s.' % (
            join_names('album', found_albums), )
        print ' * linking to %s' % (url, )
        print ' * edit note: %s' % (text, )
        mb.add_url("artist", gid, 179, url, text)
        break
    db.execute("INSERT INTO bot_wp_artist_ko (gid) VALUES (%s)", (gid, ))

print processed, skipped
Beispiel #24
0
        tracks = set([r[0] for r in db.execute(query_album_tracks, (rg_id,))])
        tracks_to_ignore = set()
        for track in tracks:
            mangled_track = mangle_name(track)
            if len(mangled_track) <= 4 or mangle_name(rg_name) in mangle_name(track):
                tracks_to_ignore.add(track)
        tracks -= tracks_to_ignore
        if len(tracks) < 5:
            continue
        for track in tracks:
            mangled_track = mangle_name(track)
            if len(mangled_track) > 4 and mangled_track in page:
                found_tracks.append(track)
        ratio = len(found_tracks) * 1.0 / len(tracks)
        out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks)))
        min_ratio = 0.7 if len(rg_name) > 4 else 1.0
        if ratio < min_ratio:
            colored_out(bcolors.WARNING, '  => ratio too low (min = %s)' % min_ratio)
            continue
        auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types))
        text = 'Matched based on the name. The page mentions artist "%s" and %s.' % (ac_name, join_names('track', found_tracks),)
        colored_out(bcolors.OKGREEN, ' * linking to %s' % (url,))
        out(' * edit note: %s' % (text,))
        time.sleep(5)
        mb.add_url("release_group", rg_gid, 89, url, text, auto=auto)
        break
    if processed is None:
        db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang))
    else:
        db.execute("UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
        found_albums = []
        albums = set([r[0] for r in db.execute(query_artist_albums, (id, id))])
        albums_to_ignore = set()
        for album in albums:
            if mangle_name(name) in mangle_name(album):
                albums_to_ignore.add(album)
        albums -= albums_to_ignore
        if not albums:
            continue
        for album in albums:
            mangled_album = mangle_name(album)
            if len(mangled_album) > 4 and mangled_album in page:
                found_albums.append(album)
        ratio = len(found_albums) * 1.0 / len(albums)
        print ' * ratio: %s, has albums: %s, found albums: %s' % (ratio, len(albums), len(found_albums))
        min_ratio = 0.2
        if len(found_albums) < 2:
            continue
        #if ratio < min_ratio:
        #    continue
        url = 'http://ko.wikipedia.org/wiki/%s' % (quote_page_title(page_title),)
        text = 'Matched based on the name. The page mentions %s.' % (join_names('album', found_albums),)
        print ' * linking to %s' % (url,)
        print ' * edit note: %s' % (text,)
        mb.add_url("artist", gid, 179, url, text)
        break
    db.execute("INSERT INTO bot_wp_artist_ko (gid) VALUES (%s)", (gid,))

print processed, skipped

Beispiel #26
0
            colored_out(bcolors.WARNING,
                        '  => ratio too low (min = %s)' % min_ratio)
            continue
        auto = ratio > 0.75 and (rg_sec_types is None or
                                 ('Compilation' not in rg_sec_types
                                  and 'Soundtrack' not in rg_sec_types))

        wp_url = 'https://%s.wikipedia.org/wiki/%s' % (
            wp_lang,
            quote_page_title(page_title),
        )
        wd_url = 'https://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper(
        )
        text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions artist "%s" and %s.' % (
            wp_url,
            ac_name,
            join_names('track', found_tracks),
        )
        colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url, ))
        out(' * edit note: %s' % (text, ))
        time.sleep(5)
        mb.add_url("release_group", rg_gid, 353, wd_url, text, auto=auto)
        break
    if processed is None:
        db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)",
                   (rg_gid, wp_lang))
    else:
        db.execute(
            "UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)",
            (rg_gid, wp_lang))
Beispiel #27
0
            mangled_track = mangle_name(track)
            if len(mangled_track) <= 4 or mangle_name(rg_name) in mangle_name(track):
                tracks_to_ignore.add(track)
        tracks -= tracks_to_ignore
        if len(tracks) < 5:
            continue
        for track in tracks:
            mangled_track = mangle_name(track)
            if len(mangled_track) > 4 and mangled_track in page:
                found_tracks.append(track)
        ratio = len(found_tracks) * 1.0 / len(tracks)
        out(' * ratio: %s, has tracks: %s, found tracks: %s' % (ratio, len(tracks), len(found_tracks)))
        min_ratio = 0.7 if len(rg_name) > 4 else 1.0
        if ratio < min_ratio:
            colored_out(bcolors.WARNING, '  => ratio too low (min = %s)' % min_ratio)
            continue
        auto = ratio > 0.75 and (rg_sec_types is None or ('Compilation' not in rg_sec_types and 'Soundtrack' not in rg_sec_types))

        wp_url = 'https://%s.wikipedia.org/wiki/%s' % (wp_lang, quote_page_title(page_title),)
        wd_url = 'https://www.wikidata.org/wiki/%s' % wikipage.wikidata_id.upper()
        text = 'Wikidata identifier found from matching Wikipedia page %s. The page mentions artist "%s" and %s.' % (wp_url, ac_name, join_names('track', found_tracks),)
        colored_out(bcolors.OKGREEN, ' * linking to %s' % (wd_url,))
        out(' * edit note: %s' % (text,))
        time.sleep(5)
        mb.add_url("release_group", rg_gid, 353, wd_url, text, auto=auto)
        break
    if processed is None:
        db.execute("INSERT INTO bot_wp_rg_link (gid, lang) VALUES (%s, %s)", (rg_gid, wp_lang))
    else:
        db.execute("UPDATE bot_wp_rg_link SET processed = now() WHERE (gid, lang) = (%s, %s)", (rg_gid, wp_lang))
def determine_country_from_text(page):
    """Look for countries linked from the page abstract.

    Returns a ``(countries, reason)`` tuple: the set handed to
    ``find_countries_in_text`` and a sentence naming the relevant links.
    """
    found = set()
    links = []
    # Both collections are passed in empty and read back afterwards, so the
    # helper is expected to fill them in place.
    find_countries_in_text(found, links, page.abstract)
    return found, 'The first paragraph links to %s.' % join_names('', links)