Beispiel #1
0
def query(q):
    """Run the SPARQL SELECT query ``q`` and return what ``select`` returns.

    ``select`` is invoked on a ``SparqlQuery`` instance rather than on the
    class itself: called on the class, ``q`` would be bound as ``self`` and
    the call would be missing its query argument.

    NOTE(review): assumes ``SparqlQuery`` can be constructed without
    arguments, as the other call sites in this file do -- confirm.
    """
    return SparqlQuery().select(q)
Beispiel #2
0
# Client object used for every SPARQL request this script issues.
sparql_query = SparqlQuery()

# Numeric start/end of the property-ID range, read from the first two
# command-line arguments.
fromID, toID = int(sys.argv[1]), int(sys.argv[2])


def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing (for example a list, a
    string or a ``range``). The last slice is shorter when the length is not
    a multiple of ``n``.
    """
    total = len(iterable)
    for start in range(0, total, n):
        # Clamp the upper bound so the final slice never runs past the end.
        stop = min(start + n, total)
        yield iterable[start:stop]


# URI prefixes that the query results carry in front of the bare property ID.
PROP_URI_PREFIX = 'http://www.wikidata.org/prop/'
ENTITY_URI_PREFIX = 'http://www.wikidata.org/entity/'

for chunk in batch(range(fromID, toID), BATCH_SIZE):
    # Bare IDs of the properties whose START_END_QUERY returned more than
    # five rows.
    candidates = set()
    props = ' '.join("p:P" + str(x) for x in chunk)
    # Guard against select() handing back None instead of a list
    # (pywikibot's SparqlQuery does so when the request yields no usable
    # results payload); treat that the same as "no rows".
    items = sparql_query.select(QUERY % props) or []
    if not items:
        # Nothing matched in this chunk, so there are no labels to look up.
        continue
    for item in items:
        results = sparql_query.select(START_END_QUERY % item['p'])
        if results and len(results) > 5:
            candidates.add(item['p'][len(PROP_URI_PREFIX):])
    props = ' '.join("wd:" + x['p'][len(PROP_URI_PREFIX):] for x in items)
    # One line per labelled property; properties that did not make it into
    # ``candidates`` are flagged.
    for res in sparql_query.select(LABELS % props) or []:
        propID = res['p'][len(ENTITY_URI_PREFIX):]
        print("%s %s%s" % (propID, res['pLabel'],
                           "" if propID in candidates else " <-- no data"))
Beispiel #3
0
}"""
# Client object used for every SPARQL request this script issues.
sparql_query = SparqlQuery()

# Numeric start/end of the property-ID range, read from the first two
# command-line arguments.
fromID, toID = int(sys.argv[1]), int(sys.argv[2])

def batch(iterable, n=1):
    """Split ``iterable`` into consecutive slices of at most ``n`` items.

    ``iterable`` must support ``len()`` and slicing (for example a list, a
    string or a ``range``). The slices are yielded lazily, and the last one
    is shorter when the length is not a multiple of ``n``.
    """
    size = len(iterable)
    for offset in range(0, size, n):
        # Clamp the upper bound so the final slice never runs past the end.
        upper = min(offset + n, size)
        yield iterable[offset:upper]

# URI prefixes that the query results carry in front of the bare property ID.
PROP_URI_PREFIX = 'http://www.wikidata.org/prop/'
ENTITY_URI_PREFIX = 'http://www.wikidata.org/entity/'

for chunk in batch(range(fromID, toID), BATCH_SIZE):
    # Bare IDs of the properties whose POINT_QUERY returned more than five
    # rows.
    candidates = set()
    props = ' '.join("p:P" + str(x) for x in chunk)
    # Guard against select() handing back None instead of a list
    # (pywikibot's SparqlQuery does so when the request yields no usable
    # results payload); treat that the same as "no rows".
    items = sparql_query.select(QUERY % props) or []
    if not items:
        # Nothing matched in this chunk, so there are no labels to look up.
        continue
    for item in items:
        results = sparql_query.select(POINT_QUERY % item['p'])
        if results and len(results) > 5:
            candidates.add(item['p'][len(PROP_URI_PREFIX):])
    props = ' '.join("wd:" + x['p'][len(PROP_URI_PREFIX):] for x in items)
    # One line per labelled property; properties that did not make it into
    # ``candidates`` are flagged.
    for res in sparql_query.select(LABELS % props) or []:
        propID = res['p'][len(ENTITY_URI_PREFIX):]
        print("%s %s%s" % (
            propID,
            res['pLabel'],
            "" if propID in candidates else " <-- no data"
        ))
Beispiel #4
0
def sync_edition_olids_by_isbns(dry_run=False, limit=None):
    """
    Cross-link editions that Wikidata and Open Library share via ISBN.

    Runs QUERY against the Wikidata Query Service, looks every returned ISBN
    up on Open Library, then records the Wikidata QID on each matching Open
    Library edition and the Open Library ID (P648) on the Wikidata item.

    :param dry_run: when true, nothing is saved to Open Library and no claim
        is added on Wikidata; logging and the modification counters behave
        exactly as in a real run.
    :param limit: stop once either modification counter has reached this
        value. The check runs after a Wikidata item has been fully
        processed, so the counters may end up above the limit. A falsy
        value disables the check.
    """
    site = pywikibot.Site("wikidata", "wikidata")
    wd_repo = site.data_repository()
    wdqs = SparqlQuery()  # Wikidata Query Service

    ol = OpenLibrary()

    # Append the current date to the query to avoid getting cached results.
    timestamped_query = QUERY + f"\n # {datetime.datetime.now()}"
    rows = wdqs.select(timestamped_query)

    # Group by key (sparql hits timeouts when we do the grouping there)
    qid_to_isbns = {}
    for row in rows:
        qid = row['item'].rsplit('/', 1)[-1]
        qid_to_isbns.setdefault(qid, []).append(normalize_isbn(row['isbn']))

    logger.info("Found %d editions to update", len(qid_to_isbns))
    ol_books_modified = 0
    wd_items_modified = 0
    for qid, isbns in qid_to_isbns.items():
        logger.debug("Processing %s", qid)

        # Warn when an item carries more than one ISBN of the same length.
        for isbn_len in (10, 13):
            count = sum(1 for isbn in isbns if len(isbn) == isbn_len)
            if count > 1:
                logger.warning("%s has multiple isbn%ss (%d)", qid, isbn_len,
                               count)

        # Look every ISBN up, drop misses, then collapse repeated editions.
        fetched = [ol.Edition.get(isbn=isbn) for isbn in isbns]
        found = [ed for ed in fetched if ed and ed.olid != 'None']
        ol_books = remove_dupes(found, lambda ed: ed.olid)

        isbn_list = ', '.join(isbns)
        logger.info("Found %d Open Library book(s) for %s (isbns %s)",
                    len(ol_books), qid, isbn_list)
        if len(ol_books) > 1:
            logger.warning(
                "Multiple (%d) Open Library books for %s (isbns %s)",
                len(ol_books), qid, isbn_list)

        # Open Library side: record the QID on each edition lacking it.
        for book in ol_books:
            if 'wikidata' not in book.identifiers:
                book.identifiers['wikidata'] = []
            known_qids = book.identifiers['wikidata']

            if qid in known_qids:
                logger.warning("%s already has qid %s", book.olid, qid)
                continue

            known_qids.append(qid)
            if len(known_qids) > 1:
                logger.warning("%s now has multiple (%d) qids (%s)", book.olid,
                               len(known_qids), ', '.join(known_qids))
            if not dry_run:
                book.save("[sync_edition_olids] add wikidata identifier")
            logger.debug("Added %s to %s", qid, book.olid)
            ol_books_modified += 1

        # Wikidata side: add one P648 claim per matching edition.
        for book in ol_books:
            item = pywikibot.ItemPage(wd_repo, qid)
            claim = make_str_claim(wd_repo, 'P648', book.olid)
            if not dry_run:
                item.addClaim(claim, bot=True)
            logger.debug("Added %s to %s", book.olid, qid)
            wd_items_modified += 1

        if not limit:
            continue

        ol_limit_hit = ol_books_modified >= limit
        wd_limit_hit = wd_items_modified >= limit
        if not (ol_limit_hit or wd_limit_hit):
            continue

        if ol_limit_hit and wd_limit_hit:
            logger.info(
                "Hit limit of %s on both Open Library and Wikidata; Stopping.",
                limit)
        elif ol_limit_hit:
            logger.info("Hit limit of %s on Open Library; Stopping.",
                        limit)
        else:
            logger.info("Hit limit of %s on Wikidata; Stopping.", limit)
        break

    logger.info("Updated %d Open Library books and %d Wikidata items",
                ol_books_modified, wd_items_modified)
Beispiel #5
0
repo = site.data_repository()
sparql_query = SparqlQuery()
# Selects up to 100 (?p, ?title) pairs where ?p is an instance of Q3305213,
# ?title comes from P1705 or P1476 in a language other than "und", and ?p has
# no rdfs:label in that language yet.
TITLES_QUERY = """
SELECT ?p ?title WHERE {
  ?p wdt:P31 wd:Q3305213 .
  ?p (wdt:P1705|wdt:P1476) ?title .
  BIND(lang(?title) as ?lt)
  FILTER(?lt != "und")
  FILTER NOT EXISTS {
    ?p rdfs:label ?pl .
    FILTER(lang(?pl) = ?lt)
  }
} LIMIT 100
"""

# full_data=True because the loop below reads .language / .value / .getID()
# from the returned values. Guard against select() handing back None instead
# of a list (pywikibot's SparqlQuery does so when the request yields no
# usable results payload); treat that the same as "no rows".
results = sparql_query.select(TITLES_QUERY, full_data=True) or []
for result in results:
    lang = result['title'].language
    title = result['title'].value
    if "-" in lang:
        print("Skipping hyphenaten language %s for now" % lang)
        continue
    # Resolve the item ID once instead of on every use below.
    qid = result['p'].getID()
    item = pywikibot.ItemPage(repo, qid)
    item.get()
    # Re-check against the freshly loaded item before writing anything.
    if lang in item.labels:
        print("%s already has label %s" % (qid, lang))
        continue
    print("Adding %s for %s:%s" % (title, qid, lang))
    item.labels[lang] = title
    item.editLabels(item.labels, summary="Set label from title")