def query(q):
    """Execute the SPARQL query string *q* and return its result rows.

    NOTE(review): `select` is invoked on the `SparqlQuery` class itself,
    not on an instance — presumably intentional; confirm against the
    pywikibot.data.sparql API.
    """
    return SparqlQuery.select(q)
sparql_query = SparqlQuery()

# Property-ID range to scan, taken from the command line.
fromID = int(sys.argv[1])
toID = int(sys.argv[2])


def batch(iterable, n=1):
    """Yield successive slices of *iterable* of length at most *n*."""
    length = len(iterable)
    for start in range(0, length, n):
        yield iterable[start:min(start + n, length)]


# Scan properties P<fromID>..P<toID> in batches: for each batch, query which
# properties have data, then keep those with more than five START_END_QUERY
# results as "candidates", and finally print every property's label, marking
# the ones that had no data.
for chunk in batch(range(fromID, toID), BATCH_SIZE):
    candidates = set()
    props = ' '.join("p:P" + str(x) for x in chunk)
    sparql = QUERY % props
    items = sparql_query.select(sparql)
    for item in items:
        results = sparql_query.select(START_END_QUERY % item['p'])
        if len(results) > 5:
            candidates.add(item['p'][len('http://www.wikidata.org/prop/'):])
    props = ' '.join(
        "wd:" + x['p'][len('http://www.wikidata.org/prop/'):] for x in items)
    results = sparql_query.select(LABELS % props)
    for res in results:
        propID = res['p'][len('http://www.wikidata.org/entity/'):]
        suffix = "" if propID in candidates else " <-- no data"
        print("%s %s%s" % (propID, res['pLabel'], suffix))
}""" sparql_query = SparqlQuery() fromID=int(sys.argv[1]) toID=int(sys.argv[2]) def batch(iterable, n=1): l = len(iterable) for ndx in range(0, l, n): yield iterable[ndx:min(ndx + n, l)] for chunk in batch(range(fromID, toID), BATCH_SIZE): candidates = set() props = ' '.join(["p:P" + str(x) for x in chunk]) sparql = QUERY % props # print(sparql) items = sparql_query.select(sparql) for item in items: results = sparql_query.select(POINT_QUERY % item['p']) if len(results) > 5: candidates.add(item['p'][len('http://www.wikidata.org/prop/'):]) props = ' '.join(["wd:" + x['p'][len('http://www.wikidata.org/prop/'):] for x in items]) results = sparql_query.select(LABELS % props) for res in results: propID = res['p'][len('http://www.wikidata.org/entity/'):] print("%s %s%s" % ( propID, res['pLabel'], "" if propID in candidates else " <-- no data" ))
def sync_edition_olids_by_isbns(dry_run=False, limit=None):
    """
    Find editions on Wikidata and Open Library with the same ISBNs and add
    the Open Library ID to Wikidata and the Wikidata ID to Open Library.

    :param dry_run: when True, log what would change but write nothing to
        either Open Library or Wikidata.
    :param limit: optional cap on the number of modifications per side;
        processing stops once BOTH sides' checks trip (see loop tail).
    """
    wd = pywikibot.Site("wikidata", "wikidata")
    wd_repo = wd.data_repository()
    wdqs = SparqlQuery()  # Wikidata Query Service
    ol = OpenLibrary()
    # append date to query avoid getting cached results
    query = QUERY + f"\n # {datetime.datetime.now()}"
    sparql_results = wdqs.select(query)
    # Group by key (sparql hits timeouts when we do the grouping there)
    qid_to_isbns = dict()
    for row in sparql_results:
        # row['item'] is a full entity URI; keep only the trailing QID.
        qid = row['item'].split('/')[-1]
        if qid not in qid_to_isbns:
            qid_to_isbns[qid] = []
        qid_to_isbns[qid].append(normalize_isbn(row['isbn']))
    logger.info("Found %d editions to update", len(qid_to_isbns))
    ol_books_modified = 0
    wd_items_modified = 0
    for qid, isbns in qid_to_isbns.items():
        logger.debug("Processing %s", qid)
        # Sanity check: an edition normally has at most one ISBN-10 and
        # one ISBN-13; warn when either count exceeds one.
        for isbn_len in [10, 13]:
            count = len([isbn for isbn in isbns if len(isbn) == isbn_len])
            if count > 1:
                logger.warning("%s has multiple isbn%ss (%d)",
                               qid, isbn_len, count)
        # Look up each ISBN on Open Library, drop misses and the literal
        # 'None' olid, and de-duplicate by olid.
        ol_books = [ol.Edition.get(isbn=isbn) for isbn in isbns]
        ol_books = [book for book in ol_books if book and book.olid != 'None']
        ol_books = remove_dupes(ol_books, lambda ed: ed.olid)
        logger.info("Found %d Open Library book(s) for %s (isbns %s)",
                    len(ol_books), qid, ', '.join(isbns))
        if len(ol_books) > 1:
            logger.warning(
                "Multiple (%d) Open Library books for %s (isbns %s)",
                len(ol_books), qid, ', '.join(isbns))
        # update open library data
        for book in ol_books:
            if 'wikidata' not in book.identifiers:
                book.identifiers['wikidata'] = []
            book_qids = book.identifiers['wikidata']
            if qid in book_qids:
                logger.warning("%s already has qid %s", book.olid, qid)
                continue
            book_qids.append(qid)
            if len(book_qids) > 1:
                logger.warning("%s now has multiple (%d) qids (%s)",
                               book.olid, len(book_qids),
                               ', '.join(book_qids))
            if not dry_run:
                book.save("[sync_edition_olids] add wikidata identifier")
                logger.debug("Added %s to %s", qid, book.olid)
            # NOTE(review): counter incremented even on dry runs so the
            # `limit` check below still engages — the collapsed original
            # makes the exact nesting ambiguous; confirm against upstream.
            ol_books_modified += 1
        # update wikidata data
        for book in ol_books:
            item = pywikibot.ItemPage(wd_repo, qid)
            # P648 is the Open Library ID property on Wikidata.
            claim = make_str_claim(wd_repo, 'P648', book.olid)
            if not dry_run:
                item.addClaim(claim, bot=True)
                logger.debug("Added %s to %s", book.olid, qid)
            # NOTE(review): same dry-run/counter nesting ambiguity as above.
            wd_items_modified += 1
        if limit:
            ol_books_limit = ol_books_modified >= limit
            wd_items_limit = wd_items_modified >= limit
            if ol_books_limit and wd_items_limit:
                logger.info(
                    "Hit limit of %s on both Open Library and Wikidata; Stopping.",
                    limit)
            elif ol_books_limit:
                logger.info("Hit limit of %s on Open Library; Stopping.",
                            limit)
            elif wd_items_limit:
                logger.info("Hit limit of %s on Wikidata; Stopping.", limit)
            if ol_books_limit or wd_items_limit:
                break
    logger.info("Updated %d Open Library books and %d Wikidata items",
                ol_books_modified, wd_items_modified)
repo = site.data_repository()
sparql_query = SparqlQuery()

# Find paintings (Q3305213) that carry a title statement (P1705 native label
# or P1476 title) in some concrete language but lack an rdfs:label in that
# same language; "und" (undetermined) languages are excluded.
TITLES_QUERY = """
SELECT ?p ?title WHERE {
  ?p wdt:P31 wd:Q3305213 .
  ?p (wdt:P1705|wdt:P1476) ?title .
  BIND(lang(?title) as ?lt)
  FILTER(?lt != "und")
  FILTER NOT EXISTS {
    ?p rdfs:label ?pl .
    FILTER(lang(?pl) = ?lt)
  }
}
LIMIT 100
"""

# full_data=True so each binding keeps its language tag, not just the value.
results = sparql_query.select(TITLES_QUERY, full_data=True)
for result in results:
    lang = result['title'].language
    title = result['title'].value
    if "-" in lang:
        # Regional/variant language codes (e.g. "pt-br") need extra mapping
        # before they can be used as label languages; skip them for now.
        # (Fixed message typo: was "hyphenaten".)
        print("Skipping hyphenated language %s for now" % lang)
        continue
    item = pywikibot.ItemPage(repo, result['p'].getID())
    item.get()
    if lang in item.labels:
        print("%s already has label %s" % (result['p'].getID(), lang))
    else:
        print("Adding %s for %s:%s" % (title, result['p'].getID(), lang))
        item.labels[lang] = title
        item.editLabels(item.labels, summary="Set label from title")