def compare_by_url(url0, url1, w, h):
    """Return the RMSE difference between two images fetched by URL.

    Both images are scaled to w x h and compared with ImageMagick's
    ``convert -metric rmse``; the normalised distortion it reports is
    returned as a float (0.0 means identical).

    :param url0: URL of the first image.
    :param url1: URL of the second image.
    :param w: comparison width in pixels.
    :param h: comparison height in pixels.
    """
    tempfiles = [url_to_file(url) for url in (url0, url1)]
    # Flatten the per-image argument triples with a comprehension;
    # sum(lists, []) builds a new list per addition (quadratic).
    image_args = [arg
                  for tf in tempfiles
                  for arg in ("JPEG:" + tf.name,
                              '-resize', '%dx%d' % (w, h))]
    rmse = float(subprocess.check_output(
        ["convert", "-metric", "rmse"] + image_args +
        ["-compare", "-format", "%[distortion]", "info:"]))
    bot.log("RMSE between newest 2 versions: %f" % rmse)
    return rmse
def replace_file(self, page, newurl):
    """Upload the image at newurl over the existing file page.

    Raises UploadFailed if pywikibot reports an unsuccessful upload.
    """
    bot.log("Uploading from %s" % (newurl, ))
    uploaded = page.upload(
        newurl,
        comment="Higher-resolution version from Geograph.",
        ignore_warnings=['exists'])
    if not uploaded:
        raise UploadFailed("upload from %s to %s failed" % (newurl, page))
def gridimage_id_from_tree(self, tree):
    """Extract the numeric Geograph image ID from a parse tree.

    Looks up the {{Geograph}} template in *tree* and returns its first
    parameter as an int.

    Raises BadTemplate if the parameter is missing (IndexError) or not
    an integer (ValueError).
    """
    geograph_template = self.get_template(tree, "Geograph")
    try:
        gridimage_id = int(str(geograph_template.get(1)))
    except (ValueError, IndexError):
        # Merged: both failure modes previously had identical handlers.
        raise BadTemplate("broken {{Geograph}} template")
    bot.log("Geograph ID is %d" % (gridimage_id, ))
    return gridimage_id
def unmodified_on_geograph_since_upload(self, page, row):
    """Return True if the Geograph copy predates the Commons upload.

    Compares the timestamp of the page's first Commons revision with
    the Geograph database's last-update timestamp for the image.
    """
    upload_time = page.oldest_revision.timestamp
    # For some reason, pywikibot.Timestamps aren't timezone-aware.
    upload_time = upload_time.replace(tzinfo=timezone.utc)
    geograph_time = datetime.strptime(
        row['upd_timestamp'],
        "%Y-%m-%d %H:%M:%S").replace(tzinfo=gettz("Europe/London"))
    bot.log("Commons timestamp: %s; Geograph timestamp: %s" %
            (upload_time, geograph_time))
    return geograph_time < upload_time
def treat_page(self):
    """Process the current page, mapping failures to log severities.

    NotEligible and TooManyTemplates are routine skips (log); a
    MinorProblem is a warning; a MajorProblem is an error.
    """
    try:
        self.process_page(self.current_page)
    except (NotEligible, TooManyTemplates) as e:
        # Merged: both exceptions previously had identical handlers.
        bot.log(str(e))
    except MinorProblem as e:
        bot.warning(str(e))
    except MajorProblem as e:
        bot.error(str(e))
def stopme():
    """Drop this process from the throttle log, after pending threads finish.

    Can be called manually if desired, but if not, will be called
    automatically at Python exit.
    """
    global stopped
    _logger = "wiki"
    if not stopped:
        debug("stopme() called", _logger)

        def remaining():
            # Pages still waiting, and a rough ETA based on put_throttle.
            remainingPages = page_put_queue.qsize() - 1
            # -1 because we added a None element to stop the queue
            remainingSeconds = datetime.timedelta(
                seconds=(remainingPages * config.put_throttle))
            return (remainingPages, remainingSeconds)

        # Sentinel entry tells the put thread to shut down.
        page_put_queue.put((None, [], {}))
        stopped = True
        if page_put_queue.qsize() > 1:
            num, sec = remaining()
            format_values = dict(num=num, sec=sec)
            output(
                "\03{lightblue}"
                "Waiting for %(num)i pages to be put. "
                "Estimated time remaining: %(sec)s"
                "\03{default}" % format_values
            )
        # Thread.isAlive() was removed in Python 3.9; is_alive() is the
        # supported spelling (available since Python 2.6).
        while _putthread.is_alive():
            try:
                _putthread.join(1)
            except KeyboardInterrupt:
                if input_yn(
                    "There are %i pages remaining in the queue. "
                    "Estimated time remaining: %s\nReally exit?"
                    % remaining(),
                    default=False,
                    automatic_quit=False,
                ):
                    return
    # only need one drop() call because all throttles use the same global pid
    try:
        list(_sites.values())[0].throttle.drop()
        log("Dropped throttle(s).")
    except IndexError:
        pass
def get_geograph_size_url(gridimage_id, info, size):
    """Return a download URL for a Geograph image at the given size.

    For sizes up to 640 px the API-supplied URL is used directly; for
    larger sizes (or 'original') a reuse.php URL is constructed from
    the image key embedded in the standard URL.

    Raises StrangeURL if the image key cannot be found.
    """
    if size != 'original' and size <= 640:
        # The standard-size URL from the API already serves this.
        return info['url']
    # Evil hack, but better than digging it out of HTML.
    match = re.search(r"_([0-9a-f]{8})\.jpg$", info['url'])
    if match is None:
        raise StrangeURL(info['url'])
    imgkey = match.group(1)
    bot.log("imgkey is %s" % (repr(imgkey), ))
    query = urlencode({
        'id': gridimage_id,
        'download': imgkey,
        'size': size,
    })
    return "https://www.geograph.org.uk/reuse.php?" + query
def stopme():
    """Drop this process from the throttle log, after pending threads finish.

    Can be called manually if desired, but if not, will be called
    automatically at Python exit.
    """
    # NOTE(review): older vendored variant of stopme() — also present in
    # modernised form elsewhere in this file.  Uses deprecated APIs:
    # Thread.isAlive() (removed in Python 3.9) and inputChoice()
    # (superseded by input_choice/input_yn in later pywikibot).
    global stopped
    _logger = "wiki"
    if not stopped:
        debug(u"stopme() called", _logger)

        def remaining():
            # Pages still waiting, and a rough ETA from put_throttle.
            remainingPages = page_put_queue.qsize() - 1
            # -1 because we added a None element to stop the queue
            remainingSeconds = datetime.timedelta(
                seconds=(remainingPages * config.put_throttle))
            return (remainingPages, remainingSeconds)

        # Sentinel entry tells the put thread to shut down.
        page_put_queue.put((None, [], {}))
        stopped = True
        if page_put_queue.qsize() > 1:
            num, sec = remaining()
            format_values = dict(num=num, sec=sec)
            output(u'\03{lightblue}'
                   u'Waiting for %(num)i pages to be put. '
                   u'Estimated time remaining: %(sec)s'
                   u'\03{default}' % format_values)
        while(_putthread.isAlive()):
            try:
                _putthread.join(1)
            except KeyboardInterrupt:
                answer = inputChoice(u"""\
There are %i pages remaining in the queue. Estimated time remaining: %s
Really exit?""" % remaining(),
                                     ['yes', 'no'], ['y', 'N'], 'N')
                if answer == 'y':
                    return
    # only need one drop() call because all throttles use the same global pid
    try:
        list(_sites.values())[0].throttle.drop()
        log(u"Dropped throttle(s).")
    except IndexError:
        pass
def treat_page(self):
    """Process the current page if it is a file page.

    Every failure is recorded (with the page's Geograph ID, or -1 if
    unknown) in the *whynot* log, then routed to the bot logger at a
    severity matching the exception type.
    """
    # Hoisted out of the try so it is always bound for the handlers.
    gridimage_id = -1
    try:
        if hasattr(self.current_page, 'gridimage_id'):
            gridimage_id = self.current_page.gridimage_id
        if self.current_page.namespace() != 6:
            return  # Not a file page
        self.process_page(FilePage(self.current_page))
    except (NotEligible, MinorProblem, MajorProblem, HTTPError) as e:
        # One "why not" record per failure (was copy-pasted into four
        # identical handlers), then log at the appropriate severity.
        print("%d: %s" % (gridimage_id, str(e)), file=whynot)
        if isinstance(e, NotEligible):
            bot.log(str(e))
        elif isinstance(e, MinorProblem):
            bot.warning(str(e))
        else:
            # MajorProblem and HTTPError are both errors.
            bot.error(str(e))
def should_set_location(self, old_template, new_template, desc):
    """Decide whether to replace a location template with Geograph's.

    :param old_template: existing {{Location}}/{{Object location}}
        template on the page, or None.
    :param new_template: template derived from the Geograph database,
        or None.
    :param desc: "camera" or "object", used only in log messages.
    :return: True if the template should be updated.
    """
    oldparam = location_params(old_template)
    newparam = location_params(new_template)
    # We generally want to synchronise with Geograph.
    should_set = True
    # but not if there's no change (e.g. both are None)
    if old_template == new_template:
        should_set = False
    # but not yet if old template has no gridref
    # (a '-' in the source parameter indicates a gridref suffix —
    # TODO confirm against location_params).
    if (old_template != None and new_template != None
            and '-' not in oldparam['source']):
        if old_template.has(4):
            # Parameter 4 present => DMS-style template.
            should_set = False
            bot.log("%s template is DMS with no gridref: not updating" %
                    (desc.capitalize(),))
        else:
            (azon, azno, dist) = az_dist_between_locations(
                old_template, new_template)
            # A move smaller than the new location's precision (metres)
            # is not worth an edit.
            if dist < int(str(new_template.get('prec').value)):
                bot.log("%s has only moved by %d m: not updating" %
                        (desc.capitalize(), dist))
                should_set = False
    # and not if gridref hasn't changed
    if (old_template != None and new_template != None
            and oldparam['source'] == newparam['source']):
        should_set = False
        bot.log("%s gridref unchanged: not updating" %
                (desc.capitalize(),))
    return should_set
def is_original_location(self, page, location_template):
    """Return True if location_template matches the page's original
    location, exactly or as a mechanical rounding of it."""
    original_tree = self.get_original_tree(page)
    try:
        original_location = get_location(original_tree)
    except IndexError:
        return False  # Can't be original if there's no original
    if location_template == original_location:
        bot.log("Location identical to original")
        return True
    try:
        orig_lat = float(str(original_location.get(1)))
        orig_lon = float(str(original_location.get(2)))
    except ValueError:
        return False  # Can't do arithmetic on this

    def candidate_roundings(value):
        # Round to 5, 4, and 3 decimal places, each both to nearest and
        # towards zero.  The aim is to detect co-ordinates mechanically
        # rounded from Geograph data without any added creativity, which
        # the bot may therefore overwrite.
        variants = []
        for fmt, half_step in (("%.5f", 0.000005),
                               ("%.4f", 0.00005),
                               ("%.3f", 0.0005)):
            variants.append(fmt % (value, ))
            variants.append(fmt % (value - copysign(half_step, value), ))
        return variants

    for lat_text in candidate_roundings(orig_lat):
        for lon_text in candidate_roundings(orig_lon):
            original_location.add(1, lat_text)
            original_location.add(2, lon_text)
            if location_template == original_location:
                bot.log("Location matches rounded original")
                return True
    return False
def replace_file_indirect(self, page, newurl):
    """Fetch newurl ourselves and upload its content over *page*.

    Used when a direct URL upload is not wanted: the image is
    downloaded, written to a temporary file, and uploaded from there.
    """
    bot.log("Fetching from %s" % (newurl, ))
    r = client.get(newurl)
    r.raise_for_status()
    newimg = r.content
    bot.log("Got %d bytes of image" % (len(newimg), ))
    # Context manager ensures the temporary file is closed (and hence
    # deleted) as soon as the upload finishes, instead of lingering
    # until garbage collection.
    with tempfile.NamedTemporaryFile() as tf:
        tf.write(newimg)
        tf.flush()
        bot.log("File written to %s" % (tf.name, ))
        page.upload(tf.name,
                    comment="Higher-resolution version from Geograph.",
                    ignore_warnings=['exists'])
def process_page(self, page):
    """Synchronise a file page's locations and credit line with Geograph.

    Replaces or removes dubious GeographBot-sourced camera locations,
    adds an object location where none exists, adds a credit line where
    permitted, then saves with a summary describing the changes.

    :param page: the Commons FilePage to process.
    :raises NotInGeographDatabase: if the page's Geograph ID has no row
        in the local Geograph database.
    """
    # Flags describing what changed; used to assemble the edit summary.
    location_added = False
    location_replaced = False
    location_removed = False
    location_was_mine = False
    object_location_added = False
    creditline_added = False
    # Remember which revision we parsed so we can detect edit conflicts.
    revid = page.latest_revision_id
    tree = mwparserfromhell.parse(page.text)
    gridimage_id = get_gridimage_id(tree)
    c = geodb.cursor()
    c.execute("""
        SELECT * FROM gridimage_base NATURAL JOIN gridimage_geo
                 NATURAL JOIN gridimage_extra
        WHERE gridimage_id = ?
        """, (gridimage_id, ))
    row = c.fetchone()
    if row == None:
        raise NotInGeographDatabase("Geograph ID %d not in database" %
                                    (gridimage_id, ))
    try:
        location_template = get_location(tree)
    except IndexError:
        location_template = None  # page has no camera location
    new_location = location_from_row(row)
    minor = True  # downgraded when we make a substantive change
    bot.log("Existing location: %s" % (location_template, ))
    # Only touch camera locations that GeographBot set at upload and
    # that nobody has creatively modified since.
    if (location_template != None
            and location_template.name == 'Location dec'
            and self.is_original_location(page, location_template)
            and self.is_geographbot_upload(page)
            and new_location != location_template):
        bot.log("Proposed location: %s" % (new_location, ))
        if (new_location != None
                and new_location.get('prec').value != '1000'):
            # Geograph has a usefully-precise location: replace ours.
            set_location(tree, new_location)
            azon, azno, distance = (az_dist_between_locations(
                location_template, new_location))
            bot.log("Distance moved: %.1f m" % (distance, ))
            if distance > float(str(new_location.get('prec').value)):
                minor = False
            location_replaced = True
        else:
            # Geograph's location is absent or only 1 km precise:
            # better to have no camera location at all.
            set_location(tree, None)
            minor = False
            location_removed = True
    # A 1 km-precision location identical to Geograph's was probably
    # added by this bot earlier: remove it.
    if (new_location != None and location_template == new_location
            and new_location.get('prec').value == '1000'):
        set_location(tree, None)
        minor = False
        location_removed = True
        location_was_mine = True
    # No existing camera location but Geograph has a precise one: add.
    if (location_template == None and new_location != None
            and new_location.get('prec').value != '1000'):
        set_location(tree, new_location)
        minor = False
        location_added = True
        bot.log("New camera location: %s" % (new_location, ))
    if not has_object_location(tree):
        objloc = object_location_from_row(row)
        if (objloc.get('prec').value == '1000'
                and not (location_removed
                         or location_template == None)):
            bot.log("Skipping object location: precision is 1km")
        else:
            bot.log("New object location: %s" % (objloc, ))
            set_object_location(tree, objloc)
            minor = False
            object_location_added = True
    creditline = creditline_from_row(row)
    if (can_add_creditline(tree, creditline)
            and (self.unmodified_on_geograph_since_upload(page, row)
                 or self.is_original_title(page, row['title']))):
        add_creditline(tree, creditline)
        creditline_added = True
        minor = False
    else:
        bot.log("Cannot add credit line")
    newtext = str(tree)
    if newtext != page.text:
        # Build an edit summary matching the combination of changes.
        if location_replaced:
            if object_location_added:
                summary = (
                    "Replace dubious [[User:GeographBot|GeographBot]]-"
                    "sourced camera location (moved %.1f m %s) and "
                    "add object location, both from Geograph (%s)"
                    % (distance, format_direction(azon), format_row(row)))
            else:
                summary = (
                    "Replace dubious [[User:GeographBot|GeographBot]]-"
                    "sourced camera location (moved %.1f m %s), "
                    "from Geograph (%s)"
                    % (distance, format_direction(azon), format_row(row)))
        elif location_removed:
            if location_was_mine:
                summary = (
                    "Remove vague camera location (probably added by me)")
            else:
                summary = (
                    "Remove dubious [[User:GeographBot|GeographBot]]-"
                    "sourced camera location")
            if object_location_added:
                summary += (" and add object location from Geograph (%s)"
                            % (format_row(row), ))
        elif location_added:
            if object_location_added:
                summary = (
                    "Add camera and object locations from Geograph (%s)"
                    % (format_row(row), ))
            else:
                summary = ("Add camera location from Geograph (%s)"
                           % (format_row(row), ))
        elif object_location_added:
            summary = ("Add object location from Geograph (%s)"
                       % (format_row(row), ))
        else:
            summary = ""
        if creditline_added:
            if summary == "":
                summary = "Add credit line with title from Geograph"
            else:
                summary += "; add credit line with title from Geograph"
        bot.log("edit summary: %s" % (summary, ))
        # Before we save, make sure pywikibot's view of the latest
        # revision hasn't changed.  If it has, that invalidates
        # our parse tree, and we need to start again.
        if page.latest_revision_id != revid:
            bot.log("page has changed (%d != %d): restarting edit" %
                    (page.latest_revision_id, revid))
            self.process_page(page)
            return
        page.text = newtext
        page.save(summary, minor=minor)
def process_page(self, page):
    """Mark a Geograph-derived location with a "source:geograph" param.

    Examines the page's revision history to establish that the current
    location genuinely came from Geograph (set at upload by a known
    bot/tool, or added/fixed later by DschwenBot/BotMultichill), and if
    so appends "source:geograph" to the location template's third
    parameter.

    :raises NotEligible: if the page has no location, the location
        already has a source, uses an unusual template, or its
        provenance cannot be established.
    """
    reason = None  # human-readable provenance, also used in the summary
    location_replaced = False
    location_removed = False
    object_location_added = False
    location_added = False
    creditline_added = False
    # Remember which revision we parsed so we can detect edit conflicts.
    revid = page.latest_revision_id
    tree = mwparserfromhell.parse(page.text)
    try:
        old_location = get_location(tree)
    except IndexError:
        raise NotEligible("no location present")
    oldcamparam = location_params(old_location)
    bot.log("param: %s" % (repr(oldcamparam),))
    if 'source' in oldcamparam:
        raise NotEligible("location already has source")
    if old_location.name not in ('Location dec', 'location dec'):
        raise NotEligible("location using unusual template")
    firstrev = page.oldest_revision.hist_entry()
    first_tree = mwparserfromhell.parse(page.getOldVersion(firstrev.revid))
    try:
        first_location = get_location(first_tree)
    except IndexError:
        # Location added since upload. Maybe added by DschwenBot?
        for oldrev in page.revisions():
            if oldrev.user == 'DschwenBot':
                if (oldrev.comment == "adding missing Location data from "
                        "www.geograph.org.uk"):
                    added_tree = mwparserfromhell.parse(
                        page.getOldVersion(oldrev.revid))
                    added_location = get_location(added_tree)
                    if old_location != added_location:
                        raise NotEligible("location changed since added")
                    reason = "added by [[User:DschwenBot]]"
        if not reason:
            raise NotEligible("location added since upload")
    else:
        if old_location == first_location:
            # Location unchanged since upload: trust it if the uploader
            # is a known Geograph-transfer bot or tool.
            if (firstrev.comment in
                    ("Transferred from geograph.co.uk using "
                     "[https://geograph2commons.toolforge.org/ "
                     "geograph2commons]",
                     # Some uploads have this curious typo'd version of
                     # the summary.
                     "Transferred from geograph.co.uk using "
                     "[https://geograph2commons.toolforge.org/ "
                     "grograph2commons]")):
                # Would like to check tag, but pywikibot doesn't seem to
                # expose it.
                reason = ("set at upload by "
                          "[[toollabs:geograph2commons|geograph2commons]]")
            elif (firstrev.user in ("File Upload Bot (Magnus Manske)",
                                    "GeographBot")):
                reason = ("set at upload by [[User:%s|%s]]" %
                          (firstrev.user, firstrev.user))
        else:
            # Location changed since first upload.
            # This may have been BotMultichill fixing a broken upload.
            for oldrev in page.revisions():
                if oldrev.user == 'BotMultichill':
                    if (oldrev.comment == "Fixing location"):
                        fixed_tree = mwparserfromhell.parse(
                            page.getOldVersion(oldrev.revid))
                        fixed_location = get_location(fixed_tree)
                        if old_location != fixed_location:
                            raise NotEligible(
                                "location changed since fixing")
                        reason = "fixed by [[User:BotMultichill]]"
            if not reason:
                raise NotEligible("location changed since upload")
    if reason:
        # Append "source:geograph" to parameter 3, preserving any
        # existing parameter text with a '_' separator.
        try:
            paramstr = str(old_location.get(3).value)
        except ValueError:
            paramstr = ""
        if paramstr != "":
            paramstr += "_"
        old_location.add(3, paramstr + "source:geograph")
    newtext = str(tree)
    if newtext != page.text:
        summary = ("Mark Geograph-derived location (%s) with appropriate "
                   "\"source\" parameter" % (reason,))
        bot.log("edit summary: %s" % (summary,))
        # Before we save, make sure pywikibot's view of the latest
        # revision hasn't changed. If it has, that invalidates
        # our parse tree, and we need to start again.
        if page.latest_revision_id != revid:
            bot.log("page has changed (%d != %d): restarting edit" %
                    (page.latest_revision_id, revid))
            self.process_page(page)
            return
        page.text = newtext
        page.save(summary)
def mark_for_attention(site, title, comment):
    """Tag the named page for human review.

    Appends the dubious-uploads maintenance category to the page and
    saves it with *comment* included in the edit summary.
    """
    bot.log("marking for human review")
    review_page = pywikibot.Page(site, title)
    review_page.text += (
        "\n[[Category:Dubious uploads by Geograph Update Bot]]")
    review_page.save(
        "Marking last upload for human attention (%s)" % (comment,))
def get_geograph_full(gridimage_id, info):
    """Download and return the bytes of the full-size Geograph image."""
    full_url = get_geograph_full_url(gridimage_id, info)
    bot.log("Fetching from %s" % (full_url, ))
    response = client.get(full_url)
    response.raise_for_status()
    return response.content
def process_page(self, page):
    """Synchronise page locations (wikitext and SDC) with Geograph.

    Adds camera/object location templates where the page has none,
    updates or removes them where existing geocoding is marked as
    Geograph-sourced, mirrors those changes into matching structured
    data (SDC) statements (P1259 camera / P625 object), and adds a
    credit line where permitted.

    :param page: the Commons FilePage to process.
    :raises BadTemplate: if the {{Geograph}} template is broken.
    :raises NotInGeographDatabase: if the Geograph ID has no row in the
        local database.
    """
    # Actions taken, keyed into self.summary_formats for the summary.
    camera_action = None
    object_action = None
    sdc_camera_action = None
    sdc_object_action = None
    creditline_added = False
    sdc_edits = {}  # wbeditentity payload, built up as we go
    # Remember which revision we parsed so we can detect edit conflicts.
    revid = page.latest_revision_id
    tree = mwparserfromhell.parse(page.text)
    try:
        gridimage_id = get_gridimage_id(tree)
    except ValueError as e:
        raise BadTemplate(str(e))
    except IndexError as e:
        raise BadTemplate(str(e))
    mapit = MapItSettings()
    c = geodb.cursor()
    c.execute("""
        SELECT * FROM gridimage_base NATURAL JOIN gridimage_geo
                 NATURAL JOIN gridimage_extra
        WHERE gridimage_id = ?
        """, (gridimage_id,))
    row = c.fetchone()
    if row == None:
        raise NotInGeographDatabase("Geograph ID %d not in database" %
                                    (gridimage_id,))
    try:
        old_location = get_location(tree)
    except IndexError:
        old_location = None
    try:
        old_object_location = get_object_location(tree)
    except IndexError:
        old_object_location = None
    minor = False  # May need fixing
    bot.log("Old cam: %s" % (old_location,))
    bot.log("Old obj: %s" % (old_object_location,))
    if old_location == None and old_object_location == None:
        minor = False
        mapit.allowed = True
        # No geocoding at all: add from Geograph
        new_location = location_from_row(row, mapit=mapit)
        new_object_location = object_location_from_row(row, mapit=mapit)
        # Camera location only if precise; object location always.
        if new_location and new_location.get('prec').value != '1000':
            set_location(tree, new_location)
            camera_action = 'add'
        set_object_location(tree, new_object_location)
        object_action = 'add'
    else:
        oldcamparam = location_params(old_location)
        oldobjparam = location_params(old_object_location)
        if ((old_location == None
             or re.match(r'^geograph(-|$)', oldcamparam.get('source', '')))
                and (old_object_location == None
                     or re.match(r'^geograph(-|$)',
                                 oldobjparam.get('source', '')))):
            bot.log("Old geocoding is from Geograph")
            # Existing geocoding all from Geograph, so updating
            # from Geograph OK if needed.
            new_location = location_from_row(row, mapit=mapit)
            new_object_location = object_location_from_row(row, mapit=mapit)
            # Should we update locations?
            should_set_cam = self.should_set_location(
                old_location, new_location, "camera")
            should_set_obj = self.should_set_location(
                old_object_location, new_object_location, "object")
            if ((should_set_cam and old_location != None)
                    or (should_set_obj and old_object_location != None)):
                # Check if SDC has location templates.
                statements = self.get_sdc_statements(page)
                # P1259: camera ("point of view") statements.
                for s in statements.get('P1259', []):
                    if (should_set_cam and old_location != None
                            and statement_matches_template(s, old_location)):
                        s_new = camera_statement_from_row(row)
                        if s_new == None:
                            # No new statement possible: remove the old one.
                            s_new = dict(id=s['id'], remove="")
                            bot.log("Removing %s statement %s" %
                                    (s['mainsnak']['property'], s['id']))
                            sdc_camera_action = 'remove'
                        else:
                            s_new['id'] = s['id']
                            bot.log("Updating %s statement %s" %
                                    (s['mainsnak']['property'], s['id']))
                            sdc_camera_action = 'update'
                        sdc_edits.setdefault('claims', [])
                        sdc_edits['claims'].append(s_new)
                # P625: object ("coordinate location") statements.
                for s in statements.get('P625', []):
                    if (should_set_obj and old_object_location != None
                            and statement_matches_template(
                                s, old_object_location)):
                        s_new = object_statement_from_row(row)
                        if s_new == None:
                            s_new = dict(id=s['id'], remove="")
                            bot.log("Removing %s statement %s" %
                                    (s['mainsnak']['property'], s['id']))
                            sdc_object_action = 'remove'
                        else:
                            s_new['id'] = s['id']
                            bot.log("Updating %s statement %s" %
                                    (s['mainsnak']['property'], s['id']))
                            sdc_object_action = 'update'
                        sdc_edits.setdefault('claims', [])
                        sdc_edits['claims'].append(s_new)
            # Do it if necessary:
            mapit.allowed = True
            if should_set_cam:
                set_location(tree, location_from_row(row, mapit=mapit))
                if old_location == None:
                    if new_location != None:
                        camera_action = 'add'
                else:
                    if new_location == None:
                        camera_action = 'remove'
                    else:
                        camera_action = 'update'
            if should_set_obj:
                set_object_location(tree,
                                    object_location_from_row(row,
                                                             mapit=mapit))
                if old_object_location == None:
                    if new_object_location != None:
                        object_action = 'add'
                else:
                    if new_object_location == None:
                        object_action = 'remove'
                    else:
                        object_action = 'update'
    creditline = creditline_from_row(row)
    if (can_add_creditline(tree, creditline)
            and self.unmodified_on_geograph_since_upload(page, row)):
        add_creditline(tree, creditline)
        creditline_added = True
        minor = False
    else:
        bot.log("Cannot add credit line")
    newtext = str(tree)
    if newtext != page.text:
        format_params = dict(row=format_row(row))
        if camera_action == 'update':
            format_params['camera_move'] = (
                self.describe_move(old_location, new_location))
        if object_action == 'update':
            format_params['object_move'] = (
                self.describe_move(old_object_location,
                                   new_object_location))
        summary = (self.summary_formats[(camera_action, object_action)]
                   .format(**format_params))
        if creditline_added:
            if summary == "":
                summary = "Add credit line with title from Geograph"
            else:
                summary += "; add credit line with title from Geograph"
        if mapit.used:
            # Requested credit where MapIt is used:
            # 'Please attribute us with the text “Powered by Mapit”
            # and a link back to the MapIt front page.'
            summary += (
                " [powered by MapIt: http://global.mapit.mysociety.org]")
        bot.log("edit summary: %s" % (summary,))
        # Before we save, make sure pywikibot's view of the latest
        # revision hasn't changed.  If it has, that invalidates
        # our parse tree, and we need to start again.
        if page.latest_revision_id != revid:
            bot.log("page has changed (%d != %d): restarting edit" %
                    (page.latest_revision_id, revid))
            self.process_page(page)
            return
        page.text = newtext
        page.save(summary, minor=minor)
    # NOTE(review): format_params is only bound when the wikitext
    # changed; sdc_edits appears to be non-empty only in that case —
    # confirm set_location always alters the text when should_set_* is
    # true.
    if sdc_edits:
        sdc_summary = (self.summary_formats[(sdc_camera_action,
                                             sdc_object_action)]
                       .format(**format_params))
        bot.log("SDC edit summary: %s" % (sdc_summary,))
        self.site._simple_request(
            action='wbeditentity',
            format='json',
            id='M%d' % (page.pageid,),
            data=json.dumps(sdc_edits),
            token=self.site.tokens['csrf'],
            summary=sdc_summary,
            bot=True,
            baserevid=revid).submit()
def process_page(self, page):
    """Upgrade a Commons file to Geograph's higher-resolution original.

    Runs a long eligibility gauntlet — edit permission, intact
    {{Geograph}} template, database row, larger original available,
    matching aspect ratio/dimensions, matching author and title, and a
    SHA-1 match against the corresponding Geograph-size image — then
    uploads the full-resolution version and triggers a revision
    comparison.

    :raises NotEligible: for any failed eligibility check.
    :raises BadTemplate: if the {{Geograph}} template is broken.
    :raises NotInGeographDatabase: if the ID has no database row.
    """
    if not page.botMayEdit():
        raise NotEligible("bot forbidden from editing this page")
    tree = mwparserfromhell.parse(page.text)
    try:
        geograph_template = tlgetone(tree, ['Geograph'])
    except IndexError:
        raise NotEligible("No {{Geograph}} template")
    try:
        gridimage_id = int(str(geograph_template.get(1).value))
        commons_author = str(geograph_template.get(2).value)
    except ValueError:
        raise BadTemplate("broken {{Geograph}} template")
    except IndexError:
        raise BadTemplate("broken {{Geograph}} template")
    bot.log("Geograph ID is %d" % (gridimage_id, ))
    c = geodb.cursor()
    c.execute("""
        SELECT * FROM gridimage_base NATURAL JOIN gridimage_size
        WHERE gridimage_id = ?
        """, (gridimage_id, ))
    row = c.fetchone()
    if row == None:
        raise NotInGeographDatabase("Geograph ID %d not in database" %
                                    (gridimage_id, ))
    gwidth, gheight, original_width, original_height, original_diff = [
        row[x] for x in ('width', 'height', 'original_width',
                         'original_height', 'original_diff')
    ]
    # original_width == 0 is Geograph's marker for "no original stored".
    if original_width == 0:
        raise NotEligible("no high-res version available")
    fi = page.latest_file_info
    bot.log("%d × %d version available" %
            (original_width, original_height))
    bot.log("current Commons version is %d × %d" % (fi.width, fi.height))
    if fi.width >= original_width and fi.height >= original_height:
        raise NotEligible("no higher-resolution version on Geograph")
    if not aspect_ratios_match(fi.width, fi.height,
                               original_width, original_height):
        raise NotEligible("aspect ratios of images differ")
    if (fi.width, fi.height) == (gwidth, gheight):
        # Commons has the standard Geograph size; the database can tell
        # us if the original is actually a different picture.
        if original_diff == 'yes':
            raise NotEligible("Geograph says pictures are different")
    else:
        # Otherwise only the known 800/1024 px sizes can be matched.
        if max(fi.width, fi.height) not in (800, 1024):
            raise NotEligible("dimensions do not match any Geograph image")
    for ofi in page.get_file_history().values():
        if ofi.user == "Geograph Update Bot":
            raise NotEligible("file already uploaded by me")
    geograph_info = get_geograph_info(gridimage_id)
    if (canonicalise_name(geograph_info['author_name']) !=
            canonicalise_name(commons_author)):
        raise NotEligible(
            "author does not match Geograph (%s vs. %s)" %
            (repr(commons_author), repr(geograph_info['author_name'])))
    try:
        credit_line = tlgetone(tree, ['Credit line'])
    except IndexError:
        pass  # no credit line: skip the title check
    else:
        commons_title = ''.join([
            str(x) for x in credit_line.get('Other').value.filter_text()
        ]).strip()
        bot.log("Title on Commons: %s" % (commons_title, ))
        if (canonicalise_name(commons_title) != canonicalise_name(
                geograph_info['title'])):
            raise NotEligible(
                "title does not match Geograph (%s vs. %s)" %
                (repr(commons_title), repr(geograph_info['title'])))
    # Final check: the Commons file must be byte-identical to the
    # same-sized Geograph rendition before we overwrite it.
    geograph_image = get_geograph_size(gridimage_id, geograph_info,
                                       max(fi.width, fi.height))
    if hashlib.sha1(geograph_image).hexdigest() != fi.sha1:
        raise NotEligible("SHA-1 does not match Geograph %d px image." %
                          (max(fi.width, fi.height), ))
    bot.log("Image matches. Update possible.")
    self.replace_file(page,
                      get_geograph_full_url(gridimage_id, geograph_info))
    compare_revisions(self.site, parameters=dict(titles=page.title()))