def process_contributor(self, contributor):
    """Record the author of the current revision in ``self._sender``.

    The ``username`` child of *contributor* is preferred.  When it is
    absent the ``ip`` child is used instead; when the username element is
    present but normalising its text raises ``AttributeError``, the ``id``
    child is used.  Whenever no sender can be determined the revision is
    flagged through ``self._skip_revision``; the ip-less cases also bump
    ``self.counter_deleted``.

    Does nothing if the revision is already flagged to be skipped.

    :param contributor: object exposing ``find(tag)`` whose results carry a
        ``text`` attribute, or ``None``.
    """
    if self._skip_revision:
        return

    if contributor is None:
        # Flag the revision and stop: falling through would call
        # ``find`` on ``None`` and raise AttributeError.
        self._skip_revision = True
        return

    sender_tag = contributor.find(self.tag["username"])
    if sender_tag is None:
        try:
            self._sender = contributor.find(self.tag["ip"]).text
        except AttributeError:
            ## user deleted (neither username nor ip element found)
            self._skip_revision = True
            self.counter_deleted += 1
        else:
            if self._sender is None:
                # ip element present but without text
                self._skip_revision = True
                self.counter_deleted += 1
        return

    try:
        self._sender = mwlib.normalize_pagename(sender_tag.text)
    except AttributeError:
        ## if username is defined but empty, look for id tag
        try:
            self._sender = contributor.find(self.tag["id"]).text
        except KeyError:
            self._skip_revision = True
def process_contributor(self, contributor):
    """Record the author of the current revision in ``self._sender``.

    The normalised text of the ``username`` child of *contributor* is
    used.  If normalising raises ``AttributeError`` the text of the ``id``
    child is used instead.  A missing contributor or a missing username
    element flags the revision through ``self._skip_revision``.

    Does nothing if the revision is already flagged to be skipped.

    :param contributor: object exposing ``find(tag)`` whose results carry a
        ``text`` attribute, or ``None``.
    """
    if self._skip_revision:
        return

    if contributor is None:
        logging.warning('contributor is None')
        self._skip_revision = True
        # Stop here: falling through would call ``find`` on ``None`` and
        # raise AttributeError right after the warning.
        return

    sender_tag = contributor.find(self.tag['username'])
    if sender_tag is None:
        self._skip_revision = True
        return

    try:
        self._sender = mwlib.normalize_pagename(sender_tag.text)
    except AttributeError:
        ## if username is defined but empty, look for id tag
        self._sender = contributor.find(self.tag['id']).text
def process_text(self, elem):
    """Run the signature finder over a page text and store the result.

    Empty texts and texts that ``mwlib`` reports as hard or soft redirects
    are ignored.  Otherwise the result of ``self.sig_finder.find`` is added
    to ``self.ecache`` under the normalised ``self.user`` name and
    ``self.count`` is incremented; every 500th processed text the counter
    is printed as a progress indicator.

    :param elem: object whose ``text`` attribute holds the page text (may
        be ``None`` or empty).
    :raises AssertionError: if ``self.user`` is not set.
    """
    assert self.user, "User still not defined"

    text = elem.text
    if not (text and self.user):
        return

    if mwlib.isHardRedirect(text) or mwlib.isSoftRedirect(text):
        return

    talks = self.sig_finder.find(text)
    self.ecache.add(mwlib.normalize_pagename(self.user), talks)

    self.count += 1
    if not self.count % 500:
        # Parenthesised single-argument form: prints the same on Python 2
        # and is also valid Python 3 syntax.
        print(self.count)
def process_title(self, elem):
    """Work out the receiver of a page from its title.

    The title is split on ``:``.  If it has a prefix that is listed in
    ``self.user_talk_names``, the normalised second segment is stored in
    ``self._receiver``; any other title sets ``self._skip``.  Titles that
    contain a ``/`` are tallied in ``self.count_archive`` and skipped too.

    Does nothing when ``self._skip_revision`` is already set.

    :param elem: object whose ``text`` attribute holds the page title.
    """
    if self._skip_revision:
        return

    page_title = elem.text
    pieces = page_title.split(':')

    # Guard: no prefix at all, or a prefix outside the known talk names.
    if len(pieces) < 2 or pieces[0] not in self.user_talk_names:
        self._skip = True
        return

    self._receiver = mwlib.normalize_pagename(pieces[1])

    # Titles with a slash are counted as archives and skipped.
    if '/' in page_title:
        self.count_archive += 1
        self._skip = True
def process_text(self, elem):
    """Run the signature finder over a page text and store the result.

    Empty texts and texts that ``mwlib`` reports as hard or soft redirects
    are ignored.  Otherwise the result of ``self.sig_finder.find`` is added
    to ``self.ecache`` under the normalised ``self.user`` name.  An
    ``AttributeError`` raised during that step sets ``self._skip`` and
    ends processing.  On success ``self.count`` is incremented and every
    500th text is logged at INFO level.

    :param elem: object whose ``text`` attribute holds the page text (may
        be ``None`` or empty).
    :raises AssertionError: if ``self.user`` is not set.
    """
    assert self.user, "User still not defined"

    wikitext = elem.text
    if not wikitext or not self.user:
        return

    if mwlib.isHardRedirect(wikitext) or mwlib.isSoftRedirect(wikitext):
        return

    try:
        signatures = self.sig_finder.find(wikitext)
        # Checks if self.user is a valid pagename
        self.ecache.add(mwlib.normalize_pagename(self.user), signatures)
    except AttributeError:
        self._skip = True
        return

    self.count += 1
    if self.count % 500 == 0:
        logging.info("Counter: %d", self.count)
def process_title(self, elem):
    """Reset per-page state and work out the receiver from the page title.

    ``self.delattr`` is first called with the names of the per-page
    attributes.  The title is then split on ``:``.  If it has a prefix
    that, passed through ``smart_str``, is listed in
    ``self.user_talk_names``, the normalised second segment is stored in
    ``self._receiver``; any other title sets ``self._skip``.  Titles that
    contain a ``/`` are tallied in ``self.count_archive`` and skipped too.

    Nothing beyond the ``delattr`` call happens when
    ``self._skip_revision`` is set.

    :param elem: object whose ``text`` attribute holds the page title.
    """
    self.delattr(("_counter", "_type", "_title", "_skip", "_date",
                  "_receiver"))

    if self._skip_revision:
        return

    page_title = elem.text
    pieces = page_title.split(":")

    # Guard: no prefix at all, or a prefix outside the known talk names.
    if len(pieces) < 2 or smart_str(pieces[0]) not in self.user_talk_names:
        self._skip = True
        return

    self._receiver = mwlib.normalize_pagename(pieces[1])

    # Titles with a slash are counted as archives and skipped.
    if "/" in page_title:
        self.count_archive += 1
        self._skip = True
def process_title(self, elem):
    """Reset per-page state and work out the receiver from the page title.

    ``self.delattr`` is first called with the names of the per-page
    attributes.  The title is then split on ``:``.  If it has a prefix
    that, passed through ``smart_str``, is listed in
    ``self.user_talk_names``, the normalised second segment is stored in
    ``self._receiver``; any other title sets ``self._skip``.  Titles that
    contain a ``/`` are tallied in ``self.count_archive`` and skipped too.

    Nothing beyond the ``delattr`` call happens when
    ``self._skip_revision`` is set.

    :param elem: object whose ``text`` attribute holds the page title.
    """
    stale_attrs = ("_counter", "_type", "_title", "_skip", "_date",
                   "_receiver", "_time", "_id", "_username", "_ip")
    self.delattr(stale_attrs)

    if self._skip_revision:
        return

    full_title = elem.text
    segments = full_title.split(':')

    # ``smart_str`` is only reached when the title actually has a prefix.
    is_user_talk = (len(segments) > 1
                    and smart_str(segments[0]) in self.user_talk_names)
    if not is_user_talk:
        self._skip = True
        return

    self._receiver = mwlib.normalize_pagename(segments[1])

    # Titles with a slash are counted as archives and skipped.
    if '/' in full_title:
        self.count_archive += 1
        self._skip = True