def generate_category_relation(input=None, output=None):
    from json import dumps

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        # only parse category pages
        if page.namespace != 14:
            continue
        # only one revision on the current page
        for rev in page:
            try:
                wikicode = mwp.parse(rev.text)
            except Exception:
                print(page.id, page.title, page.namespace)
                continue
            # extract this category's name from the page title and link it to
            # every parent category the page links to
            cate = page.title.lower()[len("category:"):]
            for link in wikicode.filter_wikilinks():
                if link.title.startswith('Category:'):
                    super_cate = link.title.lower().replace('category:', "")
                    record = {
                        "cate": cate,
                        "super_cate": super_cate,
                        "cate_pid": page.id
                    }
                    print(dumps(record), file=fout)
    fout.close()
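# Usage sketch (an assumption, not part of the original code): `Iterator` and
# `functions` look like the mediawiki-utilities XML dump iterator and its
# open_file helper, and `mwp` like mwparserfromhell. The file names below are
# hypothetical.
#
#   generate_category_relation(input='enwiki-latest-pages-articles.xml.bz2',
#                              output='category_relations.json')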
def parse_file(input=None, output=None):
    from json import dumps
    from time import mktime, strptime

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    # rev.timestamp stringifies to the short MediaWiki form, e.g. '20060101120000'
    pattern = '%Y%m%d%H%M%S'
    for page in dump:
        # ignore old-version pages that were redirected
        if page.redirect:
            print(page.title)
            continue
        for rev in page:
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": rev.contributor.user_text,
                "ns": page.namespace,
                "rev_comment": rev.comment,
                "rev_timestamp": epoch
            }
            print(dumps(record), file=fout)
    fout.close()
def parse_file(input=None, output=None, bot_file=None):
    from json import dumps
    from time import mktime, strptime

    # bot_list = load_bots(bot_file)
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    pattern = '%Y%m%d%H%M%S'
    for page in dump:
        # ignore old-version pages that were redirected
        if page.redirect:
            continue
        if page.namespace in [1, 3, 5]:
            # talk namespaces: record the text each revision adds
            print("{},{}".format(page.title, page.namespace))
            revtext = []
            for rev in page:
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                current_revtext = useful_text(rev.text)
                diff_content = diff_sentences(revtext, current_revtext)
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": diff_content
                }
                revtext = current_revtext
                print(dumps(record), file=fout)
        else:
            # other namespaces: record revision metadata only
            for rev in page:
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": "None"
                }
                print(dumps(record), file=fout)
    fout.close()
    return
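# Hypothetical sketches of the helpers used above; useful_text and
# diff_sentences are not defined in this snippet. Assumed behaviour:
# useful_text strips markup and splits a revision into rough sentences;
# diff_sentences returns the text added since the previous revision.
import mwparserfromhell as mwp


def useful_text(wikitext):
    """Strip wiki markup and split the revision text into rough sentences (sketch)."""
    if not wikitext:
        return []
    plain = mwp.parse(wikitext).strip_code()
    return [s.strip() for s in plain.split('.') if s.strip()]


def diff_sentences(prev_sentences, curr_sentences):
    """Return text that appears in the current revision but not in the previous one (sketch)."""
    prev = set(prev_sentences)
    return " ".join(s for s in curr_sentences if s not in prev)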
def parse_file(input=None, output=None):
    from json import dumps

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore old-version pages that were redirected
        if page.redirect:
            continue
        record = {
            "page_title": page.title,
            "page_id": page.id,
            "ns": page.namespace
        }
        print(dumps(record), file=fout)
    fout.close()
def parse_file(input=None, output=None, bot_file=None):
    from json import dumps
    from time import mktime, strptime
    from IPy import IP

    bot_list = load_bots(bot_file)
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    pattern = '%Y%m%d%H%M%S'
    for page in dump:
        # ignore old-version pages that were redirected
        if page.redirect:
            continue
        for rev in page:
            user_text = rev.contributor.user_text
            try:
                # skip anonymous editors (user_text parses as an IP address)
                IP(user_text)
                continue
            except Exception:
                # not an IP address; skip known bots
                if user_text in bot_list:
                    continue
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": user_text,
                "ns": page.namespace,
                "rev_timestamp": epoch
            }
            print(dumps(record), file=fout)
    fout.close()
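# Hypothetical sketch of the load_bots helper used above; it is not defined in
# this snippet. Assumed behaviour: the bot file lists one bot user name per
# line, and membership is checked against rev.contributor.user_text.
def load_bots(bot_file):
    """Return a set of bot user names read from bot_file (sketch)."""
    bots = set()
    with open(bot_file, 'r') as fin:
        for line in fin:
            name = line.strip()
            if name:
                bots.add(name)
    return bots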
def parse_dump(dump_filename, wanted_page_ids, found_pages_dict, users_page_edits_dict, pages_pwr_dict, logfile):
    """
    Parse the given dump, processing assessments for the given talk page IDs.

    @param dump_filename: path to the dump file to process
    @type dump_filename: str

    @param wanted_page_ids: dictionary mapping talk page IDs to dicts of page
                            data; matching entries are copied into
                            found_pages_dict and extended with edit and token counts
    @type wanted_page_ids: dict
    """
    # Construct dump file iterator
    dump = Iterator.from_file(functions.open_file(dump_filename))

    # Load the list of known bot accounts
    bots_file = "resources/wikipedia_bots_full.txt"
    bots = {}
    try:
        with open(bots_file, "r") as fin:
            csvreader = csv.reader(fin)
            for line in csvreader:
                bots[line[0].lower()] = True
    except Exception:
        print("Invalid bots file - only text matching with 'bot' will be used")
        with open(logfile, "a") as fout:
            fout.write("Invalid bots file - only text regex with 'bot' followed by whitespace will be used.\n")

    scripts = ["commonsdelinker", "conversion script"]
    count = 0

    # Iterate through pages
    for page in dump:
        # skip if not a page we want to process
        if page.id not in wanted_page_ids:
            continue

        try:
            with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                fout.write(str(datetime.now()) + ": " + page.title + "\n")
            print(page.title)
        except Exception:
            with open(logfile, "a") as fout:
                fout.write(str(datetime.now()) + ": next spatial article.\n")
            print("next spatial article.")

        state = persistence.State()
        count += 1
        counts_dict = {
            "total_edits": 0,
            "bot_edits": 0,
            "unverified_bot_edits": 0,
            "known_script_edits": 0,
            "anonymous_edits": 0,
            "awb_edits": 0,
            "minor_edits": 0,
            "wpcleaner_edits": 0,
        }

        # Iterate through a page's revisions
        for revision in page:
            # skip if there's no content
            if not revision.text:
                continue

            # attribute AWB-assisted edits to a single pseudo-user
            if revision.comment and "awb" in revision.comment.lower():
                pwr = state.process(revision.text, revision="awb")
            else:
                pwr = state.process(revision.text, revision=revision.contributor.user_text)
            counts_dict["total_edits"] += 1

            try:
                if revision.contributor.user_text:
                    process_rev(revision, counts_dict, bots, scripts, users_page_edits_dict, page.id)
            except Exception:
                try:
                    print("Error in revision.contributor.user_text {0} for page {1}".format(
                        revision.contributor.user_text, page.title))
                    with open(logfile, "a") as fout:
                        fout.write("Error in revision.contributor.user_text {0} for page {1}\n".format(
                            revision.contributor.user_text, page.title))
                except Exception:
                    print("Error in a revision.contributor.user_text for a page.")
                    with open(logfile, "a") as fout:
                        fout.write("Error in a revision.contributor.user_text for a page.\n")

        found_pages_dict[page.id] = wanted_page_ids[page.id]
        found_pages_dict[page.id].update(counts_dict)

        current_state = {
            "total_tokens": 0,
            "bot_tokens": 0,
            "unverified_bot_tokens": 0,
            "known_script_tokens": 0,
            "anonymous_tokens": 0,
            "awb_tokens": 0,
        }

        # loop through tokens in the current state of the page
        for tk in pwr[0]:
            current_state["total_tokens"] += 1
            try:
                if tk.revisions[0]:
                    process_current_page(tk.revisions[0].lower(), current_state, bots, scripts, pages_pwr_dict, page.id)
            except Exception:
                try:
                    print("Error in processing token {0} for page {1}".format(tk.text, page.id))
                    with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                        fout.write("Error in processing token {0} for page {1}.\n".format(tk.text, str(page.id)))
                except Exception:
                    print("Error in processing a token for page {0}".format(page.id))
                    with open(logfile, "a") as fout:
                        fout.write("Error in processing a token for page {0}.\n".format(page.id))

        found_pages_dict[page.id].update(current_state)

    # ok, done
    return
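# Usage sketch (an assumption, not part of the original code): the page ID,
# file names, and per-page metadata below are hypothetical; wanted_page_ids
# values are dicts that get copied into found_pages_dict and extended with
# the edit/token counts.
#
#   found_pages, user_edits, pages_pwr = {}, {}, {}
#   parse_dump('enwiki-pages-meta-history.xml.bz2',
#              {12345: {'talk_page_title': 'Talk:Example'}},
#              found_pages, user_edits, pages_pwr, 'parse_dump.log')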
def parse_file(input=None, output=None, wp_dir=None, cat_dir=None):
    from json import dumps
    from re import search

    wp_output = wp_dir + output.replace(FILE_TYPE, '') + '_wikiproject' + FILE_TYPE
    cat_output = cat_dir + output.replace(FILE_TYPE, '') + '_category' + FILE_TYPE
    wp_fout = open(wp_output, 'w')
    cat_fout = open(cat_output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        # keep only articles (ns 0) and their talk pages (ns 1)
        if page.namespace != 0 and page.namespace != 1:
            continue
        # only one revision on the current page
        for rev in page:
            # catch rare parsing errors
            try:
                wikicode = mwp.parse(rev.text)
            except Exception:
                print(page.id, page.title, page.namespace)
                continue

            # parse the article page to extract category info of the article
            if page.namespace == 0:
                categories = []
                title = page.title.lower()
                for link in wikicode.filter_wikilinks():
                    if link.title.startswith('Category:'):
                        cate = link.title.lower().replace('category:', "")
                        categories.append(cate)
                        if not LIST_FORMAT:
                            # one record per category
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "category": cate
                            }
                            print(dumps(record), file=cat_fout)
                if LIST_FORMAT:
                    # one record per page, with all categories
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "categories": categories
                    }
                    print(dumps(record), file=cat_fout)

            # parse the talk page to extract wikiproject info of the article
            if page.namespace == 1:
                title = page.title.lower().replace("talk:", "")
                cls = importance = "None"
                wikiprojects = []
                for template in wikicode.filter_templates():
                    if template.name == 'WikiProjectBannerShell':
                        continue
                    if template.name.lower().startswith('wikiproject'):
                        wikiproject = template.name.lower().replace("wikiproject", "").strip()
                        wikiprojects.append(wikiproject)
                        # flatten the template so class/importance can be matched by regex
                        template = str(template).replace("}", "|").replace(" ", "").replace("\n", "")
                        try:
                            cls = search(r'\|class=([a-z-A-Z]+)\|', template).group(1)
                            importance = search(r'\|importance=([a-z-A-Z]+)\|', template).group(1)
                        except AttributeError:
                            pass
                        if not LIST_FORMAT:
                            # one record per wikiproject banner
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "wikiproject": wikiproject,
                                "class": cls.lower(),
                                "importance": importance.lower()
                            }
                            print(dumps(record), file=wp_fout)
                if LIST_FORMAT:
                    # one record per page, with all wikiprojects
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "wikiprojects": wikiprojects,
                        "class": cls.lower(),
                        "importance": importance.lower()
                    }
                    print(dumps(record), file=wp_fout)
    wp_fout.close()
    cat_fout.close()
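# Module-level settings assumed by parse_file above (hypothetical values,
# not part of the original code):
#   FILE_TYPE = '.json'    # shared extension of the input/output file names
#   LIST_FORMAT = False    # False: one record per category/wikiproject; True: one record per page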
)  # geoip2.database.Reader(os.environ['GEO2_DIRECTORY'], maxminddb.MODE_MMAP_EXT)
flagged = FlaggedTools.load('/home/alexander/flagged.pkl')
users = DepRepo.flags()  # UserFlagsTools.load(os.environ['USER_FLAGS'])

#################
d1 = dt.datetime.now()
pp = PageProcessor(flagged, users, db, geoip)
# pp.clear()
cnt = 0
totalcnt = 0
rcnt = 0
# pr = cProfile.Profile()
# pr.enable()

dump = Iterator.from_file(open_file(file_name))
for page in dump:
    totalcnt += 1
    if totalcnt % 50 == 0:
        # periodic progress report: rcnt / cnt / totalcnt plus items pushed so far
        print(str(rcnt) + "/" + str(cnt) + "/" + str(totalcnt) +
              " pushed: " + str(pp.items_pushed))
        gc.collect()
    # check page namespace: keep only articles (ns 0) and templates (ns 10)
    excl = page.namespace != 0 and page.namespace != 10
    if not excl:
        cnt += 1
    rcnt += pp.process(page, excl)
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)

            if (revision.sha1 in spam):
                vandalism = True

            # TODO: SPAM detection: DELETION
            if (revision.comment != None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.text) < CURR_LENGTH) and \
                   (((len(revision.text) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have a contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.id
                    revision_curr.contributor_name = 'Not Available ' + revision.id
                    relation.author = 'Not Available ' + revision.id

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

            if (not vandalism):
                # Add the current revision with all the information.
                revisions.update({revision_curr.wikipedia_id: revision_curr})
                relations.update({revision_curr.wikipedia_id: relation})
                revision_order.append((revision_curr.wikipedia_id, False))

                # Update the fake revision id.
                i = i + 1

                # Calculate the number of tokens in the revision.
                total = 0
                for p in revision_curr.ordered_paragraphs:
                    for paragraph_curr in revision_curr.paragraphs[p]:
                        for hash_sentence_curr in paragraph_curr.sentences.keys():
                            for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                total = total + len(sentence_curr.words)
                revision_curr.total_tokens = total
                relation.total_tokens = total
            else:
                # Mark the revision as vandalism and roll back to the previous state.
                revision_order.append((revision_curr.wikipedia_id, True))
                revision_curr = revision_prev
                spam.append(revision.sha1)

    return (revisions, revision_order, relations)
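# Usage sketch (an assumption, not part of the original code): the dump file
# name is hypothetical, and the module-level containers `revisions`,
# `revision_order` and `spam`, plus the constants FLAG, PREVIOUS_LENGTH,
# CURR_LENGTH and CHANGE_PERCENTAGE, must be defined elsewhere in the module.
#
#   revisions, revision_order, relations = analyseArticle('example-article-history.xml.bz2')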