def main():
    dump = Iterator.from_file(
        bz2.open("/mnt/documents/Divers/frwiki-20150331-pages-articles.xml.bz2"))
    RE_WORD = re.compile(r"[\w-]{1,30}", re.IGNORECASE)

    l = LeveldbStorage(3)  # , path='/home/palkeo/Divers/stage_wikipedia')
    l.clear()

    i = 0
    wcount = 0
    start = datetime.datetime.now()
    sentences = None
    for page in dump:
        i += 1
        print("Article %s, %s tokens, %s tokens/second" %
              (i, wcount, wcount // (datetime.datetime.now() - start).total_seconds()))
        text = str(next(iter(page)).text).lower()
        sentences = text.split(".")
        sentences = list(filter(None, map(lambda p: RE_WORD.findall(p), sentences)))
        for sentence in sentences:
            wcount += len(sentence)
            l.add_sentence(sentence)
def generate_category_relation(input=None, output=None):
    from json import dumps
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        # only parse category pages
        if page.namespace != 14:
            continue
        # only one revision on the current page
        for rev in page:
            # catch rare parsing errors
            try:
                wikicode = mwp.parse(rev.text)
            except Exception:
                print(page.id, page.title, page.namespace)
                continue
            # extract the parent categories linked from this category page
            cate = page.title.lower()[len("category:"):]
            for link in wikicode.filter_wikilinks():
                if link.title.startswith('Category:'):
                    super_cate = link.title.lower().replace('category:', "")
                    record = {
                        "cate": cate,
                        "super_cate": super_cate,
                        "cate_pid": page.id
                    }
                    print(dumps(record), file=fout)
def parse_specific_pages_given_pageID_anonymous(xml_dump, page_id, output_file):
    dump_iter = Iterator.from_file(open(xml_dump, encoding='latin-1'))
    for page_iter in dump_iter:
        if page_iter.id == page_id:
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,
                        page_iter.redirect, page_iter.restrictions,
                        page_iter.__iter__())
            rev_iter_idx = 0
            detector = reverts.Detector()
            # edit_list contains [revision_id, user] pairs to track previous revisions.
            # For anonymous contributors the IP address (user_text) is stored instead of an id.
            edit_list = []
            for rev_iter in page:
                rev_iter_idx = rev_iter_idx + 1
                if rev_iter.contributor.id != None:
                    edit_list.append([rev_iter.id, rev_iter.contributor.id])
                else:
                    edit_list.append([rev_iter.id, rev_iter.contributor.user_text])
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user(edit_list, revert_info.reverting)
                    revertedTo = find_user(edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user(edit_list, revert_info.reverteds[i])
                        # file.write() takes a single string, so format the record first
                        output_file.write("{},{},{}\n".format(reverter, revertedTo, reverted))
            break
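This snippet (and several below) relies on a find_user helper that is not included in the excerpt. A minimal sketch of what such a lookup could look like, assuming edit_list holds [rev_id, user] pairs as built above (hypothetical, not the original implementation):

def find_user(edit_list, rev_id):
    # Hypothetical helper: return the user id (or IP string for anonymous editors)
    # recorded for the given revision id in the [rev_id, user] pairs collected above.
    for entry_rev_id, user in edit_list:
        if entry_rev_id == rev_id:
            return user
    return None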
def get_id2properties(lang, date, output_dir):
    """Build lookup for length of page (bytes)."""
    Page = namedtuple('Page', ['title', 'length'])
    output_fn = os.path.join(output_dir, '{0}_page_props.tsv'.format(lang))
    id2props = {}
    if os.path.exists(output_fn):
        with open(output_fn, 'r') as fin:
            tsvreader = csv.reader(fin, delimiter="\t")
            for line in tsvreader:
                pid = int(line[0])
                title = line[1]
                plen = int(line[2])
                id2props[pid] = Page(title, plen)
    else:
        file_path = build_local_currentpage_dump_fn(lang, date)
        print("Gathering page properties from dump.")
        with bz2.BZ2File(file_path, 'r') as fin:
            d = Iterator.from_file(fin)
            for i, page in enumerate(d, start=1):
                if not page.redirect and page.namespace == 0:
                    curr_rev = next(page)
                    id2props[page.id] = Page(page.title, len(curr_rev.text))
                if i % 1000000 == 0:
                    print("{0} pages evaluated. {1} retained.".format(i, len(id2props)))
        with open(output_fn, 'w') as fout:
            tsvwriter = csv.writer(fout, delimiter="\t")
            for pid in id2props:
                tsvwriter.writerow([pid, id2props[pid].title, id2props[pid].length])
    return id2props
def parse_file(input=None, output=None):
    from json import dumps
    from time import mktime, strptime
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            print(page.title)
            # print(page.redirect_title)
            continue
        for rev in page:
            # timestamps are parsed with the short MediaWiki format, e.g. 20150331235959
            pattern = '%Y%m%d%H%M%S'
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": rev.contributor.user_text,
                "ns": page.namespace,
                "rev_comment": rev.comment,
                "rev_timestamp": epoch
            }
            print(dumps(record), file=fout)
def parse_file(input=None, output=None, bot_file=None):
    #bot_list = load_bots(bot_file)
    from json import dumps
    from time import mktime, strptime
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue
        diff_content = ""
        if page.namespace in [1, 3, 5]:
            print("{},{}".format(page.title, page.namespace))
            revtext = []
            for rev in page:
                pattern = '%Y%m%d%H%M%S'
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                current_revtext = useful_text(rev.text)
                diff_content = diff_sentences(revtext, current_revtext)
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": diff_content
                }
                revtext = current_revtext
                print(dumps(record), file=fout)
        else:
            for rev in page:
                diff_content = "None"
                pattern = '%Y%m%d%H%M%S'
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": diff_content
                }
                print(dumps(record), file=fout)
    return
def parse_revisions(xml_dump, output_file):
    dump_iter = Iterator.from_file(open(xml_dump, encoding='latin-1'))
    #print("------------Site Metadata----------------", file=output_file)
    #print("\nSiteName: ", dump_iter.site_name, "\nBase: ", dump_iter.base,
    #      "\nGenerator: ", dump_iter.generator, "\nCase: ", dump_iter.case, file=output_file)

    #page_iter_idx = 0            # Number of pages
    #cumulative_rev_iter_idx = 0  # Total number of revisions of all pages
    ## Iterate through pages
    #for page_iter in dump_iter:
    #    page_iter_idx = page_iter_idx + 1
    #    rev_iter_idx = 0
    #    # Iterate through a page's revisions
    #    for revision_iter in page_iter:
    #        rev_iter_idx = rev_iter_idx + 1
    #        cumulative_rev_iter_idx = cumulative_rev_iter_idx + 1
    #        #print(revision_iter.id)
    #        #print(page_iter_idx, cumulative_rev_iter_idx)

    page_iter_idx = 0  # Number of pages
    for page_iter in dump_iter:
        if page_iter_idx < 1000:
            page_iter_idx = page_iter_idx + 1
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,
                        page_iter.redirect, page_iter.restrictions,
                        page_iter.__iter__())
            #print("\n", page_iter_idx, ". PageID: ", page.id, file=output_file)
            output_file.write("#\n")
            rev_iter_idx = 0
            detector = reverts.Detector()
            edit_list = []
            for rev_iter in page:
                rev_iter_idx = rev_iter_idx + 1
                # revision = Revision(rev_iter.id, rev_iter.timestamp)
                edit_list.append([rev_iter.id, rev_iter.contributor.id])
                #print(edit_list, file=output_file)
                #print("\n\t", rev_iter_idx, ".", rev_iter, "\n", file=output_file)
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user_including_anonymous(edit_list, revert_info.reverting)
                    revertedTo = find_user_including_anonymous(edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user_including_anonymous(edit_list, revert_info.reverteds[i])
                        # file.write() takes a single string, so format the record first
                        output_file.write("{},{},{}\n".format(reverter, revertedTo, reverted))
def id2text_iterator(self):
    capture_ids = not self.page_ids
    with bz2.BZ2File(self.article_dump, 'r') as fin:
        d = Iterator.from_file(fin)
        for page in d:
            if not page.redirect and page.namespace == 0:
                wikitext = next(page).text
                plaintext = mwparserfromhell.parse(wikitext).strip_code()
                self.page_count += 1
                if capture_ids:
                    self.page_ids.append(page.id)
                yield plaintext
            else:
                self.skipped += 1
    if capture_ids:
        print("{0}: {1} pages yielded. {2} skipped.".format(
            self.article_dump, self.page_count, self.skipped))
def parse_file(input=None, output=None):
    from json import dumps
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue
        record = {
            "page_title": page.title,
            "page_id": page.id,
            "ns": page.namespace
        }
        print(dumps(record), file=fout)
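parse_file above writes one JSON object per line; a small usage sketch for reading that output back with the standard-library json module (the file names here are placeholders, not from the original script):

import json

parse_file(input="pages-articles.xml.bz2", output="pages.jsonl")  # hypothetical paths
with open("pages.jsonl") as fin:
    for line in fin:
        record = json.loads(line)
        print(record["page_id"], record["page_title"], record["ns"])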
def parse_file(input=None, output=None, bot_file=None):
    from json import dumps
    from time import mktime, strptime
    from IPy import IP
    bot_list = load_bots(bot_file)
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue
        for rev in page:
            user_text = rev.contributor.user_text
            # skip anonymous editors (user_text parses as an IP address)
            try:
                IP(user_text)
                continue
            except Exception:
                # registered account: skip it if it is a known bot
                if user_text in bot_list:
                    continue
            pattern = '%Y%m%d%H%M%S'
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": rev.contributor.user_text,
                "ns": page.namespace,
                "rev_timestamp": epoch
            }
            print(dumps(record), file=fout)
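The IPy check above treats any user_text that parses as an IP address as an anonymous editor and skips it. If IPy is not available, the standard-library ipaddress module can perform the same test; a sketch of that alternative (not the original code):

import ipaddress

def is_anonymous(user_text):
    # Anonymous editors appear in the dump with their IP address as user_text.
    try:
        ipaddress.ip_address(user_text)
        return True
    except ValueError:
        return False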
def main():
    #current events templates regexp
    currentevent_templates_r = {
        "cawiki": re.compile(r'(?im)(\{\{\s*(?:Actualitat|Fet[ _]actual|Fets[ _]recents)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'),
        "dewiki": re.compile(r'(?im)(\{\{\s*(?:Laufendes[ _]Ereignis|Laufende[ _]Veranstaltung|Aktuelles[ _]Ereignis)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'),
        "enwiki": re.compile(r'(?im)(\{\{\s*(?:Current|Current[ _]antics|Current[ _]?disaster|Current[ _]election|Current[ _]?events?|Current[ _]news|Current[ _]paragraph|Current[ _]?person|Current[ _-]?related|Currentsect|Current[ _-]?section|Current[ _]spaceflight|Current[ _]?sport|Current[ _]sport-related|Current[ _]sports[ _]transaction|Current[ _]tornado[ _]outbreak|Current[ _]tropical[ _]cyclone|Current[ _]war|Currentwarfare|Flux|Live|Developing|Developingstory|Ongoing[ _]election|Ongoing[ _]event|Recent[ _]?death|Recent[ _]death[ _]presumed|Recent[ _]?event|Recent[ _]news|Recent[ _]related|Related[ _]current)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'),
        "eswiki": re.compile(r'(?im)(\{\{\s*(?:Actual|Actualidad|Actualidad[ _]deporte|Current|EA|Evento[ _]actual|Launching|Muerte[ _]reciente|Sencillo[ _]actual|Single[ _]actual|Telenovela[ _]en[ _]emisión|Teleserie[ _]en[ _]emisión)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'),
    }
    #current events categories regexp
    currentevent_categories_r = {
        "cawiki": re.compile(r'(?im)\[\[\s*(?:Categoria|Category)\s*:\s*Articles[ _]d\'actualitat\s*[\|\]]'),
        "dewiki": re.compile(r'(?im)\[\[\s*(?:Kategorie|Category)\s*:\s*Wikipedia:Laufendes[ _]Ereignis\s*[\|\]]'),
        "enwiki": re.compile(r'(?im)\[\[\s*Category\s*:\s*Current[ _]events\s*[\|\]]'),
        "eswiki": re.compile(r'(?im)\[\[\s*(?:Categoría|Category)\s*:\s*Actualidad\s*[\|\]]'),
    }
    #namespaces to analyse
    wanted_namespaces = {
        "cawiki": [0],       #main
        "dewiki": [0],       #main
        "enwiki": [0],       #main
        "eswiki": [0, 104],  #main, anexo
    }
    #fields to generate
    fields = [
        'page_id',
        'page_namespace',
        'page_title',
        'page_creator',
        'page_creator_type',  #ip, registered, unknown
        'page_creation_date',
        'it_rev_id',  #it = inserted tag
        'it_rev_timestamp',
        'it_rev_username',
        'it_rev_comment',
        'rt_rev_id',  #rt = removed tag
        'rt_rev_timestamp',
        'rt_rev_username',
        'rt_rev_comment',
        'tag_type',  #template, category, both
        'tag_string',
        'tag_time_since_creation_(hours)',
        'tag_duration_(hours)',
        'tag_edits',
        'tag_distinct_editors',
        #'maintenance_templates', #templates for maintenance during current event
        'diff_len',
        'diff_links',
        'diff_extlinks',
        'diff_refs',
        'diff_templates',
        'diff_images',
        'page_moves',
        #ideas: diff_sections
    ]
    #maintenance templates
    maintenance_templates_r = {
        "eswiki": re.compile(r'(?im)(\{\{\s*(?:Actualizar|Ampliación[ _]propuesta|Archivo|Artículo[ _]indirecto/esbozo|Artículo[ _]infraesbozo|Autotrad|Aviso[ _]infraesbozo|Bulo|Cita[ _]requerida|Complejo|Contextualizar|Copyedit|Copyvio|Curiosidades|Desactualizado|Desambiguación|Destruir|Discusión[ _]sosegada|Discutido|En[ _]desarrollo|En[ _]uso|Evento[ _]actual|Evento[ _]futuro|Excesivamente[ _]detallado|Ficticio|Formato[ _]de[ _]cita|FP|Fuentes[ _]no[ _]fiables|Fuente[ _]primaria|Fusionando|Fusionar|Fusionar[ _]desde|Fusionar[ _]en|Infraesbozo|Irrelevante|Largo|Mal[ _]traducido|Mejorar[ _]redacción|No[ _]es[ _]un[ _]foro|No[ _]neutralidad|Página[ _]bloqueada|Plagio|Plagio[ _]externo|Polémico|Posible[ _]copyvio|Posible[ _]fusionar|Problemas[ _]artículo|Promocional|Publicidad|PVfan|Reducido|Referencias|Referencias[ _]adicionales|Renombrar|Revisar[ _]traducción|Separado[ _]de|Separar|Sin[ _]?relevancia|SRA|Traducción|Traducido[ _]de|Transferir[ _]a|Wikificar)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'),
    }
    #regexps for counts
    links_r = re.compile(r'(?im)(\[\[[^\[\]\r\n]+\]\])')  # [[..|?..]]
    extlinks_r = re.compile(r'(?im)(://)')  # ://
    refs_r = re.compile(r'(?im)< */ *ref *>')  # </ref>
    templates_r = re.compile(r'(?im)((?:^|[^\{\}])\{\{[^\{\}])')  # {{
    images_r = re.compile(r'(?im)\[\[\s*(File|Image|Fitxer|Imatge|Datei|Bild|Archivo|Imagen)\s*:')

    #get parameters
    dumpfilename = sys.argv[1]
    chunkid = sys.argv[2]

    #input can be compressed or plain xml
    if dumpfilename.endswith('.7z'):
        #7za or 7zr are valid
        fp = subprocess.Popen('7za e -bd -so %s 2>/dev/null' % dumpfilename,
                              shell=True,
                              stdout=subprocess.PIPE,
                              bufsize=65535)
        pages = Iterator.from_file(fp.stdout)
    elif dumpfilename.endswith('.bz2'):
        import bz2
        source = bz2.BZ2File(dumpfilename)
        pages = Iterator.from_file(source)
    else:
        source = open(dumpfilename)
        pages = Iterator.from_file(source)

    #get dump language and date
    dumplang = dumpfilename.split('/')[-1].split('-')[0]
    dumpdate = datetime.datetime.strptime(
        '%s 23:59:59' % (dumpfilename.split('/')[-1].split('-')[1]),
        '%Y%m%d %H:%M:%S')
    pagecount = 0

    #blank CSV currentevents
    filename = 'currentevents-%s-%s.csv.%s' % (dumplang, dumpdate.strftime('%Y%m%d'), chunkid)
    f = open(filename, 'w', encoding='utf-8')
    output = '{0}\n'.format('|'.join(fields))
    f.write(output)
    f.close()

    #blank CSV pages
    filename = 'pages-%s-%s.csv.%s' % (dumplang, dumpdate.strftime('%Y%m%d'), chunkid)
    g = open(filename, 'w', encoding='utf-8')
    output = 'page_id|page_namespace|page_title|page_creation_rev_id|page_creation_date|page_creator|page_is_redirect\n'
    g.write(output)
    g.close()

    #analyse dump
    for page in pages:
        if int(page.namespace) not in wanted_namespaces[dumplang]:
            #skip unwanted namespaces
            continue

        msg = 'Analysing: {0}'.format(page.title)
        print(msg.encode('utf-8'))
        pagecount += 1
        if pagecount % 100 == 0:
            msg = 'Analysed {0} pages'.format(pagecount)
            print(msg.encode('utf-8'))
        #if pagecount > 2000:
        #    if dumpfilename.endswith('.7z'):
        #        fp.kill()
        #    break

        currentevents = []
        tagged = False
        revcount = 0
        page_creator = ''
        page_creator_type = ''
        pagecreationdate = ''
        page_is_redirect = page.redirect and 'True' or 'False'
        temp = {}  # to detect wrongly removed templates
        prevrevtext = ''
        for rev in page:
            if revcount == 0:
                if rev.contributor:
                    page_creator = rev.contributor.user_text and rev.contributor.user_text or ''
                    page_creator_type = rev.contributor.id and rev.contributor.id != 0 and 'registered' or 'ip'
                else:
                    page_creator = ''
                    page_creator_type = 'unknown'
                pagecreationdate = rev.timestamp
                filename = 'pages-%s-%s.csv.%s' % (dumplang, dumpdate.strftime('%Y%m%d'), chunkid)
                g = csv.writer(open(filename, 'a', encoding='utf-8'),
                               delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                g.writerow([page.id, page.namespace, page.title, rev.id,
                            pagecreationdate.long_format(), page_creator, page_is_redirect])
            revcount += 1
            #print(rev.id)

            rev_user_text = ''
            if rev.contributor:
                rev_user_text = rev.contributor.user_text and rev.contributor.user_text or ''
            revtext = rev.text and rev.text or ''
            revcomment = re.sub(r'\n', '', rev.comment and rev.comment or '')

            if re.search(currentevent_templates_r[dumplang], revtext) or \
               re.search(currentevent_categories_r[dumplang], revtext):
                if tagged:
                    #still is current event
                    currentevents[-1]['tag_edits'] += 1
                    currentevents[-1]['tag_distinct_editors'].add(rev_user_text)
                    #check page moves
                    if pagemoved(revtext, prevrevtext):
                        currentevents[-1]['page_moves'] += 1
                else:
                    #tagged as current event just now
                    if temp:
                        if timediffinhours(temp['rt_rev_timestamp'].long_format(),
                                           rev.timestamp.long_format()) <= 24 * 2:
                            #if it was current event less than X days before, then the template was wrongly removed
                            currentevents[-1] = temp.copy()
                            currentevents[-1]['tag_edits'] += 1
                            currentevents[-1]['tag_distinct_editors'].add(rev_user_text)
                            temp = {}
                            tagged = currentevents[-1]['it_rev_timestamp']
                            continue

                    tagged = rev.timestamp
                    tag_time_since_creation = timediffinhours(pagecreationdate.long_format(),
                                                              rev.timestamp.long_format())
                    print(page.title.encode('utf-8'), tag_time_since_creation)

                    tag_string = 'unknown'
                    if re.search(currentevent_templates_r[dumplang], revtext):
                        #unify a bit the tag, to ease comparison later
                        tag_string = re.findall(currentevent_templates_r[dumplang], revtext)[0].lower().strip()
                        tag_string = re.sub(r'_', r' ', tag_string)
                        tag_string = re.sub(r'\{\{\s+', r'{{', tag_string)
                        tag_string = re.sub(r'\s+\}\}', r'}}', tag_string)
                        tag_string = re.sub(r'\s*\|\s*', r'|', tag_string)
                        tag_string = re.sub(r'\n', r'', tag_string)
                        tag_string = re.sub(r'\|\|+', r'|', tag_string)
                        tag_string = re.sub(r'(?i)\|\s*date\s*\=\s*[A-Za-z0-9 ]+', r'', tag_string)  #remove |date=May 2014 in English WP

                    tag_type = ""
                    if re.search(currentevent_templates_r[dumplang], revtext):
                        tag_type = "template"
                        if re.search(currentevent_categories_r[dumplang], revtext):
                            tag_type = "both"
                    elif re.search(currentevent_categories_r[dumplang], revtext):
                        tag_type = "category"

                    currentevent = {
                        'page_id': str(page.id),
                        'page_namespace': str(page.namespace),
                        'page_title': page.title,
                        'page_creator': page_creator,
                        'page_creator_type': page_creator_type,
                        'page_creation_date': pagecreationdate,
                        'it_rev_id': str(rev.id),
                        'it_rev_timestamp': rev.timestamp,
                        'it_rev_username': rev.contributor.user_text,
                        'it_rev_comment': revcomment and revcomment or "",
                        'rt_rev_id': "",
                        'rt_rev_timestamp': "",
                        'rt_rev_username': "",
                        'rt_rev_comment': "",
                        'tag_type': tag_type,
                        'tag_string': tag_string,
                        'tag_time_since_creation_(hours)': str(tag_time_since_creation),
                        'tag_duration_(hours)': "",
                        'tag_edits': 1,  #counter to increment
                        'tag_distinct_editors': set([rev_user_text]),  #set of unique editors
                        #prevrevtext to catch any change right when is marked as current event
                        'diff_len': len(prevrevtext),
                        'diff_links': len(re.findall(links_r, prevrevtext)),
                        'diff_extlinks': len(re.findall(extlinks_r, prevrevtext)),
                        'diff_refs': len(re.findall(refs_r, prevrevtext)),
                        'diff_templates': len(re.findall(templates_r, prevrevtext)),
                        'diff_images': len(re.findall(images_r, prevrevtext)),
                        'page_moves': 0,
                    }
                    currentevents.append(currentevent)
            else:
                if tagged:
                    #tag has been removed just now
                    temp = currentevents[-1].copy()  #saving temporarily to check if it is added again shortly
                    temp['rt_rev_timestamp'] = rev.timestamp
                    currentevents[-1]['page_creation_date'] = currentevents[-1]['page_creation_date'].long_format()
                    currentevents[-1]['it_rev_timestamp'] = currentevents[-1]['it_rev_timestamp'].long_format()
                    currentevents[-1]['rt_rev_id'] = str(rev.id)
                    currentevents[-1]['rt_rev_timestamp'] = rev.timestamp.long_format()
                    currentevents[-1]['rt_rev_username'] = rev.contributor.user_text
                    currentevents[-1]['rt_rev_comment'] = revcomment and revcomment or ""
                    currentevents[-1]['tag_duration_(hours)'] = timediffinhours(tagged.long_format(),
                                                                                rev.timestamp.long_format())
                    currentevents[-1]['tag_edits'] += 1
                    currentevents[-1]['tag_distinct_editors'].add(rev_user_text)
                    currentevents[-1]['tag_distinct_editors'] = len(currentevents[-1]['tag_distinct_editors'])
                    #revtext because it was current event until this very edit
                    currentevents[-1]['diff_len'] = len(revtext) - currentevents[-1]['diff_len']
                    currentevents[-1]['diff_links'] = len(re.findall(links_r, revtext)) - currentevents[-1]['diff_links']
                    currentevents[-1]['diff_extlinks'] = len(re.findall(extlinks_r, revtext)) - currentevents[-1]['diff_extlinks']
                    currentevents[-1]['diff_refs'] = len(re.findall(refs_r, revtext)) - currentevents[-1]['diff_refs']
                    currentevents[-1]['diff_templates'] = len(re.findall(templates_r, revtext)) - currentevents[-1]['diff_templates']
                    currentevents[-1]['diff_images'] = len(re.findall(images_r, revtext)) - currentevents[-1]['diff_images']
                    currentevents[-1]['page_moves'] += 1
                    tagged = False
                else:
                    if temp:
                        #keep temp updated
                        temp['tag_edits'] += 1
                        temp['tag_distinct_editors'].add(rev_user_text)
                        #check page moves
                        if pagemoved(revtext, prevrevtext):
                            temp['page_moves'] += 1

            prevrevtext = revtext  #needed for diff stats

        if tagged:
            #tagged still as of dumpdate
            currentevents[-1]['page_creation_date'] = currentevents[-1]['page_creation_date'].long_format()
            currentevents[-1]['it_rev_timestamp'] = currentevents[-1]['it_rev_timestamp'].long_format()
            currentevents[-1]['tag_duration_(hours)'] = timediffinhours(tagged.long_format(),
                                                                        dumpdate.strftime("%Y-%m-%dT%H:%M:%SZ"))
            currentevents[-1]['tag_edits'] += 1
            currentevents[-1]['tag_distinct_editors'].add(rev_user_text)
            currentevents[-1]['tag_distinct_editors'] = len(currentevents[-1]['tag_distinct_editors'])
            #use revtext and not prevrevtext because it is still current event
            currentevents[-1]['diff_len'] = len(revtext) - currentevents[-1]['diff_len']
            currentevents[-1]['diff_links'] = len(re.findall(links_r, revtext)) - currentevents[-1]['diff_links']
            currentevents[-1]['diff_extlinks'] = len(re.findall(extlinks_r, revtext)) - currentevents[-1]['diff_extlinks']
            currentevents[-1]['diff_refs'] = len(re.findall(refs_r, revtext)) - currentevents[-1]['diff_refs']
            currentevents[-1]['diff_templates'] = len(re.findall(templates_r, revtext)) - currentevents[-1]['diff_templates']
            currentevents[-1]['diff_images'] = len(re.findall(images_r, revtext)) - currentevents[-1]['diff_images']
            #print page.title.encode('utf-8'), currentevents[-1]
            tagged = False

        filename = 'currentevents-%s-%s.csv.%s' % (dumplang, dumpdate.strftime('%Y%m%d'), chunkid)
        f = csv.writer(open(filename, 'a', encoding='utf-8'),
                       delimiter='|', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for i in currentevents:
            row = [i[field] for field in fields]
            f.writerow(row)

    print('Finished correctly')
def parse_dump(dump_filename, wanted_page_ids, found_pages_dict,
               users_page_edits_dict, pages_pwr_dict, logfile):
    """
    Parse the given dump, processing assessments for the given talk page IDs.

    @param dump_filename: path to the dump file to process
    @type dump_filename: str

    @param wanted_page_ids: dictionary where keys are talk page IDs, values don't matter,
                            we're only using the dict for fast lookups
    @type wanted_page_ids: dict
    """
    # Construct dump file iterator
    dump = Iterator.from_file(functions.open_file(dump_filename))

    bots_file = "resources/wikipedia_bots_full.txt"
    bots = {}
    try:
        with open(bots_file, "r") as fin:
            csvreader = csv.reader(fin)
            for line in csvreader:
                bots[line[0].lower()] = True
    except:
        print("Invalid bots file - only text matching with 'bot' will be used")
        with open(logfile, "a") as fout:
            fout.write("Invalid bots file - only text regex with 'bot' followed by whitespace will be used.\n")

    scripts = ["commonsdelinker", "conversion script"]
    count = 0

    # Iterate through pages
    for page in dump:
        # skip if not a page we want to process
        if not page.id in wanted_page_ids:
            continue
        try:
            with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                fout.write(str(datetime.now()) + ": " + page.title + "\n")
            print(page.title)
        except:
            with open(logfile, "a") as fout:
                fout.write(str(datetime.now()) + ": next spatial article.\n")
            print("next spatial article.")

        state = persistence.State()
        count += 1
        counts_dict = {
            "total_edits": 0,
            "bot_edits": 0,
            "unverified_bot_edits": 0,
            "known_script_edits": 0,
            "anonymous_edits": 0,
            "awb_edits": 0,
            "minor_edits": 0,
            "wpcleaner_edits": 0,
        }

        # Iterate through a page's revisions
        for revision in page:
            # skip if there's no content
            if not revision.text:
                continue
            if revision.comment and "awb" in revision.comment.lower():
                pwr = state.process(revision.text, revision="awb")
            else:
                pwr = state.process(revision.text, revision=revision.contributor.user_text)
            counts_dict["total_edits"] += 1
            try:
                if revision.contributor.user_text:
                    process_rev(revision, counts_dict, bots, scripts,
                                users_page_edits_dict, page.id)
            except:
                try:
                    print("Error in revision.contributor.user_text {0} for page {1}".format(
                        revision.contributor.user_text, page.title))
                    with open(logfile, "a") as fout:
                        fout.write("Error in revision.contributor.user_text {0} for page {1}\n".format(
                            revision.contributor.user_text, page.title))
                except:
                    print("Error in a revision.contributor.user_text for a page.")
                    with open(logfile, "a") as fout:
                        fout.write("Error in a revision.contributor.user_text for a page.")

        found_pages_dict[page.id] = wanted_page_ids[page.id]
        found_pages_dict[page.id].update(counts_dict)

        current_state = {
            "total_tokens": 0,
            "bot_tokens": 0,
            "unverified_bot_tokens": 0,
            "known_script_tokens": 0,
            "anonymous_tokens": 0,
            "awb_tokens": 0,
        }
        # loop through tokens in current state of the page
        for tk in pwr[0]:
            current_state["total_tokens"] += 1
            try:
                if tk.revisions[0]:
                    process_current_page(tk.revisions[0].lower(), current_state,
                                         bots, scripts, pages_pwr_dict, page.id)
            except:
                try:
                    print("Error in processing token {0} for page {1}".format(tk.text, page.id))
                    with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                        fout.write("Error in processing token {0} for page {1}.\n".format(tk.text, str(page.id)))
                except:
                    print("Error in processing a token for page {0}".format(page.id))
                    with open(logfile, "a") as fout:
                        fout.write("Error in processing a token for page {0}.\n".format(page.id))

        found_pages_dict[page.id].update(current_state)

    # ok, done
    return
    # fragment: this branch downloads each dump URL held in `line`; the trailing else
    # pairs with an if-statement from the surrounding script and handles a local file
    # passed via -f
    x = True
    while x:
        name = "tmpdump" + str(n) + ".bz2"
        if os.path.isfile(name):
            n = n + 1
        else:
            x = False
    print("Downloading " + line + "...")
    urllib.request.urlretrieve(line, name, reporthook=dlProgress)
    print("\nAnalyzing " + line + "...")
    dump = Iterator.from_file(bz2.open(name, "r"))
    do(dump, line.split("/")[-1] + ".csv")
    os.remove(name)
else:
    dump = Iterator.from_file(bz2.open(arguments["-f"], "r"))
    do(dump, arguments['-o'])
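The download loop passes a dlProgress callback to urllib.request.urlretrieve that is not defined in this excerpt. A minimal reporthook with the signature urlretrieve expects could look like this (hypothetical, not the original implementation):

import sys

def dlProgress(block_num, block_size, total_size):
    # urlretrieve calls the hook with (blocks transferred so far, block size, total size).
    downloaded = block_num * block_size
    if total_size > 0:
        percent = min(100, downloaded * 100 // total_size)
        sys.stdout.write("\r%d%%" % percent)
        sys.stdout.flush()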
def parse_file(input=None, output=None, wp_dir=None, cat_dir=None):
    from json import dumps
    from re import search
    wp_output = wp_dir + output.replace(FILE_TYPE, '') + '_wikiproject' + FILE_TYPE
    cat_output = cat_dir + output.replace(FILE_TYPE, '') + '_category' + FILE_TYPE
    wp_fout = open(wp_output, 'w')
    cat_fout = open(cat_output, 'w')
    dump = Iterator.from_file(functions.open_file(input))
    for page in dump:
        # print(page.title, page.namespace)
        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        if page.namespace != 0 and page.namespace != 1:
            continue
        # only one revision on the current page
        for rev in page:
            # catch rare parsing errors
            try:
                wikicode = mwp.parse(rev.text)
            except:
                print(page.id, page.title, page.namespace)
                continue
            # parse the article page to extract category info of the article
            if page.namespace == 0:
                categories = []
                title = page.title.lower()
                for link in wikicode.filter_wikilinks():
                    if link.title.startswith('Category:'):
                        cate = link.title.lower().replace('category:', "")
                        categories.append(cate)
                        if not LIST_FORMAT:
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "category": cate
                            }
                            print(dumps(record), file=cat_fout)
                if LIST_FORMAT:
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "categories": categories
                    }
                    print(dumps(record), file=cat_fout)
            # parse the talk page to extract wikiproject info of the article
            if page.namespace == 1:
                title = page.title.lower().replace("talk:", "")
                cls = importance = "None"
                wikiprojects = []
                for template in wikicode.filter_templates():
                    if template.name == 'WikiProjectBannerShell':
                        continue
                    if template.name.lower().startswith('wikiproject'):
                        wikiproject = template.name.lower().replace("wikiproject", "").strip()
                        wikiprojects.append(wikiproject)
                        template = str(template).replace("}", "|").replace(" ", "").replace("\n", "")
                        try:
                            cls = search(r'\|class=([a-z-A-Z]+)\|', template).group(1)
                            importance = search(r'\|importance=([a-z-A-Z]+)\|', template).group(1)
                        except AttributeError:
                            pass
                        if not LIST_FORMAT:
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "wikiproject": wikiproject,
                                "class": cls.lower(),
                                "importance": importance.lower()
                            }
                            print(dumps(record), file=wp_fout)
                if LIST_FORMAT:
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "wikiprojects": wikiprojects,
                        "class": cls.lower(),
                        "importance": importance.lower()
                    }
                    print(dumps(record), file=wp_fout)
def wikiParser(file_name):
    pageMetadata = []
    #create table
    dbCreator.create_table()
    # Construct dump file iterator
    counter = 0
    dump = Iterator.from_file(bz2.open(file_name))
    # dump = Iterator.from_file(open("/Users/alessandro/Documents/PhD/trWiki/trSample.xml"))
    # Iterate through pages
    pageAll = []
    for page in dump:
        if counter == 2500:
            conn = dbCreator.get_db_params()
            cur = conn.cursor()
            try:
                cur.executemany(
                    """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name)
                       VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
                    pageAll)
                conn.commit()
                # print('imported')
            except:
                conn.rollback()
                for stat in pageAll:
                    try:
                        cur.execute(
                            """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name)
                               VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
                            stat)
                        conn.commit()
                    except:
                        conn.rollback()
                        e = sys.exc_info()[0]
                        print("<p>Error: %s</p>" % e)
                        print('not imported, revision id error')
                        print(stat)
            pageAll = []
            counter = 0
        counter += 1

        checksum_revisions = []
        revertsList = []
        pageTitle = page.title.lower().replace(' ', '_')
        pageNS = page.namespace
        # state = persistence.State()
        # Iterate through a page's revisions
        for revision in page:
            revData = {}
            # print(revision.id, revision.contributor, revision.timestamp)
            revData['page'] = pageTitle
            revData['namespace'] = pageNS
            revData['bytes'] = revision.bytes
            revData['revId'] = revision.id
            revData['parentId'] = revision.parent_id
            revData['time_stamp'] = revision.timestamp.long_format().replace('T', ' ').replace('Z', ' ')
            if revision.contributor.id == None:
                revData['userId'] = 'ip'
            else:
                revData['userId'] = revision.contributor.id
            revData['userName'] = revision.contributor.user_text
            revData['revert'] = False
            revData['reverted'] = False
            pageMetadata.append(revData)
            checksum_revisions.append((revision.text, {"rev_id": revision.id}))
            # state.process(revision.text, revision=revision.id)
            # print(state.last)
        revertsList.append(list(reverts.detect(checksum_revisions)))
        # print(revertsList)
        for revvos in revertsList:
            for revvo in revvos:
                for revis in pageMetadata:
                    try:
                        if revis['revId'] == revvo.reverting['rev_id']:
                            revis['revert'] = True
                    except:
                        print(revvo)
                    for reverted in revvo.reverteds:
                        if revis['revId'] == reverted['rev_id']:
                            revis['reverted'] = True
        pageAll += pageMetadata
        pageMetadata = []

    conn = dbCreator.get_db_params()
    cur = conn.cursor()
    try:
        cur.executemany(
            """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name)
               VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
            pageAll)
        conn.commit()
        # print('imported')
    except:
        conn.rollback()
        for stat in pageAll:
            try:
                cur.execute(
                    """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name)
                       VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
                    stat)
                conn.commit()
            except:
                conn.rollback()
                e = sys.exc_info()[0]
                print("<p>Error: %s</p>" % e)
                print('not imported, revision id error')
                print(stat)
""" Prints out all rev_ids that appear in dump.xml. """ from mw.xml_dump import Iterator # Construct dump file iterator dump = Iterator.from_file(open("examples/dump.xml")) # Iterate through pages for page in dump: # Iterate through a page's revisions for revision in page: print(revision.id)
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0
        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)
            if (revision.sha1 in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.comment != None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and \
                   (len(revision.text) < CURR_LENGTH) and \
                   (((len(revision.text) - revision_prev.length) / float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + revision.id
                    revision_curr.contributor_name = 'Not Available ' + revision.id
                    relation.author = 'Not Available ' + revision.id

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)

            if (not vandalism):
                # Add the current revision with all the information.
                revisions.update({revision_curr.wikipedia_id: revision_curr})
                relations.update({revision_curr.wikipedia_id: relation})
                revision_order.append((revision_curr.wikipedia_id, False))
                # Update the fake revision id.
                i = i + 1
                # Calculate the number of tokens in the revision.
                total = 0
                for p in revision_curr.ordered_paragraphs:
                    for paragraph_curr in revision_curr.paragraphs[p]:
                        for hash_sentence_curr in paragraph_curr.sentences.keys():
                            for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                total = total + len(sentence_curr.words)
                revision_curr.total_tokens = total
                relation.total_tokens = total
            else:
                revision_order.append((revision_curr.wikipedia_id, True))
                revision_curr = revision_prev
                spam.append(revision.sha1)

    return (revisions, revision_order, relations)
import csv
import argparse
import re
from mw.xml_dump import Iterator

parser = argparse.ArgumentParser()
parser.add_argument("-i", help="input file location")
parser.add_argument("-o", help="output file location")
args = parser.parse_args()

in_fn = args.i
out_fn = args.o

# Construct dump file iterator
dump = Iterator.from_file(open(in_fn))

with open(out_fn, 'w') as o:
    output = csv.writer(o, delimiter='\t', quotechar='|')
    output.writerow(['pageID', 'page_title', 'namespace', 'editor_id',
                     'username', 'timestamp', 'comment'])
    # Iterate through pages
    for page in dump:
        # Iterate through a page's revisions
        for rev in page:
            # one row per revision; the contributor id fills the editor_id column
            output.writerow([page.id, page.title, page.namespace,
                             rev.contributor.id, rev.contributor.user_text,
                             rev.timestamp, rev.comment])
"type": "date", "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ssZZZ||epoch_millis" } } } } }''' if not es.indices.exists(index): es.indices.create(index=index, body=mapping, ignore=400) else: # query source = wiki,delete by query queryDoc = {'query': {'match': {'source': 'wiki'}}} deleteDoc = es.delete_by_query(index='cms', body=queryDoc) print("delete document!!!") dumpfile = Iterator.from_file(open("/tmp/wiki_dump.xml")) for page in dumpfile: # Iterate through a page's revisions for revision in page: wikiBody = { "id": "revision.id", "url": "https://wiki.xxx.cn/" + page.title, "source": "wiki", "createUser": revision.contributor.user_text, "updateUser": revision.contributor.user_text, "content":
def parse_wikipedia_controversial_articles(xml_stub_history):
    primary_ip_dir = "C:\WikiProject\\"
    internal_ip_dir = "Controversial Single Pages Wikipedia\\"
    controversial_page_titles_fileName = "controversial_page_titles.txt"
    controversial_page_ids_fileName = "controversial_wikipedia_page_ids.txt"

    # Create page_ids file corresponding to page_titles
    inputFile = open(primary_ip_dir + internal_ip_dir + controversial_page_titles_fileName, 'r')
    outputFile = open(primary_ip_dir + internal_ip_dir + controversial_page_ids_fileName, 'w')
    for title in inputFile:
        outputFile.write(title.strip() + " " + find_page_id_given_page_title(title))
        outputFile.write("\n")
    inputFile.close()
    outputFile.close()

    # Parse xml_dump to retrieve the reverts of controversial pages
    inputFile = open(primary_ip_dir + internal_ip_dir + controversial_page_ids_fileName, 'r')
    titles = []
    pageID = []
    for line in inputFile:
        parts = line.split()
        titles.append(parts[0].strip())
        pageID.append(int(parts[1]))
    inputFile.close()

    inputFile = open(xml_stub_history, 'r', encoding='utf-8')
    article_count = 0
    dump_iter = Iterator.from_file(inputFile)
    for page_iter in dump_iter:
        print(page_iter.id)
        if page_iter.id in pageID:
            article_count += 1
            try:
                item_idx = pageID.index(page_iter.id)
            except ValueError:
                continue
            outputFile = open(primary_ip_dir + internal_ip_dir +
                              "Anonymous Inclusion With IP Address\Revision Logs\\" +
                              titles[item_idx] + ".log",
                              mode='w', encoding='utf-8')
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,
                        page_iter.redirect, page_iter.restrictions,
                        page_iter.__iter__())
            rev_iter_idx = 0
            detector = reverts.Detector()
            # edit_list contains tuples <revision_id, user_id> to track previous revisions.
            # For anonymous, user_id is saved in the form <IP address>
            edit_list = []
            for rev_iter in page:
                rev_iter_idx = rev_iter_idx + 1
                if rev_iter.contributor.id != None:
                    edit_list.append([rev_iter.id, rev_iter.contributor.id])
                else:
                    edit_list.append([rev_iter.id, rev_iter.contributor.user_text])
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user(edit_list, revert_info.reverting)
                    revertedTo = find_user(edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user(edit_list, revert_info.reverteds[i])
                        outputFile.write(str(reverter) + "," + str(revertedTo) + "," + str(reverted))
                        outputFile.write("\n")
            outputFile.close()
            # All articles found
            if (article_count == len(pageID)):
                break
    inputFile.close()
def parse_all_wiki_articles_for_rev_history(xml_stub_history, xml_file_no):
    #primary_op_dir = "C:\WikiProject\\"
    #internal_op_dir = "Controversial Single Pages Wikipedia\Wikidumps\Revisions\All Revisions\\"
    primary_op_dir = "/N/u/mmaity/Karst/"
    internal_op_dir = "WikiAnalysis/Wikidumps/Output_Logs/"

    dump_iter = Iterator.from_file(open(xml_stub_history, encoding='latin-1'))
    tot_file_ct = 0
    start = time.time()
    output_file_reverts = open(primary_op_dir + internal_op_dir + "reverts_" + str(xml_file_no) + ".log",
                               mode='w', encoding='utf-8')
    output_file_edits = open(primary_op_dir + internal_op_dir + "edits_" + str(xml_file_no) + ".log",
                             mode='w', encoding='utf-8')
    page_ids = create_set_of_page_ids("page_ids")

    for page_iter in dump_iter:
        if page_iter.id in page_ids:
            print(page_iter.id)
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,
                        page_iter.redirect, page_iter.restrictions,
                        page_iter.__iter__())
            detector = reverts.Detector()
            output_file_reverts.write("#" + str(page_iter.id) + "\n")
            edit_list = []
            for rev_iter in page:
                if rev_iter.contributor.id != None:
                    edit_list.append([rev_iter.id, rev_iter.contributor.id])
                else:
                    edit_list.append([rev_iter.id, rev_iter.contributor.user_text])
                # Detect reverts and save info in reverts_ file
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user(edit_list, revert_info.reverting)
                    revertedTo = find_user(edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user(edit_list, revert_info.reverteds[i])
                        output_file_reverts.write(str(reverter) + "," + str(revertedTo) + "," + str(reverted) + "\n")
            # <page_id, user_id, num_of_revs>
            user_list = {}
            for edit_list_iter in edit_list:
                contributor = edit_list_iter[1]
                if contributor in user_list.keys():
                    user_list[contributor] += 1
                else:
                    user_list[contributor] = 1
            for item in user_list.items():
                output_file_edits.write(str(page_iter.id) + "," + str(item[0]) + "," + str(item[1]) + "\n")
        else:
            print(page_iter.id, "Not Found")

    output_file_reverts.close()
    output_file_edits.close()
    end = time.time()
    print(xml_stub_history)
    print("Total File Count:", tot_file_ct)
    print("Elapsed Time:", (end - start))
)  #geoip2.database.Reader(os.environ['GEO2_DIRECTORY'], maxminddb.MODE_MMAP_EXT)
flagged = FlaggedTools.load('/home/alexander/flagged.pkl')
users = DepRepo.flags()  #UserFlagsTools.load(os.environ['USER_FLAGS'])

#################

d1 = dt.datetime.now()
pp = PageProcessor(flagged, users, db, geoip)
#pp.clear()
cnt = 0
totalcnt = 0
rcnt = 0
#pr = cProfile.Profile()
#pr.enable()

dump = Iterator.from_file(open_file(file_name))
for page in dump:
    totalcnt += 1
    if totalcnt % 50 == 0:
        print(str(rcnt) + "/" + str(cnt) + "/" + str(totalcnt) +
              " pushed: " + str(pp.items_pushed))
        gc.collect()

    excl = page.namespace != 0 and page.namespace != 10  # check page namespace
    if not excl:
        cnt += 1
    rcnt += pp.process(page, excl)
import sys, os
sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw.xml_dump import Iterator

# Construct dump file iterator
dump = Iterator.from_file(open("examples/dump.xml"))

# Iterate through pages
for page in dump:
    # Iterate through a page's revisions
    for revision in page:
        print(revision.id)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dump_file',
                        help="gzipped XML dump file -- e.g., enwiki-20190301-stub-meta-history.xml.gz")
    parser.add_argument('--outdir',
                        help="directory to write monthly editor files")
    parser.add_argument('--botlist_fn',
                        help="Text file containing bot accounts")
    parser.add_argument('--mys', nargs="+",
                        help="List of months to track of form '2016-11'")
    parser.add_argument('--startdate', default='2001-08',
                        help="If not mys, starting month to track of form '2016-11'")
    parser.add_argument('--enddate', default='2019-03',
                        help="If not mys, ending month to track of form '2016-11'")
    parser.add_argument('--stopafter', type=int, default=-1,
                        help="If greater than 0, limit to # of pages to check before stopping")
    args = parser.parse_args()

    # build list of months to track
    if not args.mys:
        args.mys = []
        sd = (int(args.startdate[:4]), int(args.startdate[5:]))
        ed = (int(args.enddate[:4]), int(args.enddate[5:]))
        while ed[0] > sd[0] or ed[1] >= sd[1]:
            args.mys.append('{0}-{1:02}'.format(sd[0], sd[1]))
            if sd[1] == 12:
                sd = (sd[0] + 1, 1)
            else:
                sd = (sd[0], sd[1] + 1)
    print(args)

    # load in bot usernames
    bots = set()
    if args.botlist_fn:
        with open(args.botlist_fn) as fin:
            csvreader = csv.reader(fin)
            for line in csvreader:
                bots.add(line[0].lower())

    # Construct dump file iterator
    dump = Iterator.from_file(gzip.open(args.dump_file))

    editors_by_month = {}
    editor_startdates = {}
    i = 0
    bots_edits_filtered = 0
    user_edits = 0
    anon_edits = 0
    print_every = 25

    # Iterate through pages
    for page in dump:
        # only count edits to article namespace
        if page.namespace != 0:
            continue
        i += 1
        # Iterate through a page's revisions
        for revision in page:
            contributor = revision.contributor
            if not contributor.id or contributor.id == 0:
                anon_edits += 1
                continue
            editor_name = contributor.user_text
            if editor_name.lower() in bots:
                bots_edits_filtered += 1
                continue
            month_year = revision.timestamp.strftime("%Y-%m")
            editor_startdates[editor_name] = min(month_year,
                                                 editor_startdates.get(editor_name, "2100-01"))
            if args.mys and month_year not in args.mys:
                continue
            user_edits += 1
            if month_year not in editors_by_month:
                editors_by_month[month_year] = {}
            editors_by_month[month_year][editor_name] = \
                editors_by_month[month_year].get(editor_name, 0) + 1
        if i == args.stopafter:
            break
        if i % print_every == 0:
            print('{0} completed. On: {1}. {2} bot edits, {3} anon edits, {4} user edits.'
                  .format(i, page.title, bots_edits_filtered, anon_edits, user_edits))
            print_every *= 2

    print('{0} completed. On: {1}. {2} bot edits, {3} anon edits, {4} user edits.'
          .format(i, page.title, bots_edits_filtered, anon_edits, user_edits))

    for my in editors_by_month:
        fn = os.path.join(args.outdir, my)
        with open(fn, 'w') as fout:
            csvwriter = csv.writer(fout)
            csvwriter.writerow(['editor_name', 'edit_count', 'first_edit_dt'])
            d = editors_by_month[my]
            by_editcount = [(k, d[k], editor_startdates[k])
                            for k in sorted(d, key=d.get, reverse=True)]
            for editor_count in by_editcount:
                csvwriter.writerow(editor_count)
def run(self):
    def _process_dump(dump, path):
        try:
            for page in dump:
                logger.debug("Constructing new processor for {0}:{1}"
                             .format(page.namespace, page.title))

                processor_status = self.store.processor_status.get(
                    page.id, type=self.engine.Processor.Status)
                if processor_status is None:
                    processor_status = self.engine.Processor.Status(page.id)

                processor = self.engine.processor(processor_status)

                for rev in page:
                    if rev.id <= processor_status.last_rev_id:
                        logger.debug("Skipping revision (already processed) " +
                                     "{0}:{1}".format(rev.id, rev.timestamp))
                        continue
                    try:
                        user = User(rev.contributor.id, rev.contributor.user_text)
                        delta = processor.process(rev.id, rev.timestamp, rev.text)
                        revision = Revision(rev.id, rev.timestamp, page.id, user, delta)
                        yield (revision, None)
                    except RevisionOrderError as e:
                        logger.error(traceback.format_exc())
                        logger.info("Skipping revision (out of order) " +
                                    "{0}:{1}".format(rev.id, rev.timestamp))

                logger.debug("Finished processing page {0}:{1}"
                             .format(page.namespace, page.title))
                yield (processor.status, page.title)

            logger.debug("Finished processing dump at {0}".format(path))
            yield (path, None)
        except Exception as e:
            logger.error(traceback.format_exc())
            raise

    engine_status = self.store.engine_status.get(type=self.engine.Status)
    if engine_status is None:
        logger.info("Starting {0} from scratch.".format(self.engine.info()))
        engine_status = self.engine.Status(self.engine.info())

    max_rev_id = 0
    max_timestamp = Timestamp(0)

    if len(self.paths) == 1:
        dump = Iterator.from_file(open_file(self.paths[0]))
        rev_proc_or_paths = _process_dump(dump, self.paths[0])
    else:
        rev_proc_or_paths = map(self.paths, _process_dump, **self.map_kwargs)

    try:
        for rev_proc_or_path, meta in rev_proc_or_paths:
            if isinstance(rev_proc_or_path, Revision):
                revision = rev_proc_or_path
                self.store.revisions.store(revision)
                self.status.stats['revisions_processed'] += 1

                max_rev_id = max(revision.rev_id, max_rev_id)
                max_timestamp = max(revision.timestamp, max_timestamp)
            elif isinstance(rev_proc_or_path, ProcessorStatus):
                processor_status = rev_proc_or_path
                page_title = meta
                logger.debug("Completed processing page " +
                             "{0}. {1}".format(page_title, processor_status.stats))
                self.store.processor_status.store(processor_status)
            elif isinstance(rev_proc_or_path, str):
                path = rev_proc_or_path
                logger.info("Completed processing dump {0}".format(path))
            else:
                raise RuntimeError("Did not expect a " +
                                   "{0}".format(type(rev_proc_or_path)))

        self.status.update(max_rev_id, max_timestamp)
        self.store.engine_status.store(engine_status)
    except Exception as e:
        logger.error(traceback.format_exc())
        raise