Example #1
def main():
    dump = Iterator.from_file(
        bz2.open(
            "/mnt/documents/Divers/frwiki-20150331-pages-articles.xml.bz2"))

    RE_WORD = re.compile(r"[\w-]{1,30}", re.IGNORECASE)

    l = LeveldbStorage(3)  # , path='/home/palkeo/Divers/stage_wikipedia')
    l.clear()

    i = 0
    wcount = 0
    start = datetime.datetime.now()
    sentences = None
    for page in dump:
        i += 1
        print("Article %s, %s tokens, %s tokens/second" %
              (i, wcount, wcount //
               (datetime.datetime.now() - start).total_seconds()))

        text = str(next(iter(page)).text).lower()

        sentences = text.split(".")
        sentences = list(
            filter(None, map(lambda p: RE_WORD.findall(p), sentences)))
        for sentence in sentences:
            wcount += len(sentence)
            l.add_sentence(sentence)
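
These snippets omit their imports. A minimal setup that would make Example #1 runnable, assuming the mediawiki-utilities package (which exposes the dump iterator as mw.xml_dump.Iterator) and a project-specific LeveldbStorage class whose only assumed interface is clear() and add_sentence(), might look like this:

# Sketch of the imports Example #1 appears to rely on; LeveldbStorage is a
# project-specific class and its import path is hypothetical.
import bz2
import datetime
import re

from mw.xml_dump import Iterator  # assumed: mediawiki-utilities dump iterator

# from some_project_module import LeveldbStorage  # hypothetical import
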
def generate_category_relation(input=None, output=None):
    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:

        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        # only parse category page
        if page.namespace != 14:
            continue

        # only one revision on the current page
        for rev in page:
            try:
                wikicode = mwp.parse(rev.text)
            except:
                print(page.id, page.title, page.namespace)
                continue

            # parse the article page to extract category info of the article
            cate = page.title.lower()[len("category:"):]
            for link in wikicode.filter_wikilinks():
                if link.title.startswith('Category:'):
                    super_cate = link.title.lower().replace('category:', "")
                    # categories.append(cate)
                    record = {
                        "cate": cate,
                        "super_cate": super_cate,
                        "cate_pid": page.id
                    }
                    from json import dumps
                    print(dumps(record), file=fout)
Example #3
def parse_specific_pages_given_pageID_anonymous(xml_dump, page_id,
                                                output_file):
    dump_iter = Iterator.from_file(open(xml_dump, encoding='latin-1'))
    for page_iter in dump_iter:
        if page_iter.id == page_id:
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,
                        page_iter.redirect, page_iter.restrictions,
                        page_iter.__iter__())
            rev_iter_idx = 0
            detector = reverts.Detector()
            # edit_list contains tuples <revision_id, user_id> to track previous revisions. For anonymous, saved in form -1,<IP address>
            edit_list = []
            for rev_iter in page:
                rev_iter_idx = rev_iter_idx + 1
                if rev_iter.contributor.id != None:
                    edit_list.append([rev_iter.id, rev_iter.contributor.id])
                else:
                    edit_list.append(
                        [rev_iter.id, rev_iter.contributor.user_text])
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user(edit_list, revert_info.reverting)
                    revertedTo = find_user(edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user(edit_list,
                                             revert_info.reverteds[i])
                        output_file.write("{},{},{}\n".format(
                            reverter, revertedTo, reverted))
            break
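
The snippet above assumes a find_user helper that is not shown. A hypothetical implementation, consistent with how edit_list is built (pairs of revision id and either a user id or an IP string for anonymous editors), could be:

def find_user(edit_list, rev_id):
    # Hypothetical helper: return the user id (or IP string for anonymous
    # editors) recorded for the given revision id in edit_list.
    for entry_rev_id, user in edit_list:
        if entry_rev_id == rev_id:
            return user
    return None
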
def get_id2properties(lang, date, output_dir):
    """Build lookup for length of page (bytes)."""
    Page = namedtuple('Page', ['title', 'length'])
    output_fn = os.path.join(output_dir, '{0}_page_props.tsv'.format(lang))
    id2props = {}
    if os.path.exists(output_fn):
        with open(output_fn, 'r') as fin:
            tsvreader = csv.reader(fin, delimiter="\t")
            for line in tsvreader:
                pid = int(line[0])
                title = line[1]
                plen = int(line[2])
                id2props[pid] = Page(title, plen)
    else:
        file_path = build_local_currentpage_dump_fn(lang, date)
        print("Gathering page properties from dump.")
        with bz2.BZ2File(file_path, 'r') as fin:
            d = Iterator.from_file(fin)
            for i, page in enumerate(d, start=1):
                if not page.redirect and page.namespace == 0:
                    curr_rev = next(page)
                    id2props[page.id] = Page(page.title, len(curr_rev.text))
                if i % 1000000 == 0:
                    print("{0} pages evaluated. {1} retained.".format(
                        i, len(id2props)))
        with open(output_fn, 'w') as fout:
            tsvwriter = csv.writer(fout, delimiter="\t")
            for pid in id2props:
                tsvwriter.writerow(
                    [pid, id2props[pid].title, id2props[pid].length])

    return id2props
def parse_file(input=None, output=None):

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            print(page.title)
            # print(page.redirect_title)
            continue

        for rev in page:

            from time import mktime, strptime
            pattern = '%Y%m%d%H%M%S'
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))

            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": rev.contributor.user_text,
                "ns": page.namespace,
                "rev_comment": rev.comment,
                "rev_timestamp": epoch
            }

            from json import dumps
            print(dumps(record), file=fout)
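
Note that time.mktime interprets the parsed struct_time in local time, while dump timestamps are UTC. A sketch of a UTC-safe variant, keeping the same '%Y%m%d%H%M%S' pattern, would use calendar.timegm instead:

import calendar
from time import strptime

def to_epoch_utc(rev_timestamp):
    # Treat the dump timestamp (e.g. "20150331235959") as UTC rather than local time.
    return calendar.timegm(strptime(str(rev_timestamp), '%Y%m%d%H%M%S'))
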
Example #6
def parse_file(input=None, output=None, bot_file=None):

    #bot_list = load_bots(bot_file)

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue
        diff_content = ""

        if page.namespace in [1, 3, 5]:
            print("{},{}".format(page.title, page.namespace))
            revtext = []
            for rev in page:
                from time import mktime, strptime
                pattern = '%Y%m%d%H%M%S'
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                current_revtext = useful_text(rev.text)
                diff_content = diff_sentences(revtext, current_revtext)
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": diff_content
                }
                revtext = current_revtext
                from json import dumps
                print(dumps(record), file=fout)
        else:
            for rev in page:
                diff_content = "None"
                from time import mktime, strptime
                pattern = '%Y%m%d%H%M%S'
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": diff_content
                }
                from json import dumps
                print(dumps(record), file=fout)

    return
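
useful_text and diff_sentences are project helpers that are not included here; useful_text is left undefined, but a hypothetical approximation of diff_sentences based on difflib, assuming both arguments are lists of sentences, might be:

import difflib

def diff_sentences(old_sentences, new_sentences):
    # Hypothetical helper: return the sentences added in the new revision
    # relative to the previous one, joined into a single string.
    matcher = difflib.SequenceMatcher(None, old_sentences, new_sentences)
    added = []
    for tag, _i1, _i2, j1, j2 in matcher.get_opcodes():
        if tag in ('insert', 'replace'):
            added.extend(new_sentences[j1:j2])
    return ' '.join(added)
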
Example #7
def parse_revisions(xml_dump, output_file):
    dump_iter = Iterator.from_file(open(xml_dump, encoding='latin-1'))
    #print("------------Site Metadata----------------", file=output_file)
    #print("\nSiteName: ",dump_iter.site_name,"\nBase: ",dump_iter.base,"\nGenerator:        ",dump_iter.generator,"\nCase: ",dump_iter.case, file=output_file)

    #page_iter_idx = 0 # Number of pages
    #cumulative_rev_iter_idx = 0 # Total number of revisions of all pages
    ## Iterate through pages
    #for page_iter in dump_iter:
    #    page_iter_idx = page_iter_idx+1
    #    rev_iter_idx = 0
    #    # Iterate through a page's revisions
    #    for revision_iter in page_iter:
    #        rev_iter_idx = rev_iter_idx+1
    #        cumulative_rev_iter_idx = cumulative_rev_iter_idx+1
    #        #print(revision_iter.id)
    #
    #print(page_iter_idx, cumulative_rev_iter_idx)

    page_iter_idx = 0  # Number of pages
    for page_iter in dump_iter:
        if page_iter_idx < 1000:
            page_iter_idx = page_iter_idx + 1
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,
                        page_iter.redirect, page_iter.restrictions,
                        page_iter.__iter__())
            #print("\n",page_iter_idx,". PageID: ",page.id, file=output_file)
            output_file.write("#\n")
            rev_iter_idx = 0
            detector = reverts.Detector()
            edit_list = []
            for rev_iter in page:
                rev_iter_idx = rev_iter_idx + 1
                #                revision = Revision(rev_iter.id, rev_iter.timestamp)
                edit_list.append([rev_iter.id, rev_iter.contributor.id])
                #print(edit_list, file=output_file)
                #print("\n\t",rev_iter_idx,".",rev_iter,"\n", file=output_file)
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user_including_anonymous(
                        edit_list, revert_info.reverting)
                    revertedTo = find_user_including_anonymous(
                        edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user_including_anonymous(
                            edit_list, revert_info.reverteds[i])
                        output_file.write("{},{},{}\n".format(
                            reverter, revertedTo, reverted))
def id2text_iterator(self):
    capture_ids = not self.page_ids
    with bz2.BZ2File(self.article_dump, 'r') as fin:
        d = Iterator.from_file(fin)
        for page in d:
            if not page.redirect and page.namespace == 0:
                wikitext = next(page).text
                plaintext = mwparserfromhell.parse(wikitext).strip_code()
                self.page_count += 1
                if capture_ids:
                    self.page_ids.append(page.id)
                yield plaintext
            else:
                self.skipped += 1
    if capture_ids:
        print("{0}: {1} pages yielded. {2} skipped.".format(
            self.article_dump, self.page_count, self.skipped))
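
id2text_iterator is a method torn out of its class. A minimal wrapper consistent with the attributes it touches (article_dump, page_ids, page_count, skipped) might look like this; the class name and constructor are hypothetical:

class DumpTextExtractor:
    # Hypothetical container for the generator above; only the attributes
    # referenced by id2text_iterator are defined here.
    def __init__(self, article_dump, page_ids=None):
        self.article_dump = article_dump   # path to a pages-articles .bz2 dump
        self.page_ids = page_ids or []     # filled on the first pass if empty
        self.page_count = 0
        self.skipped = 0
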
Example #9
def parse_file(input=None, output=None):

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue

        record = {
            "page_title": page.title,
            "page_id": page.id,
            "ns": page.namespace
        }
        from json import dumps
        print(dumps(record), file=fout)
Example #10
def parse_file(input=None, output=None, bot_file=None):

    bot_list = load_bots(bot_file)

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue

        for rev in page:

            user_text = rev.contributor.user_text
            from IPy import IP
            try:
                IP(user_text)
                continue
            except:
                if user_text in bot_list:
                    continue

            from time import mktime, strptime
            pattern = '%Y%m%d%H%M%S'
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))

            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": rev.contributor.user_text,
                "ns": page.namespace,
                "rev_timestamp": epoch
            }

            from json import dumps
            print(dumps(record), file=fout)
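
load_bots is not shown in this snippet. A hypothetical version that reads one bot username per line into a set (so the `user_text in bot_list` check is a constant-time lookup) could be:

def load_bots(bot_file):
    # Hypothetical helper: return the set of bot user names listed in bot_file.
    with open(bot_file, 'r', encoding='utf-8') as fin:
        return {line.strip() for line in fin if line.strip()}
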
Example #12
def main():
    #current events templates regexp
    currentevent_templates_r = {
        "cawiki":
        re.compile(
            r'(?im)(\{\{\s*(?:Actualitat|Fet[ _]actual|Fets[ _]recents)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'
        ),
        "dewiki":
        re.compile(
            r'(?im)(\{\{\s*(?:Laufendes[ _]Ereignis|Laufende[ _]Veranstaltung|Aktuelles[ _]Ereignis)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'
        ),
        "enwiki":
        re.compile(
            r'(?im)(\{\{\s*(?:Current|Current[ _]antics|Current[ _]?disaster|Current[ _]election|Current[ _]?events?|Current[ _]news|Current[ _]paragraph|Current[ _]?person|Current[ _-]?related|Currentsect|Current[ _-]?section|Current[ _]spaceflight|Current[ _]?sport|Current[ _]sport-related|Current[ _]sports[ _]transaction|Current[ _]tornado[ _]outbreak|Current[ _]tropical[ _]cyclone|Current[ _]war|Currentwarfare|Flux|Live|Developing|Developingstory|Ongoing[ _]election|Ongoing[ _]event|Recent[ _]?death|Recent[ _]death[ _]presumed|Recent[ _]?event|Recent[ _]news|Recent[ _]related|Related[ _]current)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'
        ),
        "eswiki":
        re.compile(
            r'(?im)(\{\{\s*(?:Actual|Actualidad|Actualidad[ _]deporte|Current|EA|Evento[ _]actual|Launching|Muerte[ _]reciente|Sencillo[ _]actual|Single[ _]actual|Telenovela[ _]en[ _]emisión|Teleserie[ _]en[ _]emisión)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'
        ),
    }
    #current events categories regexp
    currentevent_categories_r = {
        "cawiki":
        re.compile(
            r'(?im)\[\[\s*(?:Categoria|Category)\s*:\s*Articles[ _]d\'actualitat\s*[\|\]]'
        ),
        "dewiki":
        re.compile(
            r'(?im)\[\[\s*(?:Kategorie|Category)\s*:\s*Wikipedia:Laufendes[ _]Ereignis\s*[\|\]]'
        ),
        "enwiki":
        re.compile(r'(?im)\[\[\s*Category\s*:\s*Current[ _]events\s*[\|\]]'),
        "eswiki":
        re.compile(
            r'(?im)\[\[\s*(?:Categoría|Category)\s*:\s*Actualidad\s*[\|\]]'),
    }
    #namespaces to analyse
    wanted_namespaces = {
        "cawiki": [0],  #main
        "dewiki": [0],  #main
        "enwiki": [0],  #main
        "eswiki": [0, 104],  #main, anexo
    }
    #fields to generate
    fields = [
        'page_id',
        'page_namespace',
        'page_title',
        'page_creator',
        'page_creator_type',  #ip, registered, unknown
        'page_creation_date',
        'it_rev_id',  #it = inserted tag
        'it_rev_timestamp',
        'it_rev_username',
        'it_rev_comment',
        'rt_rev_id',  #rt = removed tag
        'rt_rev_timestamp',
        'rt_rev_username',
        'rt_rev_comment',
        'tag_type',  #template, category, both
        'tag_string',
        'tag_time_since_creation_(hours)',
        'tag_duration_(hours)',
        'tag_edits',
        'tag_distinct_editors',
        #'maintenance_templates', #templates for maintenance during current event
        'diff_len',
        'diff_links',
        'diff_extlinks',
        'diff_refs',
        'diff_templates',
        'diff_images',
        'page_moves',
        #ideas: diff_sections
    ]
    #maintenance templates
    maintenance_templates_r = {
        "eswiki":
        re.compile(
            r'(?im)(\{\{\s*(?:Actualizar|Ampliación[ _]propuesta|Archivo|Artículo[ _]indirecto/esbozo|Artículo[ _]infraesbozo|Autotrad|Aviso[ _]infraesbozo|Bulo|Cita[ _]requerida|Complejo|Contextualizar|Copyedit|Copyvio|Curiosidades|Desactualizado|Desambiguación|Destruir|Discusión[ _]sosegada|Discutido|En[ _]desarrollo|En[ _]uso|Evento[ _]actual|Evento[ _]futuro|Excesivamente[ _]detallado|Ficticio|Formato[ _]de[ _]cita|FP|Fuentes[ _]no[ _]fiables|Fuente[ _]primaria|Fusionando|Fusionar|Fusionar[ _]desde|Fusionar[ _]en|Infraesbozo|Irrelevante|Largo|Mal[ _]traducido|Mejorar[ _]redacción|No[ _]es[ _]un[ _]foro|No[ _]neutralidad|Página[ _]bloqueada|Plagio|Plagio[ _]externo|Polémico|Posible[ _]copyvio|Posible[ _]fusionar|Problemas[ _]artículo|Promocional|Publicidad|PVfan|Reducido|Referencias|Referencias[ _]adicionales|Renombrar|Revisar[ _]traducción|Separado[ _]de|Separar|Sin[ _]?relevancia|SRA|Traducción|Traducido[ _]de|Transferir[ _]a|Wikificar)\s*(?:\|[^\{\}\n]*?\s*\}\}|\}\}))'
        ),
    }
    #regexps for counts
    links_r = re.compile(r'(?im)(\[\[[^\[\]\r\n]+\]\])')  # [[..|?..]]
    extlinks_r = re.compile(r'(?im)(://)')  # ://
    refs_r = re.compile(r'(?im)< */ *ref *>')  # </ref>
    templates_r = re.compile(r'(?im)((?:^|[^\{\}])\{\{[^\{\}])')  # {{
    images_r = re.compile(
        r'(?im)\[\[\s*(File|Image|Fitxer|Imatge|Datei|Bild|Archivo|Imagen)\s*:'
    )

    #get parameters
    dumpfilename = sys.argv[1]
    chunkid = sys.argv[2]
    #input can be compressed or plain xml
    if dumpfilename.endswith('.7z'):
        #7za or 7zr are valid
        fp = subprocess.Popen('7za e -bd -so %s 2>/dev/null' % dumpfilename,
                              shell=True,
                              stdout=subprocess.PIPE,
                              bufsize=65535)
        pages = Iterator.from_file(fp.stdout)
    elif dumpfilename.endswith('.bz2'):
        import bz2
        source = bz2.BZ2File(dumpfilename)
        pages = Iterator.from_file(source)
    else:
        source = open(dumpfilename)
        pages = Iterator.from_file(source)

    #get dump language and date
    dumplang = dumpfilename.split('/')[-1].split('-')[0]
    dumpdate = datetime.datetime.strptime(
        '%s 23:59:59' % (dumpfilename.split('/')[-1].split('-')[1]),
        '%Y%m%d %H:%M:%S')
    pagecount = 0

    #blank CSV currentevents
    filename = 'currentevents-%s-%s.csv.%s' % (
        dumplang, dumpdate.strftime('%Y%m%d'), chunkid)
    f = open(filename, 'w', encoding='utf-8')
    output = '{0}\n'.format('|'.join(fields))
    f.write(output)
    f.close()
    #blank CSV pages
    filename = 'pages-%s-%s.csv.%s' % (dumplang, dumpdate.strftime('%Y%m%d'),
                                       chunkid)
    g = open(filename, 'w', encoding='utf-8')
    output = 'page_id|page_namespace|page_title|page_creation_rev_id|page_creation_date|page_creator|page_is_redirect\n'
    g.write(output)
    g.close()

    #analyse dump
    for page in pages:
        if int(page.namespace) not in wanted_namespaces[dumplang]:  #skip unwanted namespaces
            continue
        msg = 'Analysing: {0}'.format(page.title)
        print(msg.encode('utf-8'))

        pagecount += 1
        if pagecount % 100 == 0:
            msg = 'Analysed {0} pages'.format(pagecount)
            print(msg.encode('utf-8'))
        #if pagecount > 2000:
        #    if dumpfilename.endswith('.7z'):
        #        fp.kill()
        #    break

        currentevents = []
        tagged = False
        revcount = 0
        page_creator = ''
        page_creator_type = ''
        pagecreationdate = ''
        page_is_redirect = page.redirect and 'True' or 'False'
        temp = {}  # to detect wrongly removed templates
        prevrevtext = ''
        for rev in page:
            if revcount == 0:
                if rev.contributor:
                    page_creator = rev.contributor.user_text and rev.contributor.user_text or ''
                    page_creator_type = rev.contributor.id and rev.contributor.id != 0 and 'registered' or 'ip'
                else:
                    page_creator = ''
                    page_creator_type = 'unknown'
                pagecreationdate = rev.timestamp
                filename = 'pages-%s-%s.csv.%s' % (
                    dumplang, dumpdate.strftime('%Y%m%d'), chunkid)
                g = csv.writer(open(filename, 'a', encoding='utf-8'),
                               delimiter='|',
                               quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
                g.writerow([
                    page.id, page.namespace, page.title, rev.id,
                    pagecreationdate.long_format(), page_creator,
                    page_is_redirect
                ])
            revcount += 1
            #print (rev.id)
            rev_user_text = ''
            if rev.contributor:
                rev_user_text = rev.contributor.user_text and rev.contributor.user_text or ''
            revtext = rev.text and rev.text or ''
            revcomment = re.sub(r'\n', '', rev.comment and rev.comment or '')
            if re.search(currentevent_templates_r[dumplang],
                         revtext) or re.search(
                             currentevent_categories_r[dumplang], revtext):
                if tagged:
                    #still is current event
                    currentevents[-1]['tag_edits'] += 1
                    currentevents[-1]['tag_distinct_editors'].add(
                        rev_user_text)
                    #check page moves
                    if pagemoved(revtext, prevrevtext):
                        currentevents[-1]['page_moves'] += 1
                else:
                    #tagged as current event just now
                    if temp:
                        if timediffinhours(
                                temp['rt_rev_timestamp'].long_format(),
                                rev.timestamp.long_format()) <= 24 * 2:
                            #if it was current event less than X days before, then the template was wrongly removed
                            currentevents[-1] = temp.copy()
                            currentevents[-1]['tag_edits'] += 1
                            currentevents[-1]['tag_distinct_editors'].add(
                                rev_user_text)
                            temp = {}
                            tagged = currentevents[-1]['it_rev_timestamp']
                            continue

                    tagged = rev.timestamp
                    tag_time_since_creation = timediffinhours(
                        pagecreationdate.long_format(),
                        rev.timestamp.long_format())
                    print(page.title.encode('utf-8'), tag_time_since_creation)

                    tag_string = 'unknown'
                    if re.search(currentevent_templates_r[dumplang], revtext):
                        #unify a bit the tag, to ease comparison later
                        tag_string = re.findall(
                            currentevent_templates_r[dumplang],
                            revtext)[0].lower().strip()
                        tag_string = re.sub(r'_', r' ', tag_string)
                        tag_string = re.sub(r'\{\{\s+', r'{{', tag_string)
                        tag_string = re.sub(r'\s+\}\}', r'}}', tag_string)
                        tag_string = re.sub(r'\s*\|\s*', r'|', tag_string)
                        tag_string = re.sub(r'\n', r'', tag_string)
                        tag_string = re.sub(r'\|\|+', r'|', tag_string)
                        tag_string = re.sub(
                            r'(?i)\|\s*date\s*\=\s*[A-Za-z0-9 ]+', r'',
                            tag_string)  #remove |date=May 2014 in English WP
                    tag_type = ""
                    if re.search(currentevent_templates_r[dumplang], revtext):
                        tag_type = "template"
                        if re.search(currentevent_categories_r[dumplang],
                                     revtext):
                            tag_type = "both"
                    elif re.search(currentevent_categories_r[dumplang],
                                   revtext):
                        tag_type = "category"

                    currentevent = {
                        'page_id': str(page.id),
                        'page_namespace': str(page.namespace),
                        'page_title': page.title,
                        'page_creator': page_creator,
                        'page_creator_type': page_creator_type,
                        'page_creation_date': pagecreationdate,
                        'it_rev_id': str(rev.id),
                        'it_rev_timestamp': rev.timestamp,
                        'it_rev_username': rev.contributor.user_text,
                        'it_rev_comment': revcomment and revcomment or "",
                        'rt_rev_id': "",
                        'rt_rev_timestamp': "",
                        'rt_rev_username': "",
                        'rt_rev_comment': "",
                        'tag_type': tag_type,
                        'tag_string': tag_string,
                        'tag_time_since_creation_(hours)': str(tag_time_since_creation),
                        'tag_duration_(hours)': "",
                        'tag_edits': 1,  #counter to increment
                        'tag_distinct_editors': set([rev_user_text]),  #set of unique editors
                        #prevrevtext to catch any change right when is marked as current event
                        'diff_len': len(prevrevtext),
                        'diff_links': len(re.findall(links_r, prevrevtext)),
                        'diff_extlinks': len(re.findall(extlinks_r, prevrevtext)),
                        'diff_refs': len(re.findall(refs_r, prevrevtext)),
                        'diff_templates': len(re.findall(templates_r, prevrevtext)),
                        'diff_images': len(re.findall(images_r, prevrevtext)),
                        'page_moves': 0,
                    }
                    currentevents.append(currentevent)
            else:
                if tagged:
                    #tag has been removed just now

                    temp = currentevents[-1].copy()  #saving temporarily to check if it is added again shortly
                    temp['rt_rev_timestamp'] = rev.timestamp

                    currentevents[-1]['page_creation_date'] = currentevents[
                        -1]['page_creation_date'].long_format()
                    currentevents[-1]['it_rev_timestamp'] = currentevents[-1][
                        'it_rev_timestamp'].long_format()
                    currentevents[-1]['rt_rev_id'] = str(rev.id)
                    currentevents[-1][
                        'rt_rev_timestamp'] = rev.timestamp.long_format()
                    currentevents[-1][
                        'rt_rev_username'] = rev.contributor.user_text
                    currentevents[-1][
                        'rt_rev_comment'] = revcomment and revcomment or ""
                    currentevents[-1][
                        'tag_duration_(hours)'] = timediffinhours(
                            tagged.long_format(), rev.timestamp.long_format())
                    currentevents[-1]['tag_edits'] += 1
                    currentevents[-1]['tag_distinct_editors'].add(
                        rev_user_text)
                    currentevents[-1]['tag_distinct_editors'] = len(
                        currentevents[-1]['tag_distinct_editors'])
                    currentevents[-1]['diff_len'] = len(
                        revtext) - currentevents[-1]['diff_len']
                    #revtext because it was current event until this very edit
                    currentevents[-1]['diff_links'] = len(
                        re.findall(links_r,
                                   revtext)) - currentevents[-1]['diff_links']
                    currentevents[-1]['diff_extlinks'] = len(
                        re.findall(
                            extlinks_r,
                            revtext)) - currentevents[-1]['diff_extlinks']
                    currentevents[-1]['diff_refs'] = len(
                        re.findall(refs_r,
                                   revtext)) - currentevents[-1]['diff_refs']
                    currentevents[-1]['diff_templates'] = len(
                        re.findall(
                            templates_r,
                            revtext)) - currentevents[-1]['diff_templates']
                    currentevents[-1]['diff_images'] = len(
                        re.findall(images_r,
                                   revtext)) - currentevents[-1]['diff_images']
                    currentevents[-1]['page_moves'] += 1
                    tagged = False
                else:
                    if temp:
                        #keep temp updated
                        temp['tag_edits'] += 1
                        temp['tag_distinct_editors'].add(rev_user_text)
                        #check page moves
                        if pagemoved(revtext, prevrevtext):
                            temp['page_moves'] += 1

            prevrevtext = revtext  #needed for diff stats

        if tagged:
            #tagged still as of dumpdate
            currentevents[-1]['page_creation_date'] = currentevents[-1][
                'page_creation_date'].long_format()
            currentevents[-1]['it_rev_timestamp'] = currentevents[-1][
                'it_rev_timestamp'].long_format()
            currentevents[-1]['tag_duration_(hours)'] = timediffinhours(
                tagged.long_format(), dumpdate.strftime("%Y-%m-%dT%H:%M:%SZ"))
            currentevents[-1]['tag_edits'] += 1
            currentevents[-1]['tag_distinct_editors'].add(rev_user_text)
            currentevents[-1]['tag_distinct_editors'] = len(
                currentevents[-1]['tag_distinct_editors'])
            #use revtext and not prevrevtext because it is still current event
            currentevents[-1]['diff_len'] = len(
                revtext) - currentevents[-1]['diff_len']
            currentevents[-1]['diff_links'] = len(re.findall(
                links_r, revtext)) - currentevents[-1]['diff_links']
            currentevents[-1]['diff_extlinks'] = len(
                re.findall(extlinks_r,
                           revtext)) - currentevents[-1]['diff_extlinks']
            currentevents[-1]['diff_refs'] = len(re.findall(
                refs_r, revtext)) - currentevents[-1]['diff_refs']
            currentevents[-1]['diff_templates'] = len(
                re.findall(templates_r,
                           revtext)) - currentevents[-1]['diff_templates']
            currentevents[-1]['diff_images'] = len(
                re.findall(images_r,
                           revtext)) - currentevents[-1]['diff_images']
            #print page.title.encode('utf-8'), currentevents[-1]
            tagged = False

        filename = 'currentevents-%s-%s.csv.%s' % (
            dumplang, dumpdate.strftime('%Y%m%d'), chunkid)
        f = csv.writer(open(filename, 'a', encoding='utf-8'),
                       delimiter='|',
                       quotechar='"',
                       quoting=csv.QUOTE_MINIMAL)
        for i in currentevents:
            row = [i[field] for field in fields]
            f.writerow(row)

    print('Finished correctly')
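
timediffinhours and pagemoved are helpers defined elsewhere in that project. Hedged sketches that match how they are called above (long_format() timestamps such as '2015-03-31T23:59:59Z', and a redirect check on consecutive revision texts) might be:

import datetime

TS_FORMAT = '%Y-%m-%dT%H:%M:%SZ'  # matches the long format compared against dumpdate above

def timediffinhours(start_ts, end_ts):
    # Hypothetical helper: hours elapsed between two long-format timestamps.
    start = datetime.datetime.strptime(start_ts, TS_FORMAT)
    end = datetime.datetime.strptime(end_ts, TS_FORMAT)
    return (end - start).total_seconds() / 3600.0

def pagemoved(revtext, prevrevtext):
    # Hypothetical helper: guess that the page was moved when a revision
    # turns into (or stops being) a #REDIRECT stub.
    return revtext.lstrip().lower().startswith('#redirect') != \
           prevrevtext.lstrip().lower().startswith('#redirect')
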
def parse_dump(dump_filename, wanted_page_ids, found_pages_dict, users_page_edits_dict, pages_pwr_dict, logfile):
    """
    Parse the given dump, processing assessments for the given
    talk page IDs.

    @param dump_filename: path to the dump file to process
    @type dump_filename: str

    @param wanted_page_ids: dictionary where keys are talk page IDs,
                            values don't matter, we're only using the
                            dict for fast lookups
    @type wanted_page_ids: dict
    """

    # Construct dump file iterator
    dump = Iterator.from_file(functions.open_file(dump_filename))

    bots_file = "resources/wikipedia_bots_full.txt"
    bots = {}
    try:
        with open(bots_file, "r") as fin:
            csvreader = csv.reader(fin)
            for line in csvreader:
                bots[line[0].lower()] = True
    except:
        print("Invalid bots file - only text matching with 'bot' will be used")
        with open(logfile, "a") as fout:
            fout.write("Invalid bots file - only text regex with 'bot' followed by whitespace will be used.\n")

    scripts = ["commonsdelinker", "conversion script"]

    count = 0
    # Iterate through pages
    for page in dump:
        # skip if not a page we want to process
        if not page.id in wanted_page_ids:
            continue
        try:
            with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                fout.write(str(datetime.now()) + ": " + page.title + "\n")
            print(page.title)
        except:
            with open(logfile, "a") as fout:
                fout.write(str(datetime.now()) + ": next spatial article.\n")
            print("next spatial article.")

        state = persistence.State()

        count += 1
        counts_dict = {
            "total_edits": 0,
            "bot_edits": 0,
            "unverified_bot_edits": 0,
            "known_script_edits": 0,
            "anonymous_edits": 0,
            "awb_edits": 0,
            "minor_edits": 0,
            "wpcleaner_edits": 0,
        }

        # Iterate through a page's revisions
        for revision in page:
            # skip if there's no content
            if not revision.text:
                continue

            if revision.comment and "awb" in revision.comment.lower():
                pwr = state.process(revision.text, revision="awb")
            else:
                pwr = state.process(revision.text, revision=revision.contributor.user_text)

            counts_dict["total_edits"] += 1
            try:
                if revision.contributor.user_text:
                    process_rev(revision, counts_dict, bots, scripts, users_page_edits_dict, page.id)
            except:
                try:
                    print(
                        "Error in revision.contributor.user_text {0} for page {1}".format(
                            revision.contributor.user_text, page.title
                        )
                    )
                    with open(logfile, "a") as fout:
                        fout.write(
                            "Error in revision.contributor.user_text {0} for page {1}\n".format(
                                revision.contributor.user_text, page.title
                            )
                        )
                except:
                    print("Error in a revision.contributor.user_text for a page.")
                    with open(logfile, "a") as fout:
                        fout.write("Error in a revision.contributor.user_text for a page.")

        found_pages_dict[page.id] = wanted_page_ids[page.id]
        found_pages_dict[page.id].update(counts_dict)

        current_state = {
            "total_tokens": 0,
            "bot_tokens": 0,
            "unverified_bot_tokens": 0,
            "known_script_tokens": 0,
            "anonymous_tokens": 0,
            "awb_tokens": 0,
        }

        for tk in pwr[0]:  # loop through tokens in current state of the page
            current_state["total_tokens"] += 1
            try:
                if tk.revisions[0]:
                    process_current_page(tk.revisions[0].lower(), current_state, bots, scripts, pages_pwr_dict, page.id)
            except:
                try:
                    print("Error in processing token {0} for page {1}".format(tk.text, page.id))
                    with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                        fout.write("Error in processing token {0} for page {1}.\n".format(tk.text, str(page.id)))
                except:
                    print("Error in processing a token for page {0}".format(page.id))
                    with open(logfile, "a") as fout:
                        fout.write("Error in processing a token for page {0}.\n".format(page.id))
        found_pages_dict[page.id].update(current_state)

    # ok, done
    return
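
The persistence object used above appears to be mw.lib.persistence.State from mediawiki-utilities (an assumption; only the calls visible in parse_dump are relied on): process() is fed every revision oldest-first, and the first element of its return value is the token list of the page's current text, where each token records the revision labels that introduced it. A minimal sketch of that usage pattern, which also guards against pages whose revisions all lack text:

from mw.lib import persistence  # assumed import; only State() and process() are used

def last_state_tokens(page):
    # Sketch only: returns the tokens of the page's current text, each with
    # .text and .revisions (the labels passed to process()).
    state = persistence.State()
    pwr = None
    for revision in page:
        if not revision.text:
            continue
        pwr = state.process(revision.text, revision=revision.contributor.user_text)
    if pwr is None:
        return []
    return pwr[0]
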
            x = True
            while x:
                name = "tmpdump" + str(n) + ".bz2"
                if os.path.isfile(name):
                    n = n + 1
                else:
                    x = False

            print("Downloading " + line + "...")
            urllib.request.urlretrieve(line, name, reporthook=dlProgress)

            print("\nAnalyzing " + line + "...")
            dump = Iterator.from_file(bz2.open(name, "r"))
            do(dump, line.split("/")[-1] + ".csv")
            os.remove(name)

    else:
        dump = Iterator.from_file(bz2.open(arguments["-f"], "r"))
        do(dump, arguments['-o'])
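
dlProgress is the reporthook passed to urllib.request.urlretrieve above; a hypothetical implementation using the standard (block_num, block_size, total_size) callback signature could be:

import sys

def dlProgress(block_num, block_size, total_size):
    # Hypothetical reporthook: print a crude download percentage in place.
    if total_size > 0:
        percent = min(100, block_num * block_size * 100 // total_size)
        sys.stdout.write("\r%d%%" % percent)
        sys.stdout.flush()
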
def parse_file(input=None, output=None, wp_dir=None, cat_dir=None):

    wp_output = wp_dir + output.replace(FILE_TYPE,
                                        '') + '_wikiproject' + FILE_TYPE
    cat_output = cat_dir + output.replace(FILE_TYPE,
                                          '') + '_category' + FILE_TYPE
    wp_fout = open(wp_output, 'w')
    cat_fout = open(cat_output, 'w')

    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # print(page.title, page.namespace)
        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        if page.namespace != 0 and page.namespace != 1:
            continue

        # only one revision on the current page
        for rev in page:
            # catch rare parsing errors
            try:
                wikicode = mwp.parse(rev.text)
            except:
                print(page.id, page.title, page.namespace)
                continue

            # parse the article page to extract category info of the article
            if page.namespace == 0:
                categories = []
                title = page.title.lower()
                for link in wikicode.filter_wikilinks():
                    if link.title.startswith('Category:'):
                        cate = link.title.lower().replace('category:', "")
                        categories.append(cate)

                        if not LIST_FORMAT:
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "category": cate
                            }
                            print(dumps(record), file=cat_fout)

                if LIST_FORMAT:
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "categories": categories
                    }
                    print(dumps(record), file=cat_fout)

            # parse the talk page to extract wikiproject info of the article
            if page.namespace == 1:
                title = page.title.lower().replace("talk:", "")
                cls = importance = "None"
                wikiprojects = []

                for template in wikicode.filter_templates():
                    if template.name == 'WikiProjectBannerShell':
                        continue

                    if template.name.lower().startswith('wikiproject'):
                        wikiproject = template.name.lower().replace(
                            "wikiproject", "").strip()
                        wikiprojects.append(wikiproject)
                        template = str(template).replace("}", "|").replace(
                            " ", "").replace("\n", "")

                        try:
                            cls = search(r'\|class=([a-z-A-Z]+)\|',
                                         template).group(1)
                            importance = search(r'\|importance=([a-z-A-Z]+)\|',
                                                template).group(1)
                        except AttributeError:
                            pass

                        if not LIST_FORMAT:
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "wikiproject": wikiproject,
                                "class": cls.lower(),
                                "importance": importance.lower()
                            }
                            print(dumps(record), file=wp_fout)

                if LIST_FORMAT:
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "wikiprojects": wikiprojects,
                        "class": cls.lower(),
                        "importance": importance.lower()
                    }
                    print(dumps(record), file=wp_fout)
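A hypothetical invocation of parse_file; the file names, directories, and the module-level FILE_TYPE/LIST_FORMAT settings it relies on are assumptions, not taken from the original example.

# Sketch only: paths are illustrative; FILE_TYPE is assumed to be '.json' here.
parse_file(input="enwiki-latest-pages-articles1.xml.bz2",
           output="enwiki-part1.json",
           wp_dir="output/wikiprojects/",
           cat_dir="output/categories/")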
Ejemplo n.º 17
0
def wikiParser(file_name):
    pageMetadata = []
    #create table
    dbCreator.create_table()
    # Construct dump file iterator
    counter = 0
    dump = Iterator.from_file(bz2.open(file_name))
    # dump = Iterator.from_file(open("/Users/alessandro/Documents/PhD/trWiki/trSample.xml"))

    # Iterate through pages
    pageAll = []

    for page in dump:
        if counter == 2500:
            conn = dbCreator.get_db_params()
            cur = conn.cursor()
            try:
                cur.executemany(
                    """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name) VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
                    pageAll)
                conn.commit()
                # print('imported')
            except:
                conn.rollback()
                for stat in pageAll:
                    try:
                        cur.execute(
                            """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name) VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
                            stat)
                        conn.commit()
                    except:
                        conn.rollback()
                        e = sys.exc_info()[0]
                        print("<p>Error: %s</p>" % e)
                        print('not imported, revision id error')
                        print(stat)
            pageAll = []
            counter = 0

        counter += 1
        checksum_revisions = []
        revertsList = []
        pageTitle = page.title.lower().replace(' ', '_')
        pageNS = page.namespace
        # state = persistence.State()

        # Iterate through a page's revisions
        for revision in page:
            revData = {}
            # print(revision.id, revision.contributor, revision.timestamp)
            revData['page'] = pageTitle
            revData['namespace'] = pageNS
            revData['bytes'] = revision.bytes
            revData['revId'] = revision.id
            revData['parentId'] = revision.parent_id
            revData['time_stamp'] = revision.timestamp.long_format().replace(
                'T', ' ').replace('Z', ' ')
            if revision.contributor.id == None:
                revData['userId'] = 'ip'
            else:
                revData['userId'] = revision.contributor.id
            revData['userName'] = revision.contributor.user_text
            revData['revert'] = False
            revData['reverted'] = False

            pageMetadata.append(revData)
            checksum_revisions.append((revision.text, {"rev_id": revision.id}))
            # state.process(revision.text, revision=revision.id)

        # print(state.last)
        revertsList.append(list(reverts.detect(checksum_revisions)))
        # print(revertsList)
        for revvos in revertsList:
            for revvo in revvos:
                for revis in pageMetadata:
                    try:
                        if revis['revId'] == revvo.reverting['rev_id']:
                            revis['revert'] = True
                    except:
                        print(revvo)
                    for reverted in revvo.reverteds:
                        if revis['revId'] == reverted['rev_id']:
                            revis['reverted'] = True

        pageAll += pageMetadata
        pageMetadata = []

    conn = dbCreator.get_db_params()
    cur = conn.cursor()
    try:
        cur.executemany(
            """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name) VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
            pageAll)
        conn.commit()
        # print('imported')
    except:
        conn.rollback()
        for stat in pageAll:
            try:
                cur.execute(
                    """INSERT INTO revision_metadata (bytes, namespace, page, par_Id, rev_Id, revert, reverted, time_Stamp, user_Id, user_Name) VALUES (%(bytes)s, %(namespace)s, %(page)s, %(parentId)s, %(revId)s, %(revert)s, %(reverted)s, %(time_stamp)s, %(userId)s, %(userName)s);""",
                    stat)
                conn.commit()
            except:
                conn.rollback()
                e = sys.exc_info()[0]
                print("<p>Error: %s</p>" % e)
                print('not imported, revision id error')
                print(stat)
"""
Prints out all rev_ids that appear in dump.xml.
"""
from mw.xml_dump import Iterator

# Construct dump file iterator
dump = Iterator.from_file(open("examples/dump.xml"))

# Iterate through pages
for page in dump:

    # Iterate through a page's revisions
    for revision in page:
        print(revision.id)
Ejemplo n.º 19
0
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)

            if (revision.sha1 in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.comment!= None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(revision.text) < CURR_LENGTH) and (((len(revision.text)-revision_prev.length)/float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current relation.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + str(revision.id)
                    revision_curr.contributor_name = 'Not Available ' + str(revision.id)
                    relation.author = 'Not Available ' + str(revision.id)

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)


                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id : revision_curr})
                    relations.update({revision_curr.wikipedia_id : relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i+1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total

                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)
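A minimal sketch of how analyseArticle might be driven, assuming the module-level state it depends on (revisions, revision_order, spam, and the determineAuthorship and open_file helpers) has already been set up; the dump path is illustrative only.

# Hypothetical driver call for the function above.
all_revisions, order, all_relations = analyseArticle("article-history.xml.bz2")
print(len(all_revisions), "revisions kept after vandalism filtering")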
Ejemplo n.º 20
0
import csv
import argparse
import re
from mw.xml_dump import Iterator

parser = argparse.ArgumentParser()
parser.add_argument("-i", help="input file location")
parser.add_argument("-o", help="output file location")
args = parser.parse_args()
in_fn = args.i
out_fn = args.o

# Construct dump file iterator
dump = Iterator.from_file(open(in_fn))

with open(out_fn, 'w') as o:
    output = csv.writer(o, delimiter = '\t', quotechar='|')
    output.writerow(['pageID','page_title','namespace','editor_id','username','timestamp','comment'])
    # Iterate through pages
    for page in dump:
        # Iterate through a page's revisions
        for rev in page:
            output.writerow([page.id, page.title, page.namespace, rev.contributor.id,
                             rev.contributor.user_text, rev.timestamp, rev.comment])
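The script above is driven entirely by its -i/-o flags; a hypothetical invocation (the script filename is an assumption) would be: python extract_revision_metadata.py -i examples/dump.xml -o revision_metadata.tsv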
Ejemplo n.º 21
0
          "type": "date",
          "format": "yyyy-MM-dd HH:mm:ss||yyyy-MM-dd HH:mm:ssZZZ||epoch_millis"
        }
      }
    }
  }
}'''
if not es.indices.exists(index):
    es.indices.create(index=index, body=mapping, ignore=400)
else:
    # query source = wiki,delete by query
    queryDoc = {'query': {'match': {'source': 'wiki'}}}
    deleteDoc = es.delete_by_query(index='cms', body=queryDoc)
    print("delete document!!!")

dumpfile = Iterator.from_file(open("/tmp/wiki_dump.xml"))
for page in dumpfile:
    # Iterate through a page's revisions
    for revision in page:
        wikiBody = {
            "id":
            "revision.id",
            "url":
            "https://wiki.xxx.cn/" + page.title,
            "source":
            "wiki",
            "createUser":
            revision.contributor.user_text,
            "updateUser":
            revision.contributor.user_text,
            "content":
Ejemplo n.º 22
0
def parse_wikipedia_controversial_articles(xml_stub_history):
    primary_ip_dir = "C:\WikiProject\\"
    internal_ip_dir = "Controversial Single Pages Wikipedia\\"
    controversial_page_titles_fileName = "controversial_page_titles.txt"
    controversial_page_ids_fileName = "controversial_wikipedia_page_ids.txt"

    # Create page_ids file corresponding to page_titles
    inputFile = open(
        primary_ip_dir + internal_ip_dir + controversial_page_titles_fileName,
        'r')
    outputFile = open(
        primary_ip_dir + internal_ip_dir + controversial_page_ids_fileName,
        'w')

    for title in inputFile:
        outputFile.write(title.strip() + " " +
                         find_page_id_given_page_title(title))
        outputFile.write("\n")

    inputFile.close()
    outputFile.close()

    # Parse xml_dump to retrieve the reverts of controversial pages
    inputFile = open(
        primary_ip_dir + internal_ip_dir + controversial_page_ids_fileName,
        'r')

    titles = []
    pageID = []
    for line in inputFile:
        parts = line.split()
        titles.append(parts[0].strip())
        pageID.append(int(parts[1]))

    inputFile.close()

    inputFile = open(xml_stub_history, 'r', encoding='utf-8')
    article_count = 0
    dump_iter = Iterator.from_file(inputFile)
    for page_iter in dump_iter:
        print(page_iter.id)
        if page_iter.id in pageID:
            article_count += 1
            try:
                item_idx = pageID.index(page_iter.id)
            except ValueError:
                continue
            outputFile = open(
                primary_ip_dir + internal_ip_dir +
                "Anonymous Inclusion With IP Address\Revision Logs\\" +
                titles[item_idx] + ".log",
                mode='w',
                encoding='utf-8')
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,
                        page_iter.redirect, page_iter.restrictions,
                        page_iter.__iter__())
            rev_iter_idx = 0
            detector = reverts.Detector()
            # edit_list contains tuples <revision_id, user_id> to track previous revisions. For anonymous, saved user_id in form <IP address>
            edit_list = []
            for rev_iter in page:
                rev_iter_idx = rev_iter_idx + 1
                if rev_iter.contributor.id != None:
                    edit_list.append([rev_iter.id, rev_iter.contributor.id])
                else:
                    edit_list.append(
                        [rev_iter.id, rev_iter.contributor.user_text])
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user(edit_list, revert_info.reverting)
                    revertedTo = find_user(edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user(edit_list,
                                             revert_info.reverteds[i])
                        outputFile.write(
                            str(reverter) + "," + str(revertedTo) + "," +
                            str(reverted))
                        outputFile.write("\n")

            outputFile.close()
        # All articles found
        if (article_count == len(pageID)):
            break
    inputFile.close()
def parse_all_wiki_articles_for_rev_history(xml_stub_history, xml_file_no):

    #primary_op_dir = "C:\WikiProject\\"
    #internal_op_dir = "Controversial Single Pages Wikipedia\Wikidumps\Revisions\All Revisions\\"
    primary_op_dir = "/N/u/mmaity/Karst/"
    internal_op_dir = "WikiAnalysis/Wikidumps/Output_Logs/"
    dump_iter = Iterator.from_file(open(xml_stub_history,encoding='latin-1'))
    tot_file_ct = 0
    
    start = time.time()
    
    
    output_file_reverts = open(primary_op_dir+internal_op_dir+"reverts_"+str(xml_file_no)+".log", mode='w', encoding='utf-8')
    output_file_edits = open(primary_op_dir+internal_op_dir+"edits_"+str(xml_file_no)+".log", mode='w', encoding='utf-8')
    page_ids = create_set_of_page_ids("page_ids")
    
    for page_iter in dump_iter:
        if page_iter.id in page_ids:
            print(page_iter.id)
            page = Page(page_iter.id, page_iter.title, page_iter.namespace,  page_iter.redirect, page_iter.restrictions, page_iter.__iter__())
            detector = reverts.Detector()
        
            output_file_reverts.write("#"+str(page_iter.id)+"\n")
            edit_list = []
            for rev_iter in page:
                if rev_iter.contributor.id != None:
                    edit_list.append([rev_iter.id, rev_iter.contributor.id])
                else:
                    edit_list.append([rev_iter.id, rev_iter.contributor.user_text])
            
            
                # Detect reverts and save info in reverts_ file
                revert_info = detector.process(rev_iter.sha1, rev_iter.id)
                if revert_info != None:
                    reverter = find_user(edit_list, revert_info.reverting)
                    revertedTo = find_user(edit_list, revert_info.reverted_to)
                    for i in range(len(revert_info.reverteds)):
                        reverted = find_user(edit_list, revert_info.reverteds[i])
                        output_file_reverts.write(str(reverter)+","+str(revertedTo)+","+str(reverted)+"\n")
        
            # <page_id, user_id, num_of_revs>
            user_list = {}
            for edit_list_iter in edit_list:
                contributor = edit_list_iter[1]
                if contributor in user_list.keys():
                    user_list[contributor] += 1
                else:
                    user_list[contributor] = 1
                
                   
            for item in user_list.items():
                output_file_edits.write(str(page_iter.id)+","+str(item[0])+","+str(item[1])+"\n")
        else:
            print(page_iter.id, "Not Found")
                        
 
    output_file_reverts.close()
    output_file_edits.close()
    end = time.time()
    
    print(xml_stub_history)
    print("Total File Count:", tot_file_ct) 
    print("Elapsed Time:", (end-start))   
)  #geoip2.database.Reader(os.environ['GEO2_DIRECTORY'], maxminddb.MODE_MMAP_EXT)
flagged = FlaggedTools.load('/home/alexander/flagged.pkl')
users = DepRepo.flags()  #UserFlagsTools.load(os.environ['USER_FLAGS'])
#################

d1 = dt.datetime.now()
pp = PageProcessor(flagged, users, db, geoip)
#pp.clear()

cnt = 0
totalcnt = 0
rcnt = 0
#pr = cProfile.Profile()
#pr.enable()

dump = Iterator.from_file(open_file(file_name))

for page in dump:
    totalcnt += 1
    if totalcnt % 50 == 0:
        print(
            str(rcnt) + "/" + str(cnt) + "/" + str(totalcnt) + " pushed: " +
            str(pp.items_pushed))
        gc.collect()

    excl = page.namespace != 0 and page.namespace != 10
    if not excl:
        cnt += 1
    # check page namespace
    rcnt += pp.process(page, excl)
import sys,os;sys.path.insert(0, os.path.abspath(os.getcwd()))

from mw.xml_dump import Iterator

# Construct dump file iterator
dump = Iterator.from_file(open("examples/dump.xml"))

# Iterate through pages
for page in dump:

    # Iterate through a page's revisions
    for revision in page:

        print(revision.id)
Ejemplo n.º 26
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--dump_file',
        help=
        "gzipped XML dump file -- e.g., enwiki-20190301-stub-meta-history.xml.gz"
    )
    parser.add_argument('--outdir',
                        help="directory to write monthly editor files")
    parser.add_argument('--botlist_fn',
                        help="Text file containing bot accounts")
    parser.add_argument('--mys',
                        nargs="+",
                        help="List of months to track of form '2016-11'")
    parser.add_argument(
        '--startdate',
        default='2001-08',
        help="If not mys, starting month to track of form '2016-11'")
    parser.add_argument(
        '--enddate',
        default='2019-03',
        help="If not mys, ending month to track of form '2016-11'")
    parser.add_argument(
        '--stopafter',
        type=int,
        default=-1,
        help="If greater than 0, limit to # of pages to check before stopping")
    args = parser.parse_args()

    # build list of months to track
    if not args.mys:
        args.mys = []
        sd = (int(args.startdate[:4]), int(args.startdate[5:]))
        ed = (int(args.enddate[:4]), int(args.enddate[5:]))
        while ed[0] > sd[0] or ed[1] >= sd[1]:
            args.mys.append('{0}-{1:02}'.format(sd[0], sd[1]))
            if sd[1] == 12:
                sd = (sd[0] + 1, 1)
            else:
                sd = (sd[0], sd[1] + 1)

    print(args)

    # load in bot usernames
    bots = set()
    if args.botlist_fn:
        with open(args.botlist_fn) as fin:
            csvreader = csv.reader(fin)
            for line in csvreader:
                bots.add(line[0].lower())

    # Construct dump file iterator
    dump = Iterator.from_file(gzip.open(args.dump_file))

    editors_by_month = {}
    editor_startdates = {}
    i = 0
    bots_edits_filtered = 0
    user_edits = 0
    anon_edits = 0
    print_every = 25
    # Iterate through pages
    for page in dump:

        # only count edits to article namespace
        if page.namespace != 0:
            continue
        i += 1

        # Iterate through a page's revisions
        for revision in page:
            contributor = revision.contributor
            if not contributor.id or contributor.id == 0:
                anon_edits += 1
                continue
            editor_name = contributor.user_text
            if editor_name.lower() in bots:
                bots_edits_filtered += 1
                continue
            month_year = revision.timestamp.strftime("%Y-%m")
            editor_startdates[editor_name] = min(
                month_year, editor_startdates.get(editor_name, "2100-01"))
            if args.mys and month_year not in args.mys:
                continue
            user_edits += 1
            if month_year not in editors_by_month:
                editors_by_month[month_year] = {}
            editors_by_month[month_year][
                editor_name] = editors_by_month[month_year].get(
                    editor_name, 0) + 1

        if i == args.stopafter:
            break

        if i % print_every == 0:
            print(
                '{0} completed. On: {1}. {2} bot edits, {3} anon edits, {4} user edits.'
                .format(i, page.title, bots_edits_filtered, anon_edits,
                        user_edits))
            print_every *= 2

    print(
        '{0} completed. On: {1}. {2} bot edits, {3} anon edits, {4} user edits.'
        .format(i, page.title, bots_edits_filtered, anon_edits, user_edits))

    for my in editors_by_month:
        fn = os.path.join(args.outdir, my)
        with open(fn, 'w') as fout:
            csvwriter = csv.writer(fout)
            csvwriter.writerow(['editor_name', 'edit_count', 'first_edit_dt'])
            d = editors_by_month[my]
            by_editcount = [(k, d[k], editor_startdates[k])
                            for k in sorted(d, key=d.get, reverse=True)]
            for editor_count in by_editcount:
                csvwriter.writerow(editor_count)
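A hypothetical command line for the script above (the filename is an assumption; the flags are the ones its argparse parser defines): python monthly_editors.py --dump_file enwiki-20190301-stub-meta-history.xml.gz --outdir monthly_counts/ --botlist_fn bots.txt --startdate 2001-08 --enddate 2019-03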
Ejemplo n.º 27
0
 def run(self):
     
     def _process_dump(dump, path):
         try:
             for page in dump:
                 logger.debug("Constructing new processor for {0}:{1}"\
                              .format(page.namespace, page.title))
                 
                 processor_status = self.store.processor_status.get(page.id,
                                           type=self.engine.Processor.Status)
                 
                 if processor_status is None:
                     processor_status = self.engine.Processor.Status(page.id)
                 
                 processor = self.engine.processor(processor_status)
                 
                 for rev in page:
                     if rev.id <= processor_status.last_rev_id:
                         
                         logger.debug(
                                 "Skipping revision (already processed) " +\
                                 "{0}:{1}".format(rev.id, rev.timestamp))
                         continue
                     try:
                         user = User(rev.contributor.id,
                                     rev.contributor.user_text)
                         delta = processor.process(rev.id, rev.timestamp,
                                                   rev.text)
                         revision = Revision(rev.id, rev.timestamp, page.id,
                                             user, delta)
                         yield (revision, None)
                     except RevisionOrderError as e:
                         logger.error(traceback.format_exc())
                         logger.info("Skipping revision (out of order) " + \
                                     "{0}:{1}".format(rev.id, rev.timestamp))
                 
                 logger.debug("Finished processing page {0}:{1}"\
                              .format(page.namespace, page.title))
                 
                 yield (processor.status, page.title)
             
             logger.debug("Finished processing dump at {0}".format(path))
             yield (path, None)
         
         
         except Exception as e:
             logger.error(traceback.format_exc())
             raise
     
     engine_status = self.store.engine_status.get(type=self.engine.Status)
     if engine_status is None:
         logger.info("Starting {0} from scratch.".format(self.engine.info()))
         engine_status = self.engine.Status(self.engine.info())
     
     max_rev_id = 0
     max_timestamp = Timestamp(0)
     
     if len(self.paths) == 1:
         dump = Iterator.from_file(open_file(self.paths[0]))
         rev_proc_or_paths = _process_dump(dump, self.paths[0])
     else:
         rev_proc_or_paths = map(self.paths, _process_dump,
                                 **self.map_kwargs)
     
     try:
         for rev_proc_or_path, meta in rev_proc_or_paths:
             
             if isinstance(rev_proc_or_path, Revision):
                 revision = rev_proc_or_path
                 
                 self.store.revisions.store(revision)
                 self.status.stats['revisions_processed'] += 1
                 
                 max_rev_id = max(revision.rev_id, max_rev_id)
                 max_timestamp = max(revision.timestamp, max_timestamp)
                 
             elif isinstance(rev_proc_or_path, ProcessorStatus):
                 processor_status = rev_proc_or_path
                 page_title = meta
                     
                 logger.debug("Completed processing page " + \
                              "{0}. {1}".format(
                                      page_title,
                                      processor_status.stats))
                 
                 self.store.processor_status.store(processor_status)
                 
                 
             elif isinstance(rev_proc_or_path, str):
                 path = rev_proc_or_path
                 
                 logger.info("Completed processing dump {0}".format(path))
                 
             else:
                 raise RuntimeError(
                         "Did not expect a " + \
                         "{0}".format(type(rev_proc_or_path)))
             
             
         
         self.status.update(max_rev_id, max_timestamp)
         
         self.store.engine_status.store(engine_status)
     
     except Exception as e:
         logger.error(traceback.format_exc())
         raise
Ejemplo n.º 28
0
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file.
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if (revision.sha1 == None):
                revision.sha1 = Text.calculateHash(revision.text)

            if (revision.sha1 in spam):
                vandalism = True

            #TODO: SPAM detection: DELETION
            if (revision.comment!= None and revision.comment.find(FLAG) > 0):
                pass
            else:
                if (revision_prev.length > PREVIOUS_LENGTH) and (len(revision.text) < CURR_LENGTH) and (((len(revision.text)-revision_prev.length)/float(revision_prev.length)) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if (not vandalism):
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current relation.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have contributor.
                if (revision.contributor != None):
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + str(revision.id)
                    revision_curr.contributor_name = 'Not Available ' + str(revision.id)
                    relation.author = 'Not Available ' + str(revision.id)

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)


                if (not vandalism):
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id : revision_curr})
                    relations.update({revision_curr.wikipedia_id : relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i = i+1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total

                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)