def generate_category_relation(input=None, output=None):
    from json import dumps

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:

        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        # only parse category page
        if page.namespace != 14:
            continue

        # only one revision on the current page
        for rev in page:
            # catch rare parsing errors
            try:
                wikicode = mwp.parse(rev.text)
            except Exception:
                print(page.id, page.title, page.namespace)
                continue

            # parse the article page to extract category info of the article
            cate = page.title.lower()[len("category:"):]
            for link in wikicode.filter_wikilinks():
                if link.title.startswith('Category:'):
                    super_cate = link.title.lower().replace('category:', "")
                    # categories.append(cate)
                    record = {
                        "cate": cate,
                        "super_cate": super_cate,
                        "cate_pid": page.id
                    }
                    print(dumps(record), file=fout)

    fout.close()
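A minimal invocation sketch for the example above; the file names are placeholders, and the module-level imports (Iterator from mw.xml_dump, mwparserfromhell as mwp, and the project's functions helper) are assumed to be in place.

# Hypothetical usage; the paths are placeholders, not part of the original example.
generate_category_relation(
    input="enwiki-latest-pages-articles.xml.bz2",   # any dump functions.open_file can read
    output="category_relations.json")               # one JSON object per line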
def parse_file(input=None, output=None):
    from json import dumps
    from time import mktime, strptime

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            print(page.title)
            # print(page.redirect_title)
            continue

        for rev in page:

            # convert the revision timestamp to a Unix epoch
            pattern = '%Y%m%d%H%M%S'
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))

            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": rev.contributor.user_text,
                "ns": page.namespace,
                "rev_comment": rev.comment,
                "rev_timestamp": epoch
            }

            print(dumps(record), file=fout)

    fout.close()
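The epoch conversion above uses time.mktime, which interprets the parsed struct_time in the local timezone. A standalone sketch of that step, with a UTC-based alternative via calendar.timegm, assuming str(rev.timestamp) yields the compact MediaWiki form:

from time import mktime, strptime
from calendar import timegm

ts = "20150102030405"          # placeholder timestamp string for illustration
pattern = '%Y%m%d%H%M%S'
local_epoch = int(mktime(strptime(ts, pattern)))   # what the example computes (local time)
utc_epoch = int(timegm(strptime(ts, pattern)))     # UTC-based variant, since dump timestamps are UTC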
Example #3
def parse_file(input=None, output=None, bot_file=None):
    from json import dumps
    from time import mktime, strptime

    #bot_list = load_bots(bot_file)

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue
        diff_content = ""

        if page.namespace in [1, 3, 5]:
            print("{},{}".format(page.title, page.namespace))
            revtext = []
            for rev in page:
                pattern = '%Y%m%d%H%M%S'
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                current_revtext = useful_text(rev.text)
                diff_content = diff_sentences(revtext, current_revtext)
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": diff_content
                }
                revtext = current_revtext
                print(dumps(record), file=fout)
        else:
            for rev in page:
                diff_content = "None"
                pattern = '%Y%m%d%H%M%S'
                epoch = int(mktime(strptime(str(rev.timestamp), pattern)))
                record = {
                    "rev_timestamp": epoch,
                    "rev_id": rev.id,
                    "rev_user_text": rev.contributor.user_text,
                    "rev_user_id": rev.contributor.id,
                    "rev_page_title": page.title,
                    "rev_page_id": page.id,
                    "ns": page.namespace,
                    "rev_diff": diff_content
                }
                print(dumps(record), file=fout)

    fout.close()
    return
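The talk-page branch above depends on useful_text() and diff_sentences(), which are not defined in this example. A rough, hypothetical sketch of what such helpers could look like (a sentence-level diff of stripped wikitext), not the original implementation:

import re
import mwparserfromhell as mwp

def useful_text(wikitext):
    # Strip markup and split the revision into sentences (hypothetical helper).
    plain = mwp.parse(wikitext or "").strip_code()
    return [s.strip() for s in re.split(r'(?<=[.!?])\s+', plain) if s.strip()]

def diff_sentences(old_sentences, new_sentences):
    # Return the sentences added relative to the previous revision (hypothetical helper).
    old = set(old_sentences)
    return " ".join(s for s in new_sentences if s not in old)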
Example #4
def parse_file(input=None, output=None):
    from json import dumps

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue

        record = {
            "page_title": page.title,
            "page_id": page.id,
            "ns": page.namespace
        }
        print(dumps(record), file=fout)

    fout.close()
Example #5
def parse_file(input=None, output=None, bot_file=None):
    from json import dumps
    from time import mktime, strptime
    from IPy import IP

    bot_list = load_bots(bot_file)

    fout = open(output, 'w')
    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # ignore old version pages that were redirected
        if page.redirect:
            continue

        for rev in page:

            user_text = rev.contributor.user_text
            try:
                # usernames that parse as IP addresses belong to anonymous editors: skip them
                IP(user_text)
                continue
            except Exception:
                # not an IP address; skip known bots
                if user_text in bot_list:
                    continue

            # convert the revision timestamp to a Unix epoch
            pattern = '%Y%m%d%H%M%S'
            epoch = int(mktime(strptime(str(rev.timestamp), pattern)))

            record = {
                "rev_id": rev.id,
                "rev_page_id": page.id,
                "rev_page_title": page.title,
                "rev_user_id": rev.contributor.id,
                "rev_user_text": rev.contributor.user_text,
                "ns": page.namespace,
                "rev_timestamp": epoch
            }

            print(dumps(record), file=fout)

    fout.close()
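load_bots() is called but not defined in these examples; a plausible sketch, assuming bot_file lists one bot username per line:

def load_bots(bot_file):
    # Hypothetical helper: read bot usernames, one per line, into a set for fast lookups.
    bots = set()
    with open(bot_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            name = line.strip()
            if name:
                bots.add(name)
    return bots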
def parse_dump(dump_filename, wanted_page_ids, found_pages_dict, users_page_edits_dict, pages_pwr_dict, logfile):
    """
    Parse the given dump, processing assessments for the given
    talk page IDs.

    @param dump_filename: path to the dump file to process
    @type dump_filename: str

    @param wanted_page_ids: dictionary where keys are talk page IDs and
                            values are per-page metadata dicts that get
                            copied into found_pages_dict
    @type wanted_page_ids: dict

    @param found_pages_dict: output dict mapping page ID to that page's
                             metadata plus per-page edit and token counts
    @type found_pages_dict: dict

    @param users_page_edits_dict: output dict of per-user edit counts,
                                  updated by process_rev()
    @type users_page_edits_dict: dict

    @param pages_pwr_dict: output dict of per-page token ownership,
                           updated by process_current_page()
    @type pages_pwr_dict: dict

    @param logfile: path to the log file (opened in append mode)
    @type logfile: str
    """

    # Construct dump file iterator
    dump = Iterator.from_file(functions.open_file(dump_filename))

    bots_file = "resources/wikipedia_bots_full.txt"
    bots = {}
    try:
        with open(bots_file, "r") as fin:
            csvreader = csv.reader(fin)
            for line in csvreader:
                bots[line[0].lower()] = True
    except Exception:
        print("Invalid bots file - only text matching with 'bot' will be used")
        with open(logfile, "a") as fout:
            fout.write("Invalid bots file - only text regex with 'bot' followed by whitespace will be used.\n")

    scripts = ["commonsdelinker", "conversion script"]

    count = 0
    # Iterate through pages
    for page in dump:
        # skip if not a page we want to process
        if page.id not in wanted_page_ids:
            continue
        try:
            with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                fout.write(str(datetime.now()) + ": " + page.title + "\n")
            print(page.title)
        except Exception:
            with open(logfile, "a") as fout:
                fout.write(str(datetime.now()) + ": next spatial article.\n")
            print("next spatial article.")

        state = persistence.State()

        count += 1
        counts_dict = {
            "total_edits": 0,
            "bot_edits": 0,
            "unverified_bot_edits": 0,
            "known_script_edits": 0,
            "anonymous_edits": 0,
            "awb_edits": 0,
            "minor_edits": 0,
            "wpcleaner_edits": 0,
        }

        # Iterate through a page's revisions
        for revision in page:
            # skip if there's no content
            if not revision.text:
                continue

            if revision.comment and "awb" in revision.comment.lower():
                pwr = state.process(revision.text, revision="awb")
            else:
                pwr = state.process(revision.text, revision=revision.contributor.user_text)

            counts_dict["total_edits"] += 1
            try:
                if revision.contributor.user_text:
                    process_rev(revision, counts_dict, bots, scripts, users_page_edits_dict, page.id)
            except Exception:
                try:
                    print(
                        "Error in revision.contributor.user_text {0} for page {1}".format(
                            revision.contributor.user_text, page.title
                        )
                    )
                    with open(logfile, "a") as fout:
                        fout.write(
                            "Error in revision.contributor.user_text {0} for page {1}\n".format(
                                revision.contributor.user_text, page.title
                            )
                        )
                except Exception:
                    print("Error in a revision.contributor.user_text for a page.")
                    with open(logfile, "a") as fout:
                        fout.write("Error in a revision.contributor.user_text for a page.")

        found_pages_dict[page.id] = wanted_page_ids[page.id]
        found_pages_dict[page.id].update(counts_dict)

        current_state = {
            "total_tokens": 0,
            "bot_tokens": 0,
            "unverified_bot_tokens": 0,
            "known_script_tokens": 0,
            "anonymous_tokens": 0,
            "awb_tokens": 0,
        }

        for tk in pwr[0]:  # loop through tokens in current state of the page
            current_state["total_tokens"] += 1
            try:
                if tk.revisions[0]:
                    process_current_page(tk.revisions[0].lower(), current_state, bots, scripts, pages_pwr_dict, page.id)
            except Exception:
                try:
                    print("Error in processing token {0} for page {1}".format(tk.text, page.id))
                    with open(logfile, "a", encoding="utf-8", errors="backslashreplace") as fout:
                        fout.write("Error in processing token {0} for page {1}.\n".format(tk.text, str(page.id)))
                except Exception:
                    print("Error in processing a token for page {0}".format(page.id))
                    with open(logfile, "a") as fout:
                        fout.write("Error in processing a token for page {0}.\n".format(page.id))
        found_pages_dict[page.id].update(current_state)

    # ok, done
    return
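A hypothetical driver for parse_dump(); the page ID, metadata, and file paths are placeholders, and the three output dicts are filled in place as the dump is processed.

wanted = {12345: {"title": "Talk:Example"}}              # placeholder talk page ID and metadata
found_pages, user_edits, page_tokens = {}, {}, {}
parse_dump("enwiki-latest-pages-meta-history.xml.bz2",   # placeholder dump path
           wanted, found_pages, user_edits, page_tokens,
           logfile="parse_dump.log")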
def parse_file(input=None, output=None, wp_dir=None, cat_dir=None):
    from json import dumps
    from re import search

    wp_output = wp_dir + output.replace(FILE_TYPE, '') + '_wikiproject' + FILE_TYPE
    cat_output = cat_dir + output.replace(FILE_TYPE, '') + '_category' + FILE_TYPE
    wp_fout = open(wp_output, 'w')
    cat_fout = open(cat_output, 'w')

    dump = Iterator.from_file(functions.open_file(input))

    for page in dump:
        # print(page.title, page.namespace)
        # ignore redirected pages for both article and talk pages
        if page.redirect:
            continue
        if page.namespace != 0 and page.namespace != 1:
            continue

        # only one revision on the current page
        for rev in page:
            # catch rare parsing errors
            try:
                wikicode = mwp.parse(rev.text)
            except Exception:
                print(page.id, page.title, page.namespace)
                continue

            # parse the article page to extract category info of the article
            if page.namespace == 0:
                categories = []
                title = page.title.lower()
                for link in wikicode.filter_wikilinks():
                    if link.title.startswith('Category:'):
                        cate = link.title.lower().replace('category:', "")
                        categories.append(cate)

                        if not LIST_FORMAT:
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "category": cate
                            }
                            print(dumps(record), file=cat_fout)

                if LIST_FORMAT:
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "categories": categories
                    }
                    print(dumps(record), file=cat_fout)

            # parse the talk page to extract wikiproject info of the article
            if page.namespace == 1:
                title = page.title.lower().replace("talk:", "")
                cls = importance = "None"
                wikiprojects = []

                for template in wikicode.filter_templates():
                    if template.name == 'WikiProjectBannerShell':
                        continue

                    if template.name.lower().startswith('wikiproject'):
                        wikiproject = template.name.lower().replace(
                            "wikiproject", "").strip()
                        wikiprojects.append(wikiproject)
                        template = str(template).replace("}", "|").replace(
                            " ", "").replace("\n", "")

                        try:
                            cls = search(r'\|class=([a-zA-Z]+)\|',
                                         template).group(1)
                            importance = search(r'\|importance=([a-zA-Z]+)\|',
                                                template).group(1)
                        except AttributeError:
                            pass

                        if not LIST_FORMAT:
                            record = {
                                "pageId": page.id,
                                "title": title,
                                "wikiproject": wikiproject,
                                "class": cls.lower(),
                                "importance": importance.lower()
                            }
                            print(dumps(record), file=wp_fout)

                if LIST_FORMAT:
                    record = {
                        "pageId": page.id,
                        "title": title,
                        "wikiprojects": wikiprojects,
                        "class": cls.lower(),
                        "importance": importance.lower()
                    }
                    print(dumps(record), file=wp_fout)

    wp_fout.close()
    cat_fout.close()
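A hypothetical invocation of the example above; FILE_TYPE and LIST_FORMAT are module-level settings the code assumes (for instance FILE_TYPE = '.json' and LIST_FORMAT = False), and the paths are placeholders.

parse_file(input="enwiki-latest-pages-articles.xml.bz2",
           output="enwiki.json",
           wp_dir="wikiprojects/",
           cat_dir="categories/")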
)  #geoip2.database.Reader(os.environ['GEO2_DIRECTORY'], maxminddb.MODE_MMAP_EXT)
flagged = FlaggedTools.load('/home/alexander/flagged.pkl')
users = DepRepo.flags()  #UserFlagsTools.load(os.environ['USER_FLAGS'])
#################

d1 = dt.datetime.now()
pp = PageProcessor(flagged, users, db, geoip)
#pp.clear()

cnt = 0
totalcnt = 0
rcnt = 0
#pr = cProfile.Profile()
#pr.enable()

dump = Iterator.from_file(open_file(file_name))

for page in dump:
    totalcnt += 1
    if totalcnt % 50 == 0:
        print(
            str(rcnt) + "/" + str(cnt) + "/" + str(totalcnt) + " pushed: " +
            str(pp.items_pushed))
        gc.collect()

    excl = page.namespace != 0 and page.namespace != 10
    if not excl:
        cnt += 1
    # check page namespace
    rcnt += pp.process(page, excl)
def analyseArticle(file_name):
    # Container of relationships.
    relations = {}

    # Revisions to compare.
    revision_curr = Revision()
    revision_prev = Revision()
    text_curr = None

    # Access the file. (revisions, revision_order and spam are module-level containers.)
    dumpIterator = mwIterator.from_file(open_file(file_name))

    # Iterate over the pages.
    for page in dumpIterator:
        i = 0

        # Iterate over revisions of the article.
        for revision in page:
            vandalism = False

            # Update the information about the previous revision.
            revision_prev = revision_curr

            if revision.sha1 is None:
                revision.sha1 = Text.calculateHash(revision.text)

            if revision.sha1 in spam:
                vandalism = True

            # TODO: SPAM detection: DELETION
            if revision.comment is not None and revision.comment.find(FLAG) > 0:
                pass
            else:
                # flag as vandalism if a long page suddenly shrinks by a large fraction
                if (revision_prev.length > PREVIOUS_LENGTH and
                        len(revision.text) < CURR_LENGTH and
                        (len(revision.text) - revision_prev.length) / float(revision_prev.length) <= CHANGE_PERCENTAGE):
                    vandalism = True
                    revision_curr = revision_prev

            if not vandalism:
                # Information about the current revision.
                revision_curr = Revision()
                revision_curr.id = i
                revision_curr.wikipedia_id = int(revision.id)
                revision_curr.length = len(revision.text)
                revision_curr.timestamp = revision.timestamp

                # Relation of the current revision.
                relation = Relation()
                relation.revision = int(revision.id)
                relation.length = len(revision.text)

                # Some revisions don't have a contributor.
                if revision.contributor is not None:
                    revision_curr.contributor_id = revision.contributor.id
                    revision_curr.contributor_name = revision.contributor.user_text
                    relation.author = revision.contributor.user_text
                else:
                    revision_curr.contributor_id = 'Not Available ' + str(revision.id)
                    revision_curr.contributor_name = 'Not Available ' + str(revision.id)
                    relation.author = 'Not Available ' + str(revision.id)

                # Content within the revision.
                text_curr = revision.text.lower()

                # Perform comparison.
                vandalism = determineAuthorship(revision_curr, revision_prev, text_curr, relation)


                if not vandalism:
                    # Add the current revision with all the information.
                    revisions.update({revision_curr.wikipedia_id: revision_curr})
                    relations.update({revision_curr.wikipedia_id: relation})
                    revision_order.append((revision_curr.wikipedia_id, False))
                    # Update the fake revision id.
                    i += 1

                    # Calculate the number of tokens in the revision.
                    total = 0
                    for p in revision_curr.ordered_paragraphs:
                        for paragraph_curr in revision_curr.paragraphs[p]:
                            for hash_sentence_curr in paragraph_curr.sentences.keys():
                                for sentence_curr in paragraph_curr.sentences[hash_sentence_curr]:
                                    total = total + len(sentence_curr.words)
                    revision_curr.total_tokens = total
                    relation.total_tokens = total

                else:
                    revision_order.append((revision_curr.wikipedia_id, True))
                    revision_curr = revision_prev
                    spam.append(revision.sha1)

    return (revisions, revision_order, relations)
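A hypothetical top-level usage of analyseArticle(); it assumes the module-level containers (revisions, revision_order, spam) and thresholds (PREVIOUS_LENGTH, CURR_LENGTH, CHANGE_PERCENTAGE, FLAG) are defined elsewhere, and the dump path is a placeholder.

revisions, revision_order, spam = {}, [], []         # module-level state used by the function
all_revs, order, rels = analyseArticle("example-article-history.xml.bz2")
print(len(order), "revisions processed,", len(spam), "flagged as spam")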