Example no. 1
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    features = import_from_path(args['<features>'])

    if args['--language'] is None:
        language = None
    else:
        language = import_from_path(args['--language'])

    session = api.Session(args['--api'],
                          user_agent="Revscoring feature extractor utility")
    extractor = APIExtractor(session, language=language)

    if args['--rev-labels'] == "<stdin>":
        rev_labels = read_rev_labels(sys.stdin)
    else:
        rev_labels = read_rev_labels(open(args['--rev-labels']))

    if args['--value-labels'] == "<stdout>":
        value_labels = sys.stdout
    else:
        value_labels = open(args['--value-labels'], 'w')

    verbose = args['--verbose']

    run(rev_labels, value_labels, features, extractor, verbose)
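The read_rev_labels helper used above is not shown in this excerpt. A minimal sketch, assuming each input line carries a tab-separated rev_id and label (the exact format is an assumption):

def read_rev_labels(f):
    # Illustrative sketch only: yield (rev_id, label) pairs from a
    # tab-separated file-like object; the real helper may differ.
    for line in f:
        line = line.strip()
        if not line:
            continue
        rev_id, label = line.split("\t", 1)
        yield int(rev_id), label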
def bot_gen(rev_pages, language, api_url):

    session = api.Session(api_url)
    extractor = APIExtractor(session, language=language)

    for rev_id, page_id in rev_pages:
        sys.stderr.write(".")
        sys.stderr.flush()
        try:

            # Detect reverted status
            revert = reverts.api.check(session, rev_id, page_id, radius=3)
            reverted = revert is not None
            added_words = list(extractor.extract(rev_id,
                                                 [diff.added_words]))[0]
            yield Edit(rev_id, added_words, reverted)

        except KeyboardInterrupt:
            sys.stderr.write("\n^C Caught.  Exiting...")
            break

        except:
            sys.stderr.write(traceback.format_exc())
            sys.stderr.write("\n")

    sys.stderr.write("\n")
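A hedged usage sketch for bot_gen above; the revision/page id pairs are placeholders, and passing None for language mirrors the optional language handling in the first example:

# Illustrative only: collect Edit tuples for a handful of revisions.
rev_pages = [(638029546, 15745), (634578112, 15745)]  # hypothetical ids
edits = list(bot_gen(rev_pages, None, "https://en.wikipedia.org/w/api.php"))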
Example no. 3
    def from_config(cls, config, name, section_key="extractors"):
        section = config[section_key][name]
        session = api.Session(section['url'], user_agent=section['user_agent'])

        language = Language.from_config(config, section['language'])

        return cls(session, language)
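from_config only reads url, user_agent, and language from its section. A plausible configuration, sketched as a plain dict, under the assumption that this classmethod lives on APIExtractor as in the other examples:

# Illustrative sketch; the layout beyond the 'extractors' section is assumed.
config = {
    'extractors': {
        'enwiki_api': {
            'url': "https://en.wikipedia.org/w/api.php",
            'user_agent': "Example revscoring extractor",
            'language': 'english',  # name handed on to Language.from_config
        }
    },
    # ... plus whatever section Language.from_config expects for 'english'
}
extractor = APIExtractor.from_config(config, 'enwiki_api')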
Example no. 4
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    diff_docs = read_docs(sys.stdin)

    session = api.Session(args['--api'])

    config_doc = yamlconf.load(open(args['--config']))
    diff_engine = DiffEngine.from_config(config_doc, config_doc["diff_engine"])

    run(diff_docs, session, diff_engine)
Example no. 5
def main(argv=None):
    args = docopt.docopt(__doc__, argv=argv)

    model = MLScorerModel.load(open(args['<model-file>'], 'rb'))

    extractor = APIExtractor(api.Session(args['--api']),
                             language=model.language)

    rev_ids = [int(rev_id) for rev_id in args['<rev_id>']]

    verbose = args['--verbose']

    run(model, extractor, rev_ids, verbose)
def update_revs():
    api_session = api.Session("https://en.wikipedia.org/w/api.php")

    rv_props = {
        'revid': 'ids',
        'timestamp': 'timestamp',
        'user': 'user',
        'userid': 'userid',
        'size': 'size',
        'sha1': 'sha1',
        'contentmodel': 'contentmodel',
        'tags': 'tags',
        'flags': 'flags',
        'comment': 'comment',
        'content': 'content'
    }

    revs = api_session.revisions.query(properties=rv_props.values(),
                                       titles={'climate change'},
                                       direction="newer")

    all_revs = list(revs)
    pickle.dump(all_revs, open(revcache_path, 'wb'))
    return all_revs
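A companion sketch for reading the cache back; revcache_path is assumed to be a module-level constant defined alongside update_revs:

def load_revs():
    # Illustrative sketch: reuse the pickled cache if present, otherwise
    # fall back to update_revs() to fetch and cache the revisions again.
    import os
    import pickle
    if os.path.exists(revcache_path):
        with open(revcache_path, 'rb') as f:
            return pickle.load(f)
    return update_revs()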
Example no. 7
t_qmark = r'\?'
t_epoint = r'!'
t_comma = r','
t_colon = r':'
t_scolon = r';'
t_break = r'(\n|\n\r|\r\n)\s*(\n|\n\r|\r\n)+'
t_whitespace = r'[\n\r\s]+'
t_etc = r"."


def t_error(t):
    print("Illegal character '%s'" % t.value[0])
    t.lexer.skip(1)


lexer = lex()

session = api.Session("https://en.wikipedia.org/w/api.php")
common1 = session.revisions.get(638029546, properties={"content"})['*']

start = time.time()
for i in range(50):
    lexer.input(common1)
    while True:
        token = lexer.token()
        #print(token)
        if token is None:
            break

print("Tokenizing (text_split):", (time.time() - start) / 50)
Example no. 8
def main():
    args = parse_args()

    output_path = args.output_folder
    article_filename = args.article_file

    #handle -W
    if args.logging_destination:
        logging.basicConfig(filename=args.logging_destination,
                            filemode='a',
                            level=args.logging_level)
    else:
        logging.basicConfig(level=args.logging_level)

    export_time = str(datetime.datetime.now())
    export_date = datetime.datetime.today().strftime("%Y%m%d")

    logging.info(f"Starting run at {export_time}")
    logging.info(f"Last commit: {digobs.git_hash()}")

    json_output_filename = os.path.join(
        output_path,
        f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.json")
    tsv_output_filename = os.path.join(
        output_path,
        f"digobs_covid19-wikipedia-enwiki_revisions-{export_date}.tsv")

    api_session = api.Session("https://en.wikipedia.org/w/api.php")

    # list of properties from the API we want to gather (basically all of
    # them supported by mediawiki-utilities)

    rv_props = {
        'revid': 'ids',
        'timestamp': 'timestamp',
        'user': 'user',
        'userid': 'userid',
        'size': 'size',
        'sha1': 'sha1',
        'contentmodel': 'contentmodel',
        'tags': 'tags',
        'flags': 'flags',
        'comment': 'comment',
        'content': 'content'
    }

    exclude_from_tsv = ['tags', 'comment', 'content', 'flags']

    # load the list of articles
    with open(article_filename, 'r') as infile:
        article_list = list(map(str.strip, infile))

    def get_revisions_for_page(title):
        return api_session.revisions.query(properties=rv_props.values(),
                                           titles={title},
                                           direction="newer")

    tsv_fields = ['title', 'pageid', 'namespace']
    tsv_fields = tsv_fields + list(rv_props.keys())

    # drop fields that we identified for exclusion
    tsv_fields = [e for e in tsv_fields if e not in exclude_from_tsv]

    # add special export fields
    tsv_fields = tsv_fields + [
        'anon', 'minor', 'url', 'export_timestamp', 'export_commit'
    ]

    export_info = {'git_commit': digobs.git_hash(), 'timestamp': export_time}

    with open(json_output_filename, 'w') as json_output, \
         open(tsv_output_filename, 'w') as tsv_output:

        tsv_writer = DictWriter(tsv_output,
                                fieldnames=tsv_fields,
                                delimiter="\t")
        tsv_writer.writeheader()

        for article in article_list:
            logging.info(f"pulling revisions for: {article}")

            # try to grab the revisions up to 10 times, sleeping for one minute between tries
            tries = 0
            while True:
                try:
                    rev_rows = []
                    for rev in get_revisions_for_page(article):
                        logging.debug(f"processing raw revision: {rev}")

                        # add export metadata
                        rev['exported'] = export_info

                        # save the json version of the revision
                        print(json.dumps(rev), file=json_output)

                        # handle missing data
                        if "sha1" not in rev:
                            rev["sha1"] = ""

                        if "userhidden" in rev:
                            rev["user"] = ""
                            rev["userid"] = ""

                        # recode anon so it's true or false instead of present/missing
                        if "anon" in rev:
                            rev["anon"] = True
                        else:
                            rev["anon"] = False

                        # let's recode "minor" in the same way
                        if "minor" in rev:
                            rev["minor"] = True
                        else:
                            rev["minor"] = False

                        # add page title information
                        rev['title'] = rev['page']['title']
                        rev['pageid'] = rev['page']['pageid']
                        rev['namespace'] = rev['page']['ns']

                        # construct a URL
                        rev['url'] = Request(
                            'GET',
                            'https://en.wikipedia.org/w/index.php',
                            params={
                                'title': rev['title'].replace(" ", "_"),
                                'oldid': rev['revid']
                            }).prepare().url

                        rev['export_timestamp'] = export_time
                        rev['export_commit'] = digobs.git_hash(short=True)

                        rev_rows.append(rev)
                    logging.debug(
                        f"successfully received revisions for: {article}")
                    break
                except:
                    if tries > 10:
                        logging.critical(
                            f"giving up after 10 tries to get {article}")
                        raise
                    else:
                        logging.warning(f"socket.timeout from {article}")
                        logging.warning(f"sleeping 60 seconds before retrying")
                        tries = tries + 1
                        time.sleep(60)
                        continue

            # print out each of the revisions once we know we have it all
            for rev in rev_rows:
                tsv_writer.writerow({k: rev[k] for k in tsv_fields})
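parse_args is not included in this excerpt. A minimal sketch reconstructed from the attributes the script reads; apart from -W (mentioned in the comment near the top of main), the flag names and defaults are assumptions:

import argparse
import logging

def parse_args():
    # Illustrative sketch of the command-line interface main() expects.
    parser = argparse.ArgumentParser(
        description="Export revision histories for a list of articles.")
    parser.add_argument("-o", "--output_folder", default=".",
                        help="directory for the JSON and TSV exports")
    parser.add_argument("-i", "--article_file", default="articles.txt",
                        help="file with one article title per line")
    parser.add_argument("-L", "--logging_level", default=logging.INFO,
                        type=lambda name: getattr(logging, name.upper()),
                        help="DEBUG, INFO, WARNING, ERROR, or CRITICAL")
    parser.add_argument("-W", "--logging_destination",
                        help="optional log file (defaults to stderr)")
    return parser.parse_args()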
import time

from mw import api

from mwcites.extractors import doi

session = api.Session("https://en.wikipedia.org/w/api.php",
                      user_agent="Demo doi extractor")

revisions = session.revisions.query(titles={"Psychotherapy"},
                                    properties={'content'})
lots = next(revisions)['*']
print("Text with lots of DOIs has {0} characters".format(len(lots)))

revisions = session.revisions.query(titles={"Waffle"}, properties={'content'})
few = next(revisions)['*']
print("Text with few DOIs has {0} characters".format(len(few)))

start = time.time()
for i in range(50):
    ids = set(doi.extract(lots))
    ids = set(doi.extract(few))
print("Regex strategy: {0}".format(time.time() - start))

start = time.time()
for i in range(50):
    ids = set(doi.extract_mwp(lots))
    ids = set(doi.extract_mwp(few))
print("MWP strategy: {0}".format(time.time() - start))

start = time.time()
Example no. 10
def main():
    # This is used for Aaron Halfaker's API wrapper...
    loginfile = configparser.ConfigParser()
    loginfile.read([os.path.expanduser('~/.wiki.ini')])
    username = loginfile.get('wiki', 'username')
    password = loginfile.get('wiki', 'password')

    # ...And this is for Pywikibot
    bot = pywikibot.Site('en', 'wikipedia')

    wptools = WikiProjectTools()

    now = datetime.datetime.utcnow()
    now = now.strftime(
        '%Y%m%d%H%M%S')  # converts timestamp to MediaWiki format

    # Pulling timestamp of the last time the script was run
    query = wptools.query(
        'index',
        'select lu_timestamp from lastupdated where lu_key = "new_discussions";',
        None)
    lastupdated = query[0][0]

    # Polling for newest talk page posts in the last thirty minutes
    query = wptools.query(
        'wiki',
        'select distinct recentchanges.rc_this_oldid, page.page_id, recentchanges.rc_title, recentchanges.rc_comment, recentchanges.rc_timestamp, page.page_namespace from recentchanges join page on recentchanges.rc_namespace = page.page_namespace and recentchanges.rc_title = page.page_title join categorylinks on page.page_id=categorylinks.cl_from where rc_timestamp >= {0} and rc_timestamp < {1} and rc_comment like "% new section" and rc_deleted = 0 and cl_to like "%_articles" and page_namespace not in (0, 2, 6, 8, 10, 12, 14, 100, 108, 118) order by rc_timestamp asc;'
        .format(lastupdated, now), None)

    # Cleaning up output
    namespace = {
        1: 'Talk:',
        3: 'User_talk:',
        4: 'Wikipedia:',
        5: 'Wikipedia_talk:',
        7: 'File_talk:',
        9: 'MediaWiki_talk:',
        11: 'Template_talk:',
        13: 'Help_talk:',
        15: 'Category_talk:',
        101: 'Portal_talk:',
        109: 'Book_talk:',
        119: 'Draft_talk:',
        447: 'Education_Program_talk:',
        711: 'TimedText_talk:',
        829: 'Module_talk:',
        2600: 'Topic:'
    }

    output = []
    for row in query:
        rc_id = row[0]
        page_id = row[1]
        rc_title = row[2].decode('utf-8')
        rc_comment = row[3].decode('utf-8')
        rc_comment = rc_comment[
            3:]  # Truncate beginning part of the edit summary
        rc_comment = rc_comment[:-15]  # Truncate end of the edit summary
        rc_timestamp = row[4].decode('utf-8')
        rc_timestamp = datetime.datetime.strptime(rc_timestamp, '%Y%m%d%H%M%S')
        rc_timestamp = rc_timestamp.strftime('%H:%M, %d %B %Y (UTC)')
        page_namespace = row[5]
        page_namespace = namespace[page_namespace]

        session = api.Session("https://en.wikipedia.org/w/api.php",
                              user_agent='WPX Revert Checker')
        session.login(username, password)

        # Check if revision has been reverted
        reverted = reverts.api.check(session, rc_id, page_id, 3, None, 172800,
                                     None)
        if reverted is None:
            entry = {
                'title': (page_namespace + rc_title),
                'section': rc_comment,
                'timestamp': rc_timestamp
            }
            output.append(entry)

    # Loading list of WikiProjects signed up to get lists of new discussions
    config = json.loads(
        wptools.query('index', 'select json from config;', None)[0][0])

    if config['defaults'][
            'new_discussions'] == False:  # i.e. if New Discussions is an opt-in system
        whitelist = []  # Whitelisted WikiProjects for new discussion lists
        for project in config['projects']:
            try:
                project['new_discussions']
            except KeyError:
                continue
            else:
                if project['new_discussions'] == True:
                    whitelist.append(project['name'])
    else:
        whitelist = None

    # A whitelist of [] is one where there is a whitelist, but it's just empty.
    # A whitelist of None is for situations where the need for a whitelist has been obviated.

    # Generating list of WikiProjects for each thread
    for thread in output:
        query = wptools.query(
            'index',
            'select distinct pi_project from projectindex where pi_page = %s;',
            (thread['title']))
        thread['wikiprojects'] = []
        for row in query:
            wikiproject = row[0].replace('_', ' ')
            if (whitelist is None) or (wikiproject in whitelist):
                thread['wikiprojects'].append(wikiproject)
        for wikiproject in thread['wikiprojects']:
            saveto = wikiproject + '/Discussions'
            page = pywikibot.Page(bot, saveto)
            intro_garbage = '{{WPX header|Discussions|color={{{1|#37f}}}}}\n'
            intro_garbage += '{{{{WPX action box|color={{{{{{2|#086}}}}}}|title=Have a question?|content={{{{Clickable button 2|url=//en.wikipedia.org/wiki/Wikipedia_talk:{0}?action=edit&section=new|Ask the WikiProject|class=mw-ui-progressive mw-ui-block}}}}\n\n{{{{Clickable button 2|Wikipedia talk:{0}|View Other Discussions|class=mw-ui-block}}}}}}}}\n'.format(
                wikiproject[10:].replace(' ', '_'))
            intro_garbage += '{{{{WPX list start|intro={{{{WPX last updated|{0}}}}}}}}}\n\n'.format(
                saveto)
            draft = '<noinclude><div style="padding-bottom:1em;">{{{{Clickable button 2|{0}|Return to WikiProject|class=mw-ui-neutral}}}}</div>\n</noinclude>'.format(
                wikiproject) + intro_garbage
            submission = '{{{{WPX new discussion|color={{{{{{1|#37f}}}}}}|title={0}|section={1}|timestamp={2}}}}}\n'.format(
                thread['title'].replace('_', ' '), thread['section'],
                thread['timestamp'])

            notification = "* '''[[{0}#{1}|{1}]] on {0}".format(
                thread['title'].replace('_', ' '), thread['section'])
            queue_notification(wikiproject[10:].replace(' ', '_'),
                               notification)

            index = mwparserfromhell.parse(page.text)
            index = index.filter_templates()
            templatelist = []
            for i in index:
                if i.name == "WPX new discussion":
                    templatelist.append(str(i))
            templatelist = templatelist[:14]  # Sayonara, old threads!
            page.text = draft + submission
            if len(templatelist) > 3:
                templatelist[
                    2] += "<noinclude>"  # Anything after the third item will not be transcluded
                templatelist[len(templatelist) - 1] += "</noinclude>"
            for i in templatelist:
                page.text += i + "\n"
            page.text += "{{{{WPX list end|more={0}}}}}".format(
                saveto.replace(' ', '_'))
            page.save('New discussion on [[{0}]]'.format(
                thread['title'].replace('_', ' ')),
                      minor=False)

    # Update the Last Updated field with new timestamp
    wptools.query(
        'index',
        'update lastupdated set lu_timestamp = {0} where lu_key = "new_discussions";'
        .format(now), None)
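queue_notification is called above but not defined in this excerpt. A hypothetical stand-in that simply accumulates (project, message) pairs for a later delivery pass; the real helper presumably posts or stores the notification elsewhere:

notification_queue = []

def queue_notification(project, message):
    # Hypothetical stub: remember the notification so another routine can
    # deliver it to the WikiProject later.
    notification_queue.append((project, message))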
Example no. 11
from datetime import datetime  # used by parse_wikimedia_timestamp below

from mw import api  # used by api.Session below
from mw.types.timestamp import Timestamp
from itertools import chain, islice
from functools import partial, cache
import re
import pandas as pd
from dataclasses import dataclass
from multiprocessing import Pool
from wikidata.client import Client


def parse_wikimedia_timestamp(timestamp):
    return datetime.fromtimestamp(Timestamp(timestamp).serialize())


apiurl = "https://en.wikipedia.org/w/api.php"
session = api.Session(apiurl)
wdclient = Client()
title = "Sustainable energy"

wikilink_re = re.compile(
    r'''\[\[                              # Match two opening brackets
       (?P<link>                          # <link>:
           [^\n\|\]\[\#\<\>\{\}]{0,256}   # Text inside link group
                                          # everything not illegal, non-greedy
                                          # can be empty or up to 256 chars
       )
       (?:                                # Non-capturing group
          \|                              # Match a pipe
          (?P<anchor>                     # <anchor>:
               [^\[]*?                     # Text inside anchor group:
                                           # match everything not an open bracket