Example #1
def init_source(self):
    # Open a streaming HTTP response for the /events endpoint and wrap it
    # in an SSE client so events can be iterated as they arrive.
    response = self.with_requests(events_url + '/events')
    self.source = EventSource(response)
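with_requests() is not part of this fragment; a minimal sketch of such a helper, assuming it simply opens a streaming GET with requests (the body and headers below are assumptions, not the original implementation):

import requests

def with_requests(url):
    # Hypothetical helper: open a streaming GET whose response can be
    # wrapped by the SSE client above.
    return requests.get(url, stream=True,
                        headers={'Accept': 'text/event-stream'})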
# Resuming from the latest stored timestamp means that, even after
# any downtime, we always maintain 100% data coverage (up to the 7-31 days
# that the EventStream historical data is kept).
latest_datetime = db.get_latest_datetime()

if latest_datetime[0]:
    latest_date_formatted = latest_datetime[0].strftime('%Y-%m-%dT%H:%M:%SZ')

    url = base_stream_url + '?since={date}'.format(
        date=latest_date_formatted)
else:
    url = base_stream_url

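# An optional 'nohistorical' command-line flag skips the historical replay
# and starts from the live stream instead.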
if len(sys.argv) > 1 and sys.argv[1] == 'nohistorical':
    url = base_stream_url

for event in EventSource(url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue

        hashtag_matches = hashtag_match(change['comment'])
        if hashtag_matches and valid_edit(change):
            for hashtag in hashtag_matches:
                if db.is_duplicate(hashtag, change['id']):
                    print("Skipped duplicate {hashtag} ({id})".format(
                        hashtag=hashtag, id=change['id']))

                elif valid_hashtag(hashtag):
                    # Check edit_summary length, truncate if necessary
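hashtag_match(), valid_edit() and valid_hashtag() are defined elsewhere in the tool. Purely as an illustration (the pattern and rule below are assumptions, not the tool's actual logic), hashtag extraction might look like:

import re

HASHTAG_RE = re.compile(r'#(\w+)')

def hashtag_match(comment):
    # Hypothetical sketch: return every hashtag-like token in an edit summary.
    return HASHTAG_RE.findall(comment or '')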
import json
from sseclient import SSEClient as EventSource

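# Watch the page-links-change stream and report changes whose added links
# contain '.cn'.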
#rc_url = 'https://stream.wikimedia.org/v2/stream/recentchange'
ref_url = 'https://stream.wikimedia.org/v2/stream/page-links-change'
for event in EventSource(ref_url):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue
#        print(change.keys())
        if 'added_links' in change:
            added_ext_links = [link for link in change['added_links']
                               if '.cn' in link['link']]
            if added_ext_links:
                print('{0} ({1}): {2}'.format(change['page_title'], change['database'], added_ext_links))
import time


def wiki_stream():
	url = 'https://stream.wikimedia.org/v2/stream/revision-create'

	domain = {}
	user = {}
	start_time = time.time()
	limit = 60
	# Start with an empty scratch file for the raw event data.
	open("file1", "w").close()
	# Collect raw events for `limit` seconds, one JSON document per line.
	out_file = open("file1", "a+")
	for event in EventSource(url):
		if (time.time() - start_time) > limit:
			print("*********************************************** Last 1 Min Report **************************************************")
			print("\n")
			break
		if event.event == 'message':
			out_file.write(event.data + '\n')
	out_file.close()

	with open('file1') as f:
		content = f.readlines()
		for i in content:
			if not i.strip():
				continue
			data = json.loads(i)
			key = data['meta']['domain']
			# Count the number of updated pages per wiki domain.
			domain[key] = domain.get(key, 0) + 1

	# sorted() returns a new object; assign it back so the report is ordered.
	domain = dict(sorted(domain.items(), key=lambda item: item[1]))

	print("                                                        Domain Report")
	print("\n")
	print("Total number of Wikipedia Domains Updated:", len(domain))	
	for d in domain:
		print(d, " : ", domain[d], " Pages updated")


	print("\n")


	with open('file1') as f:
		content = f.readlines()
		for i in content:
			if not i.strip():
				continue
			data = json.loads(i)
			if data['meta']['domain'] != 'en.wikipedia.org':
				continue
			performer = data.get('performer', {})
			usr = performer.get('user_text')
			if usr is None:
				continue
			# Keep the highest edit count seen for each user.
			user[usr] = max(user.get(usr, 0), performer.get('user_edit_count', 0))

	# sorted() returns a new object; assign it back so the report is ordered.
	user = dict(sorted(user.items(), key=lambda item: item[1]))


	print("                                                        User Report")
	print("\n")
	print("Users who made changes to en.wikipedia.org")
	print("\n")

	for key, value in user.items():
		print(key, " : ", value)
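A direct call is enough to try the example; it collects events for 60 seconds and then prints both reports:

if __name__ == '__main__':
	wiki_stream()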
Example #5
    for row in batch:
        cache.add(row.meta_id)

rs.close()
print(
    f"Initial cache size is: {len(cache)}, in memory: {getsizeof(cache)/1024/1024} MBytes"
)

# get last date
max_date = session.query(func.max(Event.meta_dt)).scalar()
date_from = max_date.strftime('%Y-%m-%dT%H:%M:%SZ')
print(f"Reload from date: {date_from}")

url = f'https://stream.wikimedia.org/v2/stream/recentchange?since={date_from}'
print(f"Using SSE URL {url}")
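# retry sets the reconnect delay in milliseconds; chunk_size controls how
# much of the response body is read per iteration.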
for event in EventSource(url, retry=1000, chunk_size=81920000):
    try:
        change = json.loads(event.data)
    except ValueError as ex:
        print(ex, event.data)
    else:
        if change['meta']['id'] not in cache:
            buffer.append(
                Event(meta_id=change['meta']['id'],
                      meta_dt=change['meta']['dt'],
                      data=change))
            cache.add(change['meta']['id'])
        else:
            print(
                f"ID {change['meta']['id']} {change['meta']['dt']} already in cache"
            )
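The fragment does not show how buffer is persisted; a minimal sketch, assuming the same SQLAlchemy session and an arbitrary flush threshold (both are assumptions), that could sit at the end of the loop body:

    # Hypothetical flush step: once enough new events have accumulated,
    # write them to the database in one batch and start a fresh buffer.
    if len(buffer) >= 1000:
        session.bulk_save_objects(buffer)
        session.commit()
        buffer.clear()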
Example #6
    latest_date_formatted = latest_datetime[0].strftime('%Y-%m-%dT%H:%M:%SZ')

    url = base_stream_url + '?since={date}'.format(date=latest_date_formatted)
else:
    url = base_stream_url

if len(sys.argv) > 1 and sys.argv[1] == 'nohistorical':
    url = base_stream_url

for event in EventSource(
        url,
        # The retry argument sets the delay between retries in milliseconds.
        # We're setting this to 5 minutes.
        # There's no way to set the max_retries value with this library,
        # but since it depends upon requests, which in turn uses urllib3
        # by default, we get a default max_retries value of 3.
        retry=300000,
        # The timeout argument gets passed to requests.get.
        # An integer value sets connect (socket connect) and
        # read (time to first byte / since last byte) timeout values.
        # A tuple value sets each respective value independently.
        # https://requests.readthedocs.io/en/latest/user/advanced/#timeouts
        timeout=(3.05, 30)):
    if event.event == 'message':
        try:
            change = json.loads(event.data)
        except ValueError:
            continue

        hashtag_matches = hashtag_match(change['comment'])
        if hashtag_matches and valid_edit(change):
            for hashtag in hashtag_matches:
    def streaming(self):
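        # Stream recent changes and append edits made by the pre-classified
        # user groups (bonds, topics, rule, cf, newcomers, organizers) to
        # self.fout, using '**' as the field separator.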

        # construct file and fout
        print("Working on a new file..")

        print("user_text**wikiproject**type**title**ns**timestamp**minor",
              file=self.fout)

        for event in EventSource(self.url):

            if event.event == 'message':
                try:
                    change = json.loads(event.data)
                except ValueError:
                    pass

                else:
                    if change['wiki'] != "enwiki":
                        continue

                    if change['type'] != 'edit':
                        continue

                    # change.get() already returns str in Python 3; no need
                    # to encode to bytes here.
                    username = change.get('user')
                    title = change.get('title')
                    timestamp = change.get('timestamp')
                    ns = change.get('namespace')
                    minor = 1 if change.get('minor') else 0

                    if username in self.users_bonds:
                        type = 'bonds'
                    elif username in self.users_topics:
                        type = 'topics'
                    elif username in self.users_rule:
                        type = 'rule'
                    elif username in self.users_cf:
                        type = 'cf'
                    elif username in self.newcomers:
                        type = 'newcomer'
                    elif username in self.organizers:
                        type = 'organizer'
                    else:
                        # not recommended users...
                        continue

                    # Every recommended user should have a wikiproject entry;
                    # fall back to None if the mapping is missing.
                    wikiproject = self.user_project.get(username)

                    print("{}**{}**{}**{}**{}**{}**{}".format(
                        username, wikiproject, type, title, ns, timestamp,
                        minor),
                          file=self.fout)
                    print("{}**{}**{}**{}**{}**{}**{}".format(
                        username, wikiproject, type, title, ns, timestamp,
                        minor))

                    self.fout.flush()
    def run(self):
        '''
        Grab events from the stream until shutdown.
        '''

        ## SQL query to identify a redirect, disambiguation page, and list
        page_check_query = '''SELECT ap.page_is_redirect,
                                     IFNULL(c1.cl_from, 0) AS page_is_disambig,
                                     IFNULL(c2.cl_from, 0) AS page_is_list
                              FROM revision r
                              JOIN page ap
                              ON r.rev_page=ap.page_id
                              LEFT JOIN page tp
                              ON (ap.page_title=tp.page_title
                                  AND tp.page_namespace=1)
                              LEFT JOIN (
                                SELECT cl_from FROM categorylinks
                                WHERE cl_to='All_article_disambiguation_pages')
                                AS c1
                              ON c1.cl_from=ap.page_id
                              LEFT JOIN (
                                SELECT cl_from FROM categorylinks
                                WHERE cl_to REGEXP "^List-Class.*")
                                AS c2
                              ON c2.cl_from=tp.page_id
                              WHERE r.rev_id=%(rev_id)s
                              LIMIT 1'''

        ## SQL query to insert predictions for a given revision into
        ## our local database table, formatted so that the two groups
        ## of prediction results are easy to spot.
        insert_query = '''INSERT INTO page_predictions
                          VALUES (%s, %s,
                                  %s, %s, %s, %s, %s,
                                  %s, %s, %s, %s, %s, %s, %s)'''
        
        # Set up a signal handler for SIGUSR1
        signal.signal(signal.SIGUSR1, self.handle_signal)

        # Create the ORES session variable
        ores_session = ORESSession(self.ores_url, self.ores_user_agent)

        ## Connect to the database
        wiki_db_conn = db.connect("{}.labsdb".format(self.wiki),
                                  "{}_p".format(self.wiki),
                                  self.db_conf)
        if not wiki_db_conn:
            logging.error("unable to connect to Wiki database")
            return

        local_db_conn = db.connect(self.local_db_host,
                                   self.local_db_name,
                                   self.db_conf)
        if not local_db_conn:
            logging.error("unable to connect to tools database")
            return
        
        logging.info("Running...")
        for event in EventSource(self.rc_url):
            if self.shutdown:
                break

            data = self.filter_event(event)
            if not data:
                continue

            ## Turn the timestamp into a datetime object
            data['timestamp'] = dt.datetime.fromtimestamp(
                data['timestamp'], tz=dt.timezone.utc)
            
            logging.info('{user} created {title}'.format_map(data))

            ## Check that it's not a redirect, not a disambiguation page,
            ## and not a list page.
            page_is_redirect = 0
            page_is_disambig = 0
            page_is_list = 0
            
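            ## Make sure the wiki replica connection is still alive;
            ## reconnect if the test query fails.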
            try:
                with db.cursor(wiki_db_conn) as db_cursor:
                    db_cursor.execute('SELECT * FROM page LIMIT 1')
            except MySQLdb.OperationalError as e:
                wiki_db_conn = db.connect("{}.labsdb".format(self.wiki),
                                          "{}_p".format(self.wiki),
                                          self.db_conf)
                
            with db.cursor(wiki_db_conn, 'dict') as db_cursor:
                db_cursor.execute(
                    page_check_query,
                    {'rev_id': data['revision']['new']})

                for row in db_cursor:
                    page_is_redirect = row['page_is_redirect']
                    page_is_disambig = row['page_is_disambig']
                    page_is_list = row['page_is_list']

            if page_is_redirect or page_is_disambig or page_is_list:
                continue

            logging.info("{user} created {title} which is not a redirect, not a disambiguation page, and not a list".format_map(data))

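            ## Same liveness check for the local tool database connection.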
            try:
                with db.cursor(local_db_conn) as db_cursor:
                    db_cursor.execute('SELECT * FROM page_predictions LIMIT 1')
            except MySQLdb.OperationalError as e:
                local_db_conn = db.connect(self.local_db_host,
                                   self.local_db_name,
                                   self.db_conf)

            with db.cursor(local_db_conn, 'dict') as db_cursor:
                ## Grab the wp10 and draftquality predictions from ORES
                ## for the given revision and store it in the database:
                for prediction in ores_session.score(data['wiki'],
                                                     self.ores_models,
                                                     [data['revision']['new']]):
                    try:
                        draft_res = prediction['draftquality']['score']
                        wp10_res = prediction['wp10']['score']
                    except KeyError:
                        logging.warning('unexpected ORES result')
                        continue

                    try:
                        db_cursor.execute(
                            insert_query, (data['revision']['new'],
                                           data['timestamp'],
                                           draft_res['prediction'],
                                           draft_res['probability']['spam'],
                                           draft_res['probability']['vandalism'],
                                           draft_res['probability']['attack'],
                                           draft_res['probability']['OK'],
                                           wp10_res['prediction'],
                                           wp10_res['probability']['Stub'],
                                           wp10_res['probability']['Start'],
                                           wp10_res['probability']['C'],
                                           wp10_res['probability']['B'],
                                           wp10_res['probability']['GA'],
                                           wp10_res['probability']['FA']))
                        local_db_conn.commit()
                        print("inserted {}, created by {}, draftquality prediction {}, wp10 prediction {}".format(data['title'], data['user'], draft_res['prediction'], wp10_res['prediction']))
                    except Exception as e:
                        print(e)

        ## ok, done
        local_db_conn.close()
        wiki_db_conn.close()
        return
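filter_event() is not shown in this fragment; an illustrative sketch, assuming it keeps only page-creation events from the configured wiki and returns the parsed change (every detail below is an assumption):

    def filter_event(self, event):
        # Hypothetical sketch: parse the SSE message and keep only
        # page creations on the wiki this instance is configured for.
        if event.event != 'message':
            return None
        try:
            change = json.loads(event.data)
        except ValueError:
            return None
        if change.get('wiki') != self.wiki or change.get('type') != 'new':
            return None
        return change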
    def stream(self, start_date):

        # construct file and fout
        print("Working on a new file..")
        user_cnt = 0
        register_cnt = 0

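        # Derive the filename suffix from the rollover granularity implied
        # by time_gap (day, hour, or minute of the start date).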
        unit = 0
        if self.time_gap >= 60 * 60 * 24:
            unit = start_date.day
        elif self.time_gap >= 60 * 60:
            unit = start_date.hour
        elif self.time_gap >= 60:
            unit = start_date.minute

        self.fout_newcomers = open(
            self.output_dir + "/" + self.newcomers_file + str(unit) + ".csv",
            "w")
        self.fout_newreg = open(
            self.output_dir + "/" + self.new_registered_file + str(unit) +
            ".csv", "w")
        print("user_cnt**user_text**article**timestamp",
              file=self.fout_newcomers)
        print("user_cnt**user_text**timestamp", file=self.fout_newreg)

        for event in EventSource(self.url):

            # check if times up for the next file (over a day)
            if self.times_up(start_date):
                break

            if event.event == 'message':
                try:
                    change = json.loads(event.data)
                except ValueError:
                    pass

                else:
                    if change['wiki'] != "enwiki":
                        continue

                    if (change['type'] == "log"
                            and change['log_type'] == "newusers"
                            and change['log_action'] == "create"
                            and 'user' in change
                            and change['user'] is not None):

                        self.NEW_USERS[change['user']] = 0

                        # TODO: add on edit hour, day, and month
                        register_cnt += 1
                        print("Registered {}. {}".format(
                            register_cnt,
                            change.get('user').encode('utf8')))
                        print("{}**{}**{}".format(register_cnt, change['user'],
                                                  change['timestamp']),
                              file=self.fout_newreg)
                        self.fout_newreg.flush()

                    elif change['type'] in ('edit', 'new'):
                        username = change.get('user')
                        if username in self.NEW_USERS:

                            if self.NEW_USERS[username] == 0 and change[
                                    'namespace'] == 0:
                                self.NEW_USERS[username] += 1

                                # TODO: add on edit hour, day, and month
                                user_cnt += 1
                                print("Edited {}. {} edited {}.".format(
                                    user_cnt, username.encode('utf8'),
                                    change['title'].encode('utf8')))
                                # Match the header row: user_cnt**user_text**article**timestamp
                                print("{}**{}**{}**{}".format(
                                    user_cnt, username, change['title'],
                                    change['timestamp']),
                                      file=self.fout_newcomers)
                                self.fout_newcomers.flush()