def parse(self, response):
    """Collect the details of one request page and store them as a row.

    Revision data is requested through ``api.revisions`` for the response
    URL with the ``https://www.wikidata.org/wiki/`` prefix removed. All other
    values are taken from ``response`` through the ``get_*`` helpers, the
    ``self.xpath`` lookups and the symbol XPath expressions in
    ``self.XPATH['symbols']``. The finished row is inserted into the
    ``requests_for_permissions`` table.

    :param response: crawled page; must provide ``url``, ``css`` and
        ``xpath``.
    """
    page_url = response.url
    revisions = api.revisions(
        [page_url.replace('https://www.wikidata.org/wiki/', '')])
    bot_link = self.get_bot_url(response)
    operator_link = self.get_operator_url(response)

    # Keys are added in the same order as the stored columns are expected.
    row = {}
    row['url'] = unquote(unescape(page_url.replace('https://', '')))
    row['bot_url'] = bot_link
    row['bot_name'] = self.get_bot_name(response)
    row['bot_has_red_link'] = int(bool(striper.RED_LINK_RE.match(bot_link)))
    row['operator_url'] = operator_link
    row['operator_name'] = self.get_operator_name(response)
    row['operator_has_red_link'] = int(
        bool(striper.RED_LINK_RE.match(operator_link)))
    row['is_successful'] = int(
        page_url in self.json_data['successful_requests'])
    row['first_edit'] = self.get_first_edit(revisions)
    row['last_edit'] = self.get_last_edit(revisions)
    row['closed_at'] = self.get_closed_at(response)
    row['revision_count'] = self.get_revision_count(revisions)
    row['editor_count'] = self.get_editor_count(revisions)
    row['html'] = response.css('div#bodyContent').extract_first()
    # Placeholders keep these keys at their position; values follow below.
    row['task'] = None
    row['code'] = None
    row['function'] = None
    row['archive_comment'] = self.get_archive_comment(page_url)
    row['summary'] = None
    row['retrieved_at'] = datetime.datetime.utcnow().strftime(
        "%Y-%m-%d %H:%M:%S")

    for field in ('task', 'code', 'function', 'summary'):
        row[field] = self.xpath(field, response)

    # One "<name>_symbol_count" entry per configured symbol expression.
    row.update({
        name + '_symbol_count': len(response.xpath(expression))
        for name, expression in self.XPATH['symbols'].items()
    })

    db.insert('requests_for_permissions', row)
def print_average_edit_count_per_day(cls):
    """Print every bot's average number of edits per day since registration.

    Rows of ``cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY`` are read by
    position: user id, bot name, edit count and registration timestamp
    (``%Y-%m-%dT%H:%M:%SZ``). The average is truncated to an integer and the
    table is printed sorted by average, then by bot name.
    """
    data = {
        'userid': [],
        'bot': [],
        'edit_count_per_day': [],
        'registration': [],
    }

    # One reference point for all rows instead of a new one per iteration.
    current_date = datetime.now()

    for item in db.execute(cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY):
        registration_date = datetime.strptime(item[3], '%Y-%m-%dT%H:%M:%SZ')

        # An account registered less than a day ago has an age of 0 days;
        # count it as one day so the division below cannot fail.
        days_since_registration = max(
            (current_date - registration_date).days, 1)

        data['userid'].append(item[0])
        data['bot'].append(item[1])
        data['edit_count_per_day'].append(
            int(item[2] / days_since_registration))
        data['registration'].append(str(registration_date.date()))

    df = pd.DataFrame(data)
    df = df.sort_values(by=['edit_count_per_day', 'bot'])

    print_df(
        df,
        ['userid', 'bot', 'edit_count_per_day', 'registration'],
        ['12', '35', '20', '20'],
    )
def print_bots_with_bot_flag_and_in_bot_group(cls):
    """Print the distinct names returned by the bot-flag-and-bot-group query.

    The first column of every row of
    ``cls.BOTS_WITH_BOT_FLAG_AND_IN_BOT_GROUP_QUERY`` is collected into a set
    and handed to ``print_names_and_count``.
    """
    names = {
        row[0]
        for row in db.execute(cls.BOTS_WITH_BOT_FLAG_AND_IN_BOT_GROUP_QUERY)
    }
    print_names_and_count(
        names,
        'unique bots with a bot flag and which belong to the group bot'
    )
def print_bots_without_request(cls):
    """Print the distinct names returned by ``cls.BOTS_WITHOUT_REQUEST_QUERY``.

    Only the first column of each row is used.
    """
    names = {row[0] for row in db.execute(cls.BOTS_WITHOUT_REQUEST_QUERY)}
    print_names_and_count(names, 'bots without a request for permission')
def print_bots_in_bot_group(cls):
    """Print the distinct bots in the bot group that lack a bot flag.

    The method name is broader than its result: the names come from
    ``cls.BOTS_WITHOUT_BOT_FLAG_IN_BOT_GROUP_QUERY`` (first column only).
    """
    names = {
        row[0]
        for row in db.execute(cls.BOTS_WITHOUT_BOT_FLAG_IN_BOT_GROUP_QUERY)
    }
    print_names_and_count(
        names,
        'unique bots which belong to the group bot and do not have a bot flag'
    )
def print_unique_bots(cls):
    """Print the distinct names returned by ``cls.UNIQUE_BOTS_QUERY``.

    Only the first column of each row is used.
    """
    names = {row[0] for row in db.execute(cls.UNIQUE_BOTS_QUERY)}
    print_names_and_count(names, 'unique bots')
def print_bots_without_request_without_groups(cls):
    """Print the distinct bots without a request whose group value is NULL.

    Rows of ``cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY`` are read as
    (name, groups, ...); a row is kept only when its second column is
    ``None``.
    """
    names = set()
    for row in db.execute(cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY):
        if row[1] is None:
            names.add(row[0])

    print_names_and_count(names, 'bots without a request and without groups')
def print_request_for_permission_without_editor_count(cls):
    """Print the first column of ``cls.REQUEST_WITHOUT_EDITOR_COUNT_QUERY``.

    Duplicates are kept (a list, not a set, is passed on) and ``"\\n"`` is
    given to ``print_names_and_count`` as its third argument.
    """
    entries = []
    for row in db.execute(cls.REQUEST_WITHOUT_EDITOR_COUNT_QUERY):
        entries.append(row[0])

    print_names_and_count(
        entries,
        'requests for permissions without editor_count',
        "\n"
    )
def print_request_for_permission_without_closed_at(cls):
    """Print the first column of ``cls.REQUEST_WITHOUT_CLOSED_AT_QUERY``.

    Duplicates are kept (a list, not a set, is passed on) and ``"\\n"`` is
    given to ``print_names_and_count`` as its third argument.
    """
    entries = []
    for row in db.execute(cls.REQUEST_WITHOUT_CLOSED_AT_QUERY):
        entries.append(row[0])

    print_names_and_count(
        entries,
        'requests for permissions without closed_at',
        "\n"
    )
def plot_distribution_over_time(cls, sql):
    """Plot how many rows of ``sql`` fall on each key of a time series.

    The series is created by ``cls.init_time_series`` from the values
    returned by ``sql + cls.SQL_MIN`` and ``sql + cls.SQL_MAX``. Every value
    has the matches of ``cls.TIME_RE`` removed before it is used as a key.

    :param sql: query whose first column holds the timestamp string.
    """
    def to_key(raw):
        # Same normalisation for the bounds and for every counted row.
        return re.sub(cls.TIME_RE, '', raw)

    # If a bound query yields several rows, the last one wins; with no rows
    # the bound stays None.
    first_key = None
    for row in db.execute(sql + cls.SQL_MIN):
        first_key = to_key(row[0])

    last_key = None
    for row in db.execute(sql + cls.SQL_MAX):
        last_key = to_key(row[0])

    counts = cls.init_time_series(first_key, last_key)

    for row in db.execute(sql):
        counts[to_key(row[0])] += 1

    cls.plot(counts, ['date', 'count'])
def print_groups_differences(cls):
    """Print the groups exclusive to bots with and without a request.

    Both queries are read as (name, comma-separated groups, ...). Rows whose
    group column is ``None`` are skipped. Two lines are printed: groups only
    found for bots with a request, then groups only found for bots without.
    """
    def collect_groups(query):
        # Distinct group names over all rows that carry a group string.
        return {
            group
            for row in db.execute(query)
            if row[1] is not None
            for group in row[1].split(',')
        }

    with_request = collect_groups(cls.GROUPS_OF_BOTS_WITH_REQUEST_QUERY)
    without_request = collect_groups(cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY)

    only_with_request = with_request - without_request
    only_without_request = without_request - with_request

    print(
        "#################### All groups that bots with a request for permission belong to but all other bots do not: ####################\n",
        ', '.join(only_with_request),
        "\n")
    print(
        "#################### All groups that bots without a request for permission belong to but all other bots do not: ####################\n",
        ', '.join(only_without_request),
        "\n")
def print_bots_with_request_without_groups(cls):
    """Print bots with a request but no groups, split by the red-link flag.

    Rows of ``cls.GROUPS_OF_BOTS_WITH_REQUEST_QUERY`` are read as
    (name, groups, red-link flag). Only rows whose group column is ``None``
    are reported; a flag equal to ``1`` puts the bot in the red-link list.
    """
    red_link = []
    no_red_link = []

    for row in db.execute(cls.GROUPS_OF_BOTS_WITH_REQUEST_QUERY):
        if row[1] is not None:
            continue
        bucket = red_link if row[2] == 1 else no_red_link
        bucket.append(row[0])

    print_names_and_count(
        red_link, 'bots with a request, without groups and a red link')
    print_names_and_count(
        no_red_link,
        'bots with a request, without groups and without a red link')
def plot_distribution(cls, sql):
    """Plot the ``count`` per ``editor_count`` value returned by ``sql``.

    Every integer between the minimum and the maximum (both obtained from
    ``sql`` with an ``ORDER BY editor_count`` suffix) starts at 0, so values
    the query does not return still show up in the plot.

    :param sql: query yielding (editor_count, count) rows.
    """
    ordered_sql = sql + ''' ORDER BY editor_count'''
    lowest = cls.get_min_value(ordered_sql)
    highest = cls.get_max_value(ordered_sql)

    # Zero-filled range first, then overwritten with the actual counts.
    histogram = dict.fromkeys(range(lowest, highest + 1), 0)
    for row in db.execute(sql):
        histogram[row[0]] = row[1]

    cls.plot(histogram, ['editor_count', 'count'])
def print_right_differences_for_request(cls):
    """Print the rights exclusive to bots with and without a request.

    Both queries are read as (name, comma-separated rights, ...). Rows whose
    rights column is ``None`` are skipped. Two lines are printed: rights only
    found for bots with a request, then rights only found for bots without.
    """
    def collect_rights(query):
        # Distinct right names over all rows that carry a rights string.
        return {
            right
            for row in db.execute(query)
            if row[1] is not None
            for right in row[1].split(',')
        }

    with_request = collect_rights(cls.RIGHTS_OF_BOTS_WITH_REQUEST_QUERY)
    without_request = collect_rights(cls.RIGHTS_OF_BOTS_WITHOUT_REQUEST_QUERY)

    only_with_request = with_request - without_request
    only_without_request = without_request - with_request

    print(
        "#################### All rights that bots with a request for permission have but all other bots do not have: ####################\n",
        ', '.join(only_with_request),
        "\n")
    print(
        "#################### All rights that bots without a request for permission have but all other bots do not have: ####################\n",
        ', '.join(only_without_request),
        "\n")
def print_rights_of_bot_in_bot_group(cls):
    """Print how often each right occurs in ``cls.RIGHTS_OF_BOTS_IN_GROUP_BOT``.

    The second column of every row is a comma-separated rights string; rows
    where it is ``None`` are skipped. The table is sorted by descending
    count, then by right name.
    """
    rights = []
    for row in db.execute(cls.RIGHTS_OF_BOTS_IN_GROUP_BOT):
        if row[1] is None:
            continue
        rights.extend(row[1].split(','))

    frame = pd.DataFrame({'right': rights})
    counts = frame.groupby(['right']).size().reset_index(name='count')
    ordered = counts.sort_values(
        by=['count', 'right'], ascending=[False, True])

    print_df(ordered, ['right', 'count'], ['30', '30'])
def print_editor_count(cls, mode='none'):
    """Print the URL and editor count of the requests selected by ``mode``.

    :param mode: key into ``cls.EDITOR_COUNT_QUERIES`` choosing the query to
        run; its rows are read as (url, editor_count, ...).
    """
    rows = list(db.execute(cls.EDITOR_COUNT_QUERIES[mode]))

    frame = pd.DataFrame({
        'url': [row[0] for row in rows],
        'editor_count': [row[1] for row in rows],
    })
    ordered = frame.sort_values(by=['editor_count', 'url'])

    print_df(ordered, ['url', 'editor_count'], ['90', '15'])
def print_right_differences_for_bot_flag_and_bot_group(cls):
    """Print the rights exclusive to flagged bots and to bot-group bots.

    Both queries are read as (name, comma-separated rights, ...). Rows whose
    rights column is ``None`` are skipped. Two lines are printed: rights only
    found for bots with a bot flag, then rights only found for bots in the
    bot group.
    """
    def collect_rights(query):
        # Distinct right names over all rows that carry a rights string.
        return {
            right
            for row in db.execute(query)
            if row[1] is not None
            for right in row[1].split(',')
        }

    flagged = collect_rights(cls.RIGHTS_OF_BOTS_WITH_BOT_FLAG_QUERY)
    grouped = collect_rights(cls.RIGHTS_OF_BOTS_IN_GROUP_BOT)

    only_flagged = flagged - grouped
    only_grouped = grouped - flagged

    print(
        "#################### All rights that bots with a bot flag have but bots in bot group do not have: ####################\n",
        ', '.join(only_flagged),
        "\n")
    print(
        "#################### All rights that bots which belong to the bot group have but bots with a bot flag do not have: ####################\n",
        ', '.join(only_grouped),
        "\n")
def plot_general_statistics_about_requests(cls):
    """Plot the general request and bot statistics as grouped horizontal bars.

    ``cls.GENERAL_STATISTICS_ABOUT_REQUESTS_QUERIES`` maps a target
    (``'request'`` or ``'bot'``) to a mapping of statistic name -> query.
    For every query, the first column of its (last) row becomes the bar
    value.
    """
    stats = {'request': {}, 'bot': {}}

    all_queries = cls.GENERAL_STATISTICS_ABOUT_REQUESTS_QUERIES
    for target, queries in all_queries.items():
        for statistic, query in queries.items():
            for row in db.execute(query):
                stats[target][statistic] = row[0]

    def horizontal_bar(values, label, fill, outline):
        # One trace: statistic names on the y axis, their values on x.
        return Bar(
            y=list(values.keys()),
            x=list(values.values()),
            name=label,
            orientation='h',
            marker=dict(
                color=fill,
                line=dict(color=outline, width=1.5),
            ),
            opacity=0.8
        )

    request_trace = horizontal_bar(
        stats['request'], 'Requests for Permissions',
        'rgb(235,173,104)', 'rgb(185,125,54)')
    bot_trace = horizontal_bar(
        stats['bot'], 'Bots',
        'rgb(204,204,204)', 'rgb(150,150,150)')

    layout = Layout(
        xaxis=dict(tickangle=-45),
        barmode='group',
    )

    figure = Figure(data=[request_trace, bot_trace], layout=layout)
    iplot(figure, filename='angled-text-bar')
def plot_bots_groups_distribution(cls):
    """Print each group combination with its count and percentage share.

    Rows of ``cls.GROUPED_GROUPS_OF_BOTS_QUERY`` are read as
    (comma-separated groups, count). The groups of a row are sorted and
    re-joined with ``', '``; the ``%`` column is the count's share of the
    total, rounded to two decimals.
    """
    labels = []
    counts = []
    for row in db.execute(cls.GROUPED_GROUPS_OF_BOTS_QUERY):
        labels.append(', '.join(sorted(row[0].split(','))))
        counts.append(row[1])

    frame = pd.DataFrame({'groups': labels, 'count': counts})
    share = frame['count'] / frame['count'].sum() * 100
    frame['%'] = round(share, 2)

    print_df(frame, ['groups', 'count', '%'], ['60', '15', '15'])
def print_groups_of_bots_without_request(cls):
    """Print how often each group occurs among bots without a request.

    The second column of every row of
    ``cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY`` is a comma-separated group
    string; rows where it is ``None`` are skipped. The table is sorted by
    descending count, then by group name.
    """
    groups = []
    for row in db.execute(cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY):
        if row[1] is not None:
            groups.extend(row[1].split(','))

    frame = pd.DataFrame({'group': groups})
    counts = frame.groupby(['group']).size().reset_index(name='count')
    ordered = counts.sort_values(
        by=['count', 'group'], ascending=[False, True])

    print_df(ordered, ['group', 'count'], ['30', '30'])
def print_edit_count(cls):
    """Print every bot's edit count together with its registration date.

    Rows of ``cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY`` are read as
    (userid, bot, edit_count, registration timestamp). The timestamp is
    parsed with ``%Y-%m-%dT%H:%M:%SZ`` and reduced to its date. The table is
    sorted by edit count, then by bot name.
    """
    user_ids = []
    bot_names = []
    edit_counts = []
    registrations = []

    for row in db.execute(cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY):
        user_ids.append(row[0])
        bot_names.append(row[1])
        edit_counts.append(row[2])
        registered = datetime.strptime(row[3], '%Y-%m-%dT%H:%M:%SZ')
        registrations.append(str(registered.date()))

    frame = pd.DataFrame({
        'userid': user_ids,
        'bot': bot_names,
        'edit_count': edit_counts,
        'registration': registrations,
    })
    ordered = frame.sort_values(by=['edit_count', 'bot'])

    print_df(
        ordered,
        ['userid', 'bot', 'edit_count', 'registration'],
        ['12', '35', '12', '20'],
    )
def plot_bots_groups_without_implicit_groups_distribution(cls):
    """Print the distribution of explicit group combinations among bots.

    Rows of ``cls.GROUPED_GROUPS_OF_BOTS_WITHOUT_IMPLICIT_GROUPS_QUERY`` are
    read as (all groups, implicit groups), both comma-separated. The implicit
    groups are removed, the remainder is sorted and joined with ``', '``, and
    identical combinations are counted. Output is sorted by descending count,
    then by combination, with a ``%`` column rounded to two decimals.
    """
    query = cls.GROUPED_GROUPS_OF_BOTS_WITHOUT_IMPLICIT_GROUPS_QUERY

    labels = []
    for row in db.execute(query):
        explicit = set(row[0].split(',')) - set(row[1].split(','))
        labels.append(', '.join(sorted(explicit)))

    frame = pd.DataFrame({'groups': labels})
    counts = frame.groupby(['groups']).size().reset_index(name='count')
    counts = counts.sort_values(
        by=['count', 'groups'], ascending=[False, True])

    share = counts['count'] / counts['count'].sum() * 100
    counts['%'] = round(share, 2)

    print_df(counts, ['groups', 'count', '%'], ['40', '15', '15'])
def print_bots_with_request_without_rights(cls):
    """Print bots with a request but no rights, split by the red-link flag.

    Rows of ``cls.RIGHTS_OF_BOTS_WITH_REQUEST_QUERY`` are read as
    (name, rights, red-link flag). Only rows whose rights column is ``None``
    are reported; bots that do have rights are ignored here. A flag equal to
    ``1`` puts the bot in the red-link list.
    """
    red_link = []
    no_red_link = []

    for row in db.execute(cls.RIGHTS_OF_BOTS_WITH_REQUEST_QUERY):
        if row[1] is not None:
            continue
        bucket = red_link if row[2] == 1 else no_red_link
        bucket.append(row[0])

    print_names_and_count(
        red_link, 'bots with a request, without rights and a red link')
    print_names_and_count(
        no_red_link,
        'bots with a request, without rights and without a red link')
# Run script: resets the database, runs every spider, then parses the bot
# groups, builds the bots table and migrates the database.
import logging
import subprocess

import yaml

from db import SqliteDb as db
from parser import BotsGroupParser as bp
from parser import BotsTableCreator as bc

# safe_load only builds plain Python objects. yaml.load() without an explicit
# Loader can construct arbitrary objects and is rejected by PyYAML >= 6.
with open('config.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

logging.basicConfig(filename=config['log'], level=logging.DEBUG)

# Spiders are run in exactly this order.
spiders = [
    'archives_spider',
    'request_for_permission_spider',
    'requests_for_permissions_spider',
    'bots_with_botflag_spider',
    'extension_bots_spider',
    'bots_without_botflag_spider',
    'bots_with_requests_for_permissions_spider',
]

db.reset()

# run all spiders
for spider in spiders:
    exit_code = subprocess.call(['scrapy', 'runspider', "src/%s.py" % spider])
    if exit_code != 0:
        # Keep going as before, but leave a trace of the failed spider.
        logging.warning(
            "spider %s exited with code %s", spider, exit_code)

bp.parse()
bc.create()

db.migrate()
def __init__(self, **kwargs):
    """Initialise the parent class, then reset the database.

    :param kwargs: passed on unchanged to the parent ``__init__``.
    """
    super().__init__(**kwargs)
    # NOTE(review): db.reset() runs on every instantiation; presumably this
    # clears previously stored data - confirm against the db module.
    db.reset()
def retrieve_bots(sql):
    """Run ``sql`` and return the first column of its rows plus their number.

    :param sql: query to execute through ``db.execute``.
    :return: tuple ``(names, count)`` where ``names`` is the list of
        first-column values and ``count`` is its length.
    """
    names = []
    for row in db.execute(sql):
        names.append(row[0])
    return names, len(names)
def create(cls):
    """Fill the ``bots`` table with the user data of every known bot name.

    Bot names are taken from the first row of each CSV file in ``cls.FILES``.
    They are de-duplicated and looked up through ``api.users`` in batches of
    50. Every returned user whose name is not yet in the ``bots`` table is
    inserted, enriched with:

    * ``retrieved_at`` - current UTC time,
    * ``has_botflag`` - 1 / 0 if the name is listed in
      ``bots_with_botflag.csv`` / ``bots_without_botflag.csv``, else None,
    * ``is_extension_bot`` - 1 if listed in ``extension_bots.csv``, else 0.

    Users reported as ``invalid`` or ``missing`` are stored with only their
    name and these three fields.
    """
    def read_first_row(path):
        # The name lists are read from the first CSV row only; an empty file
        # yields no names instead of raising.
        with open(path) as f:
            return next(csv.reader(f), [])

    bots = set()
    for file in cls.FILES:
        bots.update(read_first_row(file))

    # Sets make the per-bot membership tests below constant time.
    bots_with_botflag = set(
        read_first_row('data/spiders/bots_with_botflag.csv'))
    bots_without_botflag = set(
        read_first_row('data/spiders/bots_without_botflag.csv'))
    extension_bots = set(
        read_first_row('data/spiders/extension_bots.csv'))

    block_fields = (
        'blockid',
        'blockedby',
        'blockedbyid',
        'blockedtimestamp',
        'blockreason',
        'blockexpiry',
    )

    # Materialise once; stepping by the batch size never produces an empty
    # trailing batch (the old index arithmetic did whenever the number of
    # bots was a multiple of 50, including zero).
    bot_names = list(bots)
    batch_size = 50

    for start in range(0, len(bot_names), batch_size):
        batch = bot_names[start:start + batch_size]

        for bot in api.users(batch)['query']['users']:
            name = bot['name']

            if db.exists('bots', 'name', name):
                continue

            if name in bots_with_botflag:
                has_botflag = 1
            elif name in bots_without_botflag:
                has_botflag = 0
            else:
                has_botflag = None

            bot['retrieved_at'] = datetime.datetime.utcnow().strftime(
                "%Y-%m-%d %H:%M:%S")
            bot['has_botflag'] = has_botflag
            bot['is_extension_bot'] = 1 if name in extension_bots else 0

            if 'invalid' in bot or 'missing' in bot:
                # No further user data available: store the minimal record.
                db.insert('bots', {
                    'name': name,
                    'retrieved_at': bot['retrieved_at'],
                    'has_botflag': bot['has_botflag'],
                    'is_extension_bot': bot['is_extension_bot'],
                })
                continue

            bot.pop('blockinfo', None)

            # List values are stored as comma-separated strings.
            bot['groups'] = ','.join(bot['groups'])
            bot['implicitgroups'] = ','.join(bot['implicitgroups'])
            bot['rights'] = ','.join(bot['rights'])

            # Block details are only present for blocked users; make sure
            # every block key exists so all inserted rows have the same keys.
            for field in block_fields:
                bot[field] = bot.get(field)

            db.insert('bots', bot)
def get_max_value(cls, sql):
    """Return the first column of the first row of ``sql + cls.SQL_MAX``.

    :param sql: query to which ``cls.SQL_MAX`` is appended.
    :raises IndexError: if the query returns no rows.
    """
    values = []
    for row in db.execute(sql + cls.SQL_MAX):
        values.append(row[0])
    return values[0]