Esempio n. 1
0
    def parse(self, response):
        """Collect the data of one request page and store it as a database row.

        The row is assembled from the scraped ``response`` and from the result
        of ``api.revisions`` for the page title (the URL without the
        ``https://www.wikidata.org/wiki/`` prefix), then inserted into the
        ``requests_for_permissions`` table.

        Args:
            response: The scraped page. Its ``url``, ``css`` and ``xpath``
                members are used (presumably a Scrapy response — confirm).
        """
        page_url = response.url
        page_title = page_url.replace('https://www.wikidata.org/wiki/', '')
        revisions = api.revisions([page_title])

        bot_url = self.get_bot_url(response)
        operator_url = self.get_operator_url(response)

        data = {
            'url': unquote(unescape(page_url.replace('https://', ''))),
            'bot_url': bot_url,
            'bot_name': self.get_bot_name(response),
            'bot_has_red_link': int(bool(striper.RED_LINK_RE.match(bot_url))),
            'operator_url': operator_url,
            'operator_name': self.get_operator_name(response),
            'operator_has_red_link': int(
                bool(striper.RED_LINK_RE.match(operator_url))),
            'is_successful': int(
                page_url in self.json_data['successful_requests']),
            'first_edit': self.get_first_edit(revisions),
            'last_edit': self.get_last_edit(revisions),
            'closed_at': self.get_closed_at(response),
            'revision_count': self.get_revision_count(revisions),
            'editor_count': self.get_editor_count(revisions),
            'html': response.css('div#bodyContent').extract_first(),
            # Placeholders: they reserve the position of these keys in the
            # row and are filled in right after the literal.
            'task': None,
            'code': None,
            'function': None,
            'archive_comment': self.get_archive_comment(page_url),
            'summary': None,
            'retrieved_at': datetime.datetime.utcnow().strftime(
                "%Y-%m-%d %H:%M:%S"),
        }

        # Fill the placeholders with the values extracted via self.xpath.
        data.update({
            field: self.xpath(field, response)
            for field in ('task', 'code', 'function', 'summary')
        })

        # One "<name>_symbol_count" entry per configured symbol expression.
        for name, expression in self.XPATH['symbols'].items():
            data[name + '_symbol_count'] = len(response.xpath(expression))

        db.insert('requests_for_permissions', data)
    def print_average_edit_count_per_day(cls):
        """Print each bot's average number of edits per day since registration.

        Runs ``cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY`` and reads every
        row as ``(userid, bot, edit_count, registration)``, where
        ``registration`` is a ``%Y-%m-%dT%H:%M:%SZ`` timestamp. The average is
        truncated to an integer. The rows are sorted by that average and then
        by bot, and printed with ``print_df``.
        """
        from datetime import timezone

        # The registration timestamps end in "Z", i.e. they are UTC, so the
        # reference time has to be UTC too. It is taken once so that all rows
        # are measured against the same instant.
        current_date = datetime.now(timezone.utc).replace(tzinfo=None)

        data = {
            'userid': [],
            'bot': [],
            'edit_count_per_day': [],
            'registration': []
        }

        for item in db.execute(cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY):
            registration_date = datetime.strptime(item[3], '%Y-%m-%dT%H:%M:%SZ')

            # An account younger than one day would yield 0 days and a
            # ZeroDivisionError; count it as a single day instead.
            days_since_registration = max(
                (current_date - registration_date).days, 1)

            data['userid'].append(item[0])
            data['bot'].append(item[1])
            data['edit_count_per_day'].append(
                int(item[2] / days_since_registration))
            data['registration'].append(str(registration_date.date()))

        df = pd.DataFrame(data)
        df = df.sort_values(by=['edit_count_per_day', 'bot'])

        print_df(df, ['userid', 'bot', 'edit_count_per_day', 'registration'], ['12', '35', '20', '20'])
    def print_bots_with_bot_flag_and_in_bot_group(cls):
        """Print the distinct bots found by BOTS_WITH_BOT_FLAG_AND_IN_BOT_GROUP_QUERY.

        The first column of every result row is collected into a set, which is
        passed to ``print_names_and_count`` together with a description.
        """
        rows = db.execute(cls.BOTS_WITH_BOT_FLAG_AND_IN_BOT_GROUP_QUERY)
        unique_names = {row[0] for row in rows}
        description = 'unique bots with a bot flag and which belong to the group bot'

        print_names_and_count(unique_names, description)
    def print_bots_without_request(cls):
        """Print the distinct bots found by BOTS_WITHOUT_REQUEST_QUERY.

        The first column of every result row is collected into a set, which is
        passed to ``print_names_and_count`` together with a description.
        """
        rows = db.execute(cls.BOTS_WITHOUT_REQUEST_QUERY)
        unique_names = {row[0] for row in rows}

        print_names_and_count(
            unique_names, 'bots without a request for permission')
    def print_bots_in_bot_group(cls):
        """Print the distinct bots found by BOTS_WITHOUT_BOT_FLAG_IN_BOT_GROUP_QUERY.

        Despite the method name, the query used is the "without bot flag" one.
        The first column of every result row is collected into a set, which is
        passed to ``print_names_and_count`` together with a description.
        """
        rows = db.execute(cls.BOTS_WITHOUT_BOT_FLAG_IN_BOT_GROUP_QUERY)
        unique_names = {row[0] for row in rows}
        description = 'unique bots which belong to the group bot and do not have a bot flag'

        print_names_and_count(unique_names, description)
    def print_unique_bots(cls):
        """Print the distinct bots found by UNIQUE_BOTS_QUERY.

        The first column of every result row is collected into a set, which is
        passed to ``print_names_and_count`` together with a description.
        """
        unique_names = {row[0] for row in db.execute(cls.UNIQUE_BOTS_QUERY)}

        print_names_and_count(unique_names, 'unique bots')
    def print_bots_without_request_without_groups(cls):
        """Print the distinct bots without a request whose groups value is None.

        Rows come from ``cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY``; the first
        column is the name that gets printed, the second column is the groups
        value that is tested.
        """
        rows = db.execute(cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY)
        names_without_groups = {row[0] for row in rows if row[1] is None}

        print_names_and_count(
            names_without_groups, 'bots without a request and without groups')
    def print_request_for_permission_without_editor_count(cls):
        """Print the results of REQUEST_WITHOUT_EDITOR_COUNT_QUERY.

        The first column of every row is kept (duplicates included) and passed
        to ``print_names_and_count`` with a description and ``"\\n"`` as the
        third argument.
        """
        requests = []
        for row in db.execute(cls.REQUEST_WITHOUT_EDITOR_COUNT_QUERY):
            requests.append(row[0])

        print_names_and_count(
            requests, 'requests for permissions without editor_count', "\n")
    def print_request_for_permission_without_closed_at(cls):
        """Print the results of REQUEST_WITHOUT_CLOSED_AT_QUERY.

        The first column of every row is kept (duplicates included) and passed
        to ``print_names_and_count`` with a description and ``"\\n"`` as the
        third argument.
        """
        requests = []
        for row in db.execute(cls.REQUEST_WITHOUT_CLOSED_AT_QUERY):
            requests.append(row[0])

        print_names_and_count(
            requests, 'requests for permissions without closed_at', "\n")
    def plot_distribution_over_time(cls, sql):
        """Count the rows of ``sql`` per time bucket and plot the counts.

        The bucket of a row is its first column with every match of
        ``cls.TIME_RE`` removed. The series is created by
        ``cls.init_time_series`` from the buckets of ``sql + cls.SQL_MIN`` and
        ``sql + cls.SQL_MAX`` and is then incremented once per row of ``sql``.

        Args:
            sql: Query whose first result column is the timestamp string.
        """
        def bucket_of(row):
            # Strip whatever TIME_RE matches from the timestamp column.
            return re.sub(cls.TIME_RE, '', row[0])

        # If a query returns several rows, the last one wins; with no rows the
        # bound stays None.
        first_bucket = None
        for row in db.execute(sql + cls.SQL_MIN):
            first_bucket = bucket_of(row)

        last_bucket = None
        for row in db.execute(sql + cls.SQL_MAX):
            last_bucket = bucket_of(row)

        counts = cls.init_time_series(first_bucket, last_bucket)

        for row in db.execute(sql):
            counts[bucket_of(row)] += 1

        cls.plot(counts, ['date', 'count'])
    def print_groups_differences(cls):
        """Print the groups exclusive to bots with, and to bots without, a request.

        The second column of ``cls.GROUPS_OF_BOTS_WITH_REQUEST_QUERY`` and
        ``cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY`` is read as a
        comma-separated list of groups (``None`` values are skipped). Both
        set differences are printed in sorted order so that the output is the
        same on every run.
        """
        with_request_groups = set()

        for item in db.execute(cls.GROUPS_OF_BOTS_WITH_REQUEST_QUERY):
            if item[1] is not None:
                with_request_groups.update(item[1].split(','))

        without_request_groups = set()

        for item in db.execute(cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY):
            if item[1] is not None:
                without_request_groups.update(item[1].split(','))

        # Iteration order of a set of strings changes between interpreter
        # runs (hash randomization), hence the explicit sorting.
        print(
            "#################### All groups that bots with a request for permission belong to but all other bots do not: ####################\n",
            ', '.join(sorted(with_request_groups - without_request_groups)), "\n")
        print(
            "#################### All groups that bots without a request for permission belong to but all other bots do not: ####################\n",
            ', '.join(sorted(without_request_groups - with_request_groups)), "\n")
    def print_bots_with_request_without_groups(cls):
        """Print the bots with a request whose groups value is None.

        Rows of ``cls.GROUPS_OF_BOTS_WITH_REQUEST_QUERY`` are read as
        ``(name, groups, red_link)``. Bots without groups are split by whether
        the third column equals 1, and each list is printed separately.
        """
        red_linked = []
        not_red_linked = []

        for row in db.execute(cls.GROUPS_OF_BOTS_WITH_REQUEST_QUERY):
            if row[1] is not None:
                continue

            bucket = red_linked if row[2] == 1 else not_red_linked
            bucket.append(row[0])

        print_names_and_count(red_linked, 'bots with a request, without groups and a red link')
        print_names_and_count(not_red_linked, 'bots with a request, without groups and without a red link')
    def plot_distribution(cls, sql):
        """Plot the ``(value, count)`` rows of ``sql`` over a gap-free range.

        Every integer between the minimum and the maximum reported by
        ``cls.get_min_value`` / ``cls.get_max_value`` starts at 0, so values
        missing from the query result are still plotted. Each row of ``sql``
        then sets the count (second column) for its value (first column).

        Args:
            sql: Query returning the value in the first and its count in the
                second column.
        """
        ordered_sql = sql + ''' ORDER BY editor_count'''
        lowest = cls.get_min_value(ordered_sql)
        highest = cls.get_max_value(ordered_sql)

        histogram = dict.fromkeys(range(lowest, highest + 1), 0)

        for row in db.execute(sql):
            histogram[row[0]] = row[1]

        cls.plot(histogram, ['editor_count', 'count'])
    def print_right_differences_for_request(cls):
        """Print the rights exclusive to bots with, and to bots without, a request.

        The second column of ``cls.RIGHTS_OF_BOTS_WITH_REQUEST_QUERY`` and
        ``cls.RIGHTS_OF_BOTS_WITHOUT_REQUEST_QUERY`` is read as a
        comma-separated list of rights (``None`` values are skipped). Both
        set differences are printed in sorted order so that the output is the
        same on every run.
        """
        with_request_rights = set()

        for item in db.execute(cls.RIGHTS_OF_BOTS_WITH_REQUEST_QUERY):
            if item[1] is not None:
                with_request_rights.update(item[1].split(','))

        without_request_rights = set()

        for item in db.execute(cls.RIGHTS_OF_BOTS_WITHOUT_REQUEST_QUERY):
            if item[1] is not None:
                without_request_rights.update(item[1].split(','))

        # Iteration order of a set of strings changes between interpreter
        # runs (hash randomization), hence the explicit sorting.
        print(
            "#################### All rights that bots with a request for permission have but all other bots do not have: ####################\n",
            ', '.join(sorted(with_request_rights - without_request_rights)), "\n")
        print(
            "#################### All rights that bots without a request for permission have but all other bots do not have: ####################\n",
            ', '.join(sorted(without_request_rights - with_request_rights)), "\n")
    def print_rights_of_bot_in_bot_group(cls):
        """Print how often each right occurs in RIGHTS_OF_BOTS_IN_GROUP_BOT.

        The second column of every row is read as a comma-separated list of
        rights (``None`` values are skipped). The rights are counted and
        printed by descending count, ties broken by the right's name.
        """
        rights = []

        for row in db.execute(cls.RIGHTS_OF_BOTS_IN_GROUP_BOT):
            joined_rights = row[1]
            if joined_rights is None:
                continue
            rights.extend(joined_rights.split(','))

        frame = pd.DataFrame({'right': rights})
        counts = frame.groupby(['right']).size().reset_index(name='count')
        ordered = counts.sort_values(
            by=['count', 'right'], ascending=[False, True])

        print_df(ordered, ['right', 'count'], ['30', '30'])
    def print_editor_count(cls, mode='none'):
        """Print URL and editor count of the requests selected by ``mode``.

        Args:
            mode: Key into ``cls.EDITOR_COUNT_QUERIES`` choosing the query to
                run; an unknown key raises ``KeyError``.
        """
        rows = list(db.execute(cls.EDITOR_COUNT_QUERIES[mode]))

        frame = pd.DataFrame({
            'url': [row[0] for row in rows],
            'editor_count': [row[1] for row in rows],
        })
        ordered = frame.sort_values(by=['editor_count', 'url'])

        print_df(ordered, ['url', 'editor_count'], ['90', '15'])
    def print_right_differences_for_bot_flag_and_bot_group(cls):
        """Print the rights exclusive to bot-flag bots and to bot-group bots.

        The second column of ``cls.RIGHTS_OF_BOTS_WITH_BOT_FLAG_QUERY`` and
        ``cls.RIGHTS_OF_BOTS_IN_GROUP_BOT`` is read as a comma-separated list
        of rights (``None`` values are skipped). Both set differences are
        printed in sorted order so that the output is the same on every run.
        """
        with_bot_flag_rights = set()

        for item in db.execute(cls.RIGHTS_OF_BOTS_WITH_BOT_FLAG_QUERY):
            if item[1] is not None:
                with_bot_flag_rights.update(item[1].split(','))

        in_bot_group_rights = set()

        for item in db.execute(cls.RIGHTS_OF_BOTS_IN_GROUP_BOT):
            if item[1] is not None:
                in_bot_group_rights.update(item[1].split(','))

        # Iteration order of a set of strings changes between interpreter
        # runs (hash randomization), hence the explicit sorting.
        print(
            "#################### All rights that bots with a bot flag have but bots in bot group do not have: ####################\n",
            ', '.join(sorted(with_bot_flag_rights - in_bot_group_rights)), "\n")
        print(
            "#################### All rights that bots which belong to the bot group have but bots with a bot flag do not have: ####################\n",
            ', '.join(sorted(in_bot_group_rights - with_bot_flag_rights)), "\n")
    def plot_general_statistics_about_requests(cls):
        """Plot the general statistics as two groups of horizontal bars.

        ``cls.GENERAL_STATISTICS_ABOUT_REQUESTS_QUERIES`` maps a target
        (``'request'`` or ``'bot'``) to ``{statistic: query}``. The first
        column of each query's rows becomes the value of that statistic (the
        last row wins if there are several). One bar trace is drawn per
        target.
        """
        statistics = {
            'request': {},
            'bot': {}
        }

        queries_by_target = cls.GENERAL_STATISTICS_ABOUT_REQUESTS_QUERIES
        for target, queries in queries_by_target.items():
            for statistic, query in queries.items():
                for row in db.execute(query):
                    statistics[target][statistic] = row[0]

        def horizontal_bar(values, label, fill, outline):
            # Both traces differ only in their data, label and colours.
            return Bar(
                y=list(values.keys()),
                x=list(values.values()),
                name=label,
                orientation='h',
                marker=dict(
                    color=fill,
                    line=dict(color=outline, width=1.5),
                ),
                opacity=0.8
            )

        traces = [
            horizontal_bar(
                statistics['request'], 'Requests for Permissions',
                'rgb(235,173,104)', 'rgb(185,125,54)'),
            horizontal_bar(
                statistics['bot'], 'Bots',
                'rgb(204,204,204)', 'rgb(150,150,150)'),
        ]
        layout = Layout(
            xaxis=dict(tickangle=-45),
            barmode='group',
        )

        iplot(Figure(data=traces, layout=layout), filename='angled-text-bar')
    def plot_bots_groups_distribution(cls):
        """Print each combination of groups with its count and percentage.

        Rows of ``cls.GROUPED_GROUPS_OF_BOTS_QUERY`` are read as
        ``(comma-separated groups, count)``. The groups of a row are sorted
        and re-joined with ``', '``; the ``%`` column is the row's share of
        the total count, rounded to two decimals. Despite the name, the result
        is printed with ``print_df`` rather than plotted.
        """
        labels = []
        counts = []

        for row in db.execute(cls.GROUPED_GROUPS_OF_BOTS_QUERY):
            labels.append(', '.join(sorted(row[0].split(','))))
            counts.append(row[1])

        frame = pd.DataFrame({'groups': labels, 'count': counts})
        frame['%'] = round(frame['count'] / frame['count'].sum() * 100, 2)

        print_df(frame, ['groups', 'count', '%'], ['60', '15', '15'])
    def print_groups_of_bots_without_request(cls):
        """Print how often each group occurs among bots without a request.

        The second column of ``cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY`` is
        read as a comma-separated list of groups (``None`` values are
        skipped). The groups are counted and printed by descending count,
        ties broken by the group's name.
        """
        groups = []

        for row in db.execute(cls.GROUPS_OF_BOTS_WITHOUT_REQUEST_QUERY):
            joined_groups = row[1]
            if joined_groups is None:
                continue
            groups.extend(joined_groups.split(','))

        frame = pd.DataFrame({'group': groups})
        counts = frame.groupby(['group']).size().reset_index(name='count')
        ordered = counts.sort_values(
            by=['count', 'group'], ascending=[False, True])

        print_df(ordered, ['group', 'count'], ['30', '30'])
    def print_edit_count(cls):
        """Print user id, name, edit count and registration date of every bot.

        Rows of ``cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY`` are read as
        ``(userid, bot, edit_count, registration)``; the registration
        timestamp (``%Y-%m-%dT%H:%M:%SZ``) is reduced to its date. The table
        is sorted by edit count and then by bot.
        """
        timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
        userids, bots, edit_counts, registrations = [], [], [], []

        for row in db.execute(cls.EDIT_COUNT_WITH_REGISTRATION_DATE_QUERY):
            userids.append(row[0])
            bots.append(row[1])
            edit_counts.append(row[2])
            registered_on = datetime.strptime(row[3], timestamp_format).date()
            registrations.append(str(registered_on))

        frame = pd.DataFrame({
            'userid': userids,
            'bot': bots,
            'edit_count': edit_counts,
            'registration': registrations,
        })
        ordered = frame.sort_values(by=['edit_count', 'bot'])

        print_df(ordered, ['userid', 'bot', 'edit_count', 'registration'], ['12', '35', '12', '20'])
    def plot_bots_groups_without_implicit_groups_distribution(cls):
        """Print the distribution of explicit group combinations.

        Rows of ``cls.GROUPED_GROUPS_OF_BOTS_WITHOUT_IMPLICIT_GROUPS_QUERY``
        are read as ``(comma-separated groups, comma-separated implicit
        groups)``. For every row the implicit groups are removed and the rest
        is sorted and joined with ``', '``. The combinations are counted,
        ordered by descending count (ties by name), and given a ``%`` column
        rounded to two decimals. Despite the name, the result is printed with
        ``print_df`` rather than plotted.
        """
        combinations = []

        query = cls.GROUPED_GROUPS_OF_BOTS_WITHOUT_IMPLICIT_GROUPS_QUERY
        for row in db.execute(query):
            explicit = set(row[0].split(',')).difference(row[1].split(','))
            combinations.append(', '.join(sorted(explicit)))

        frame = pd.DataFrame({'groups': combinations})
        counts = frame.groupby(['groups']).size().reset_index(name='count')
        counts = counts.sort_values(
            by=['count', 'groups'], ascending=[False, True])
        counts['%'] = round(counts['count'] / counts['count'].sum() * 100, 2)

        print_df(counts, ['groups', 'count', '%'], ['40', '15', '15'])
    def print_bots_with_request_without_rights(cls):
        """Print the bots with a request whose rights value is None.

        Rows of ``cls.RIGHTS_OF_BOTS_WITH_REQUEST_QUERY`` are read as
        ``(name, rights, red_link)``. Bots without rights are split by whether
        the third column equals 1, and each list is printed separately.
        """
        # NOTE: a further listing of bots that have rights *and* a red link
        # existed here only as disabled code; it is not produced.
        red_linked = []
        not_red_linked = []

        for row in db.execute(cls.RIGHTS_OF_BOTS_WITH_REQUEST_QUERY):
            if row[1] is not None:
                continue

            bucket = red_linked if row[2] == 1 else not_red_linked
            bucket.append(row[0])

        print_names_and_count(red_linked, 'bots with a request, without rights and a red link')
        print_names_and_count(not_red_linked, 'bots with a request, without rights and without a red link')
import logging

from db import SqliteDb as db
from parser import BotsGroupParser as bp
from parser import BotsTableCreator as bc

# Read the configuration and set up logging before anything else runs.
with open('config.yaml', 'r', encoding='utf-8') as config_file:
    # safe_load builds plain Python objects only. Calling yaml.load without
    # an explicit Loader is rejected by current PyYAML releases and could
    # construct arbitrary objects on older ones.
    config = yaml.safe_load(config_file)

logging.basicConfig(filename=config['log'], level=logging.DEBUG)

# Spiders are run in exactly this order.
spiders = [
    'archives_spider',
    'request_for_permission_spider',
    'requests_for_permissions_spider',
    'bots_with_botflag_spider',
    'extension_bots_spider',
    'bots_without_botflag_spider',
    'bots_with_requests_for_permissions_spider',
]

db.reset()

# run all spiders
for spider in spiders:
    exit_code = subprocess.call(['scrapy', 'runspider', "src/%s.py" % spider])

    # Keep going as before, but leave a trace when a spider failed instead of
    # discarding its exit code.
    if exit_code != 0:
        logging.warning('spider %s exited with code %s', spider, exit_code)

bp.parse()
bc.create()

db.migrate()
Esempio n. 25
0
 def __init__(self, **kwargs):
     """Initialise the parent class and reset the database.

     Args:
         **kwargs: Passed through unchanged to the parent initialiser.
     """
     super().__init__(**kwargs)
     # NOTE(review): db.reset() runs on every instantiation; it presumably
     # discards previously stored data — confirm this is intended.
     db.reset()
 def retrieve_bots(sql):
     """Return the first column of every row of ``sql`` and the row count.

     Args:
         sql: Query passed to ``db.execute``.

     Returns:
         A ``(names, count)`` tuple, where ``names`` is the list of
         first-column values and ``count`` is its length.
     """
     names = []
     for row in db.execute(sql):
         names.append(row[0])
     return names, len(names)
Esempio n. 27
0
    def create(cls):
        """Fill the ``bots`` table with the user data of every known bot.

        Bot names are taken from the first row of each CSV file listed in
        ``cls.FILES``. Their user data is requested from ``api.users`` in
        batches, and every bot that is not yet in the ``bots`` table is
        inserted. Bots reported as ``invalid`` or ``missing`` are stored with
        their name, retrieval time and flags only.
        """
        # Number of names sent per api.users call (presumably the API's
        # per-request limit — confirm before changing).
        batch_size = 50

        def first_row(path):
            # Only the first CSV row is used; an empty file yields no names
            # instead of raising IndexError.
            with open(path, newline='') as f:
                return next(csv.reader(f), [])

        bots = set()
        for file in cls.FILES:
            bots.update(first_row(file))

        # Sets, because membership is tested once per bot below.
        bots_with_botflag = set(
            first_row('data/spiders/bots_with_botflag.csv'))
        bots_without_botflag = set(
            first_row('data/spiders/bots_without_botflag.csv'))
        extension_bots = set(first_row('data/spiders/extension_bots.csv'))

        # Stepping by batch_size never produces an empty batch, so api.users
        # is not called with no names when the number of bots is a multiple
        # of the batch size (or zero).
        names = sorted(bots)
        batches = [
            names[start:start + batch_size]
            for start in range(0, len(names), batch_size)
        ]

        for batch in batches:

            for bot in api.users(batch)['query']['users']:

                if db.exists('bots', 'name', bot['name']):
                    continue

                bot['retrieved_at'] = datetime.datetime.now(
                    datetime.timezone.utc).strftime("%Y-%m-%d %H:%M:%S")

                # 1 = has the flag, 0 = explicitly listed without it,
                # None = appears in neither list.
                if bot['name'] in bots_with_botflag:
                    bot['has_botflag'] = 1
                elif bot['name'] in bots_without_botflag:
                    bot['has_botflag'] = 0
                else:
                    bot['has_botflag'] = None

                bot['is_extension_bot'] = (
                    1 if bot['name'] in extension_bots else 0)

                if 'invalid' in bot or 'missing' in bot:
                    # Such a record is stored with these four columns only.
                    db.insert('bots', {
                        key: bot[key]
                        for key in ('name', 'retrieved_at', 'has_botflag',
                                    'is_extension_bot')
                    })
                    continue

                bot.pop('blockinfo', None)

                # List values are stored as comma-separated strings.
                for key in ('groups', 'implicitgroups', 'rights'):
                    bot[key] = ','.join(bot[key])

                # Block details are only present for blocked users; make sure
                # every row has all block columns.
                for key in ('blockid', 'blockedby', 'blockedbyid',
                            'blockedtimestamp', 'blockreason', 'blockexpiry'):
                    bot.setdefault(key, None)

                db.insert('bots', bot)


# BotsTableCreator.create()
 def get_max_value(cls, sql):
     """Return the first column of the first row of ``sql + cls.SQL_MAX``.

     Args:
         sql: Query to which ``cls.SQL_MAX`` is appended.

     Raises:
         IndexError: If the query returns no rows.
     """
     first_columns = []
     for row in db.execute(sql + cls.SQL_MAX):
         first_columns.append(row[0])
     return first_columns[0]