Example 1
GlobalVars.standby_message = "[ " + GlobalVars.chatmessage_prefix + " ] " \
                             "SmokeDetector started in [standby mode](" + \
                             "https://charcoal-se.org/smokey/SmokeDetector-Statuses#standby-mode) " + \
                             "at [rev " + \
                             GlobalVars.commit_with_author + \
                             "](" + GlobalVars.bot_repository + "/commit/" + \
                             GlobalVars.commit['id'] + \
                             ") (running on " + \
                             GlobalVars.location + \
                             ")"

GlobalVars.standby_mode = "standby" in sys.argv

chatcommunicate.init(username, password)
Tasks.periodic(Metasmoke.send_status_ping, interval=60)
Tasks.periodic(Metasmoke.check_last_pingtime, interval=30)

if GlobalVars.standby_mode:
    chatcommunicate.tell_rooms_with("debug", GlobalVars.standby_message)
    Metasmoke.send_status_ping()

    while GlobalVars.standby_mode:
        time.sleep(3)

    chatcommunicate.join_command_rooms()


# noinspection PyProtectedMember
def check_socket_connections():
    for client in chatcommunicate._clients.values():
        if client.last_activity and (datetime.utcnow() - client.last_activity).total_seconds() >= 60:
            os._exit(10)
Example 2
# We need an instance of BodyFetcher before load_files() is called
GlobalVars.bodyfetcher = BodyFetcher()
if GlobalVars.flovis_host:
    GlobalVars.flovis = Flovis(GlobalVars.flovis_host)

load_files()
load_ms_cache_data()
filter_auto_ignored_posts()


GlobalVars.standby_mode = "standby" in sys.argv
GlobalVars.no_se_activity_scan = 'no_se_activity_scan' in sys.argv

chatcommunicate.init(username, password)
Tasks.periodic(Metasmoke.send_status_ping_and_verify_scanning_if_active, interval=60)

if GlobalVars.standby_mode:
    chatcommunicate.tell_rooms_with("debug", GlobalVars.standby_message)
    Metasmoke.send_status_ping()

    while GlobalVars.standby_mode:
        time.sleep(3)

    chatcommunicate.join_command_rooms()


# noinspection PyProtectedMember
def check_socket_connections():
    for client in chatcommunicate._clients.values():
        if client.last_activity and (datetime.utcnow() - client.last_activity).total_seconds() >= 60:
            os._exit(10)
Example 3
class BodyFetcher:
    queue = {}
    previous_max_ids = {}

    # special_cases gives the minimum number of posts, for each of the specified sites, which
    # need to be in the queue prior to fetching posts.
    # The number of questions we fetch each day per site is the total of
    #   new questions + new answers + edits.
    # Stack Overflow is handled specially. It's known that some SO edits/new posts don't
    #   appear on the WebSocket.
    # Queue depths were last comprehensively adjusted on 2015-12-30.
    special_cases = {
        # 2020-11-02:
        # Request numbers before 2020-11-02 are very low due to a now-fixed bug.
        #
        #                                                                pre                   sum of requests
        #                                               questions    2020-11-02   2020-02-19    2020-10-28 to
        #                                                per day       setting     requests      2020-11-02
        # "stackoverflow.com": 3,                   # _  6,816            3          360            4,365
        # "math.stackexchange.com": 2,              # _    596            1          473            6,346
        # "ru.stackoverflow.com": 2,                # _    230           10           13              145
        # "askubuntu.com": ,                        # _    140            1           88            1,199
        # "es.stackoverflow.com": 2,                # _    138            5           25              225
        # "superuser.com": ,                        # _    122            1           87            1,038
        # "physics.stackexchange.com": ,            # _     90            1           76            1,161
        # "stats.stackexchange.com": 2,             # _     82            5           16              151
        # "pt.stackoverflow.com": 2,                # _     73           10            7               75
        # "unix.stackexchange.com": ,               # _     72            1           76              772
        # "electronics.stackexchange.com": ,        # _     69            1           46              723
        # "serverfault.com": ,                      # _     62            1           43              582
        # "tex.stackexchange.com": 2,               # _     60            5            8               98
        # "blender.stackexchange.com": 2,           # _     59            5            8               85
        # "salesforce.stackexchange.com": ,         # _     49            1           47              472
        # "gis.stackexchange.com": 2,               # _     46            3           15              166
        # "mathoverflow.net" (time_sensitive)       # _     37            -           33              511
        # "english.stackexchange.com": ,            # _     36            1           34              382
        # "magento.stackexchange.com": 2,           # _     34            3            5               93
        # "ell.stackexchange.com": ,                # _     33            1           24              365
        # "wordpress.stackexchange.com": ,          # _     29            1           30              283
        # "apple.stackexchange.com": ,              # _     29            1           46              294
        # "diy.stackexchange.com": ,                # _     26            1           24              306
        # "mathematica.stackexchange.com": ,        # _     25            1           21              384
        # "dba.stackexchange.com": ,                # _     23            1           31              343
        # "datascience.stackexchange.com": ,        # _     21            1           17              220
        # "chemistry.stackexchange.com": ,          # _     20            1           20              140
        # "security.stackexchange.com": ,           # _     18            1           15              238
        # "codereview.stackexchange.com": ,         # _     18            5            2               39
        #  The only reason this is the cut-off is that it was the last in the existing list
        #    as of 2020-11-01.
    }

    time_sensitive = [
        "security.stackexchange.com", "movies.stackexchange.com",
        "mathoverflow.net", "gaming.stackexchange.com",
        "webmasters.stackexchange.com", "arduino.stackexchange.com",
        "workplace.stackexchange.com"
    ]

    threshold = 1

    last_activity_date = 0
    last_activity_date_lock = threading.Lock()
    ACTIVITY_DATE_EXTRA_EARLIER_MS_TO_FETCH = 6 * 60 * 1000  # 6 minutes in milliseconds; is beyond edit grace period

    api_data_lock = threading.Lock()
    queue_lock = threading.Lock()
    max_ids_modify_lock = threading.Lock()

    RECENTLY_SCANNED_POSTS_EXPIRE_INTERVAL = 10 * 60  # 10 minutes
    Tasks.periodic(expire_recently_scanned_posts,
                   interval=RECENTLY_SCANNED_POSTS_EXPIRE_INTERVAL)

    def add_to_queue(self, hostname, question_id, should_check_site=False):
        # For the Sandbox questions on MSE, we choose to ignore the entire question and all answers.
        ignored_mse_questions = [
            3122,  # Formatting Sandbox
            51812,  # The API sandbox
            296077,  # Sandbox archive
        ]
        if question_id in ignored_mse_questions and hostname == "meta.stackexchange.com":
            return  # don't check meta sandbox, it's full of weird posts

        with self.queue_lock:
            if hostname not in self.queue:
                self.queue[hostname] = {}

            # Something about how the queue is being filled is storing post IDs in a list.
            # So, if we get here we need to make sure that the correct types are passed.
            #
            # If the item in self.queue[hostname] is a dict, do nothing.
            # If the item in self.queue[hostname] is a list or a tuple, convert it to a dict and
            # replace the list or tuple with that dict.
            # If the item in self.queue[hostname] is neither a dict, a list, nor a tuple, then explode.
            if type(self.queue[hostname]) is dict:
                pass
            elif type(self.queue[hostname]) in [list, tuple]:
                post_list_dict = {}
                for post_list_id in self.queue[hostname]:
                    post_list_dict[str(post_list_id)] = None
                self.queue[hostname] = post_list_dict
            else:
                raise TypeError(
                    "A non-iterable is in the queue item for a given site, this will cause errors!"
                )

            # This line only works if we are using a dict in the self.queue[hostname] object, which we should be with
            # the previous conversion code.
            self.queue[hostname][str(question_id)] = datetime.utcnow()
            flovis_dict = None
            if GlobalVars.flovis is not None:
                flovis_dict = {
                    sk: list(sq.keys())
                    for sk, sq in self.queue.items()
                }

        if flovis_dict is not None:
            GlobalVars.flovis.stage('bodyfetcher/enqueued', hostname,
                                    question_id, flovis_dict)

        if should_check_site:
            self.make_api_call_for_site(hostname)
        else:
            self.check_queue()
        return
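
    # A usage sketch (hypothetical IDs): each WebSocket event enqueues one
    # question; check_queue() then decides whether the per-site queue depth
    # justifies an API call.
    #
    #   fetcher = BodyFetcher()
    #   fetcher.add_to_queue("math.stackexchange.com", 4321098)
    #   # Bypass the depth check and fetch immediately:
    #   fetcher.add_to_queue("stackoverflow.com", 65432109, should_check_site=True)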

    def check_queue(self):
        # This should be called once in a new Thread every time we add an entry to the queue. Thus, we
        # should only need to process a single queue entry in order to keep the queue from containing
        # entries which are qualified for processing, but which haven't been processed. However, that
        # doesn't account for the possibility of things going wrong and/or implementing some other
        # way to qualify other than the depth of the queue for a particular site (e.g. time in queue).

        # We use a copy of the queue in order to allow the queue to be changed in other threads.
        # This is OK, because self.make_api_call_for_site(site) verifies that the site
        # is still in the queue.
        sites_to_handle = []
        is_time_sensitive_time = datetime.utcnow().hour in range(4, 12)
        with self.queue_lock:
            sites_in_queue = {
                site: len(values)
                for site, values in self.queue.items()
            }
        # Get sites listed in special cases and as time_sensitive
        for site, length in sites_in_queue.items():
            if site in self.special_cases:
                if length >= self.special_cases[site]:
                    sites_to_handle.append(site)
                    continue
            if is_time_sensitive_time and site in self.time_sensitive and length >= 1:
                sites_to_handle.append(site)

        # Remove the sites which we've handled from our copy of the queue.
        for site in sites_to_handle:
            sites_in_queue.pop(site, None)

        # Also queue for handling any site without a special case whose depth has
        # reached the generic threshold.
        for site, length in sites_in_queue.items():
            if site not in self.special_cases and length >= self.threshold:
                sites_to_handle.append(site)

        for site in sites_to_handle:
            self.make_api_call_for_site(site)

        if not sites_to_handle:
            # We're not making an API request, so explicitly store the queue.
            Tasks.do(store_bodyfetcher_queue)

    def print_queue(self):
        return '\n'.join([
            "{0}: {1}".format(key, str(len(values)))
            for (key, values) in self.queue.items()
        ])
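
    # For example (hypothetical queue state), print_queue() returns one
    # "site: depth" pair per line, e.g.:
    #   stackoverflow.com: 3
    #   math.stackexchange.com: 1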

    def make_api_call_for_site(self, site):
        with self.queue_lock:
            new_posts = self.queue.pop(site, None)
        if new_posts is None:
            # site was not in the queue
            return
        Tasks.do(store_bodyfetcher_queue)

        new_post_ids = [int(k) for k in new_posts.keys()]

        if GlobalVars.flovis is not None:
            for post_id in new_post_ids:
                GlobalVars.flovis.stage('bodyfetcher/api_request', site,
                                        post_id, {
                                            'site': site,
                                            'posts': list(new_posts.keys())
                                        })

        # Add queue timing data
        pop_time = datetime.utcnow()
        post_add_times = [(pop_time - v).total_seconds()
                          for k, v in new_posts.items()]
        Tasks.do(add_queue_timing_data, site, post_add_times)

        store_max_ids = False
        with self.max_ids_modify_lock:
            if site in self.previous_max_ids and max(
                    new_post_ids) > self.previous_max_ids[site]:
                previous_max_id = self.previous_max_ids[site]
                intermediate_posts = range(previous_max_id + 1,
                                           max(new_post_ids))

                # We don't want to go over the 100-post API cutoff, so take the last
                # (100-len(new_post_ids)) from intermediate_posts

                intermediate_posts = intermediate_posts[-(100 - len(new_post_ids)):]

                # new_post_ids could contain edited posts, so merge it back in
                combined = chain(intermediate_posts, new_post_ids)

                # Could be duplicates, so uniquify
                posts = list(set(combined))
            else:
                posts = new_post_ids

            new_post_ids_max = max(new_post_ids)
            if new_post_ids_max > self.previous_max_ids.get(site, 0):
                self.previous_max_ids[site] = new_post_ids_max
                store_max_ids = True

        if store_max_ids:
            schedule_store_bodyfetcher_max_ids()

        log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
        if len(new_post_ids) > 30:
            log('debug', "{} +{} more".format(sorted(new_post_ids)[:30], len(new_post_ids) - 30))
        else:
            log('debug', sorted(new_post_ids))
        if len(new_post_ids) == len(posts):
            log('debug', "[ *Identical* ]")
        elif len(posts) > 30:
            log('debug', "{} +{} more".format(sorted(posts)[:30], len(posts) - 30))
        else:
            log('debug', sorted(posts))

        question_modifier = ""
        pagesize_modifier = {}

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            with self.last_activity_date_lock:
                if self.last_activity_date != 0:
                    pagesize = "100"
                else:
                    pagesize = "50"

                pagesize_modifier = {
                    'pagesize': pagesize,
                    'min': str(self.last_activity_date - self.ACTIVITY_DATE_EXTRA_EARLIER_MS_TO_FETCH)
                }
        else:
            question_modifier = "/{0}".format(";".join(
                [str(post) for post in posts]))

        url = "https://api.stackexchange.com/2.2/questions{}".format(
            question_modifier)
        params = {
            'filter':
            '!1rs)sUKylwB)8isvCRk.xNu71LnaxjnPS12*pX*CEOKbPFwVFdHNxiMa7GIVgzDAwMa',
            'key': 'IAkbitmze4B8KpacUfLqkw((',
            'site': site
        }
        params.update(pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        with GlobalVars.api_request_lock:
            # Respect backoff, if we were given one
            if GlobalVars.api_backoff_time > time.time():
                time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
            try:
                time_request_made = datetime.utcnow().strftime('%H:%M:%S')
                response = requests.get(url, params=params, timeout=20).json()
            except Exception:
                # Any failure in the request (timeout, connection error, or otherwise)
                # means the posts should be added back to the queue.
                with self.queue_lock:
                    if site in self.queue:
                        self.queue[site].update(new_posts)
                    else:
                        self.queue[site] = new_posts
                return

            with self.api_data_lock:
                add_or_update_api_data(site)

            message_hq = ""
            with GlobalVars.apiquota_rw_lock:
                if "quota_remaining" in response:
                    quota_remaining = response["quota_remaining"]
                    if quota_remaining - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0 \
                            and quota_remaining > 39980:
                        tell_rooms_with(
                            "debug",
                            "API quota rolled over with {0} requests remaining. "
                            "Current quota: {1}.".format(GlobalVars.apiquota, quota_remaining))

                        sorted_calls_per_site = sorted(
                            GlobalVars.api_calls_per_site.items(),
                            key=itemgetter(1),
                            reverse=True)
                        api_quota_used_per_site = ""
                        for site_name, quota_used in sorted_calls_per_site:
                            sanitized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                            api_quota_used_per_site += "{0}: {1}\n".format(sanitized_site_name, quota_used)
                        api_quota_used_per_site = api_quota_used_per_site.strip()

                        tell_rooms_with("debug", api_quota_used_per_site)
                        clear_api_data()
                    if quota_remaining == 0:
                        tell_rooms_with("debug", "API reports no quota left!  May be a glitch.")
                        tell_rooms_with("debug", str(response))  # No code format for now?
                    if GlobalVars.apiquota == -1:
                        tell_rooms_with("debug", "Restart: API quota is {quota}.".format(quota=quota_remaining))
                    GlobalVars.apiquota = quota_remaining
                else:
                    message_hq = "The quota_remaining property was not in the API response."

            if "error_message" in response:
                message_hq += " Error: {} at {} UTC.".format(
                    response["error_message"], time_request_made)
                if "error_id" in response and response["error_id"] == 502:
                    if GlobalVars.api_backoff_time < time.time(
                    ) + 12:  # Add a backoff of 10 + 2 seconds as a default
                        GlobalVars.api_backoff_time = time.time() + 12
                message_hq += " Backing off on requests for the next 12 seconds."
                message_hq += " Previous URL: `{}`".format(url)

            if "backoff" in response:
                if GlobalVars.api_backoff_time < time.time(
                ) + response["backoff"]:
                    GlobalVars.api_backoff_time = time.time(
                    ) + response["backoff"]

        if len(message_hq) > 0 and "site is required" not in message_hq:
            message_hq = message_hq.strip()
            if len(message_hq) > 500:
                message_hq = "\n" + message_hq
            tell_rooms_with("debug", message_hq)

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                with self.last_activity_date_lock:
                    self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            if GlobalVars.flovis is not None:
                pnb = copy.deepcopy(post)
                if 'body' in pnb:
                    pnb['body'] = 'Present, but truncated'
                if 'answers' in pnb:
                    del pnb['answers']

            if "title" not in post or "body" not in post:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage(
                        'bodyfetcher/api_response/no_content', site,
                        post['question_id'], pnb)
                continue

            post['site'] = site
            try:
                post['edited'] = (post['creation_date'] != post['last_edit_date'])
            except KeyError:
                post['edited'] = False  # last_edit_date not present = not edited

            question_doesnt_need_scan = is_post_recently_scanned_and_unchanged(post)
            add_recently_scanned_post(post)
            if not question_doesnt_need_scan:
                try:
                    post_ = Post(api_response=post)
                except PostParseError as err:
                    # Log the raw API dict; post_ is unbound here when Post() raised.
                    log('error', 'Error {0} when parsing post: {1!r}'.format(err, post))
                    if GlobalVars.flovis is not None and 'question_id' in post:
                        GlobalVars.flovis.stage(
                            'bodyfetcher/api_response/error', site,
                            post['question_id'], pnb)
                    continue

                num_scanned += 1

                is_spam, reason, why = check_if_spam(post_)

                if is_spam:
                    try:
                        if GlobalVars.flovis is not None and 'question_id' in post:
                            GlobalVars.flovis.stage(
                                'bodyfetcher/api_response/spam', site,
                                post['question_id'], {
                                    'post': pnb,
                                    'check_if_spam': [is_spam, reason, why]
                                })
                        handle_spam(post=post_, reasons=reason, why=why)
                    except Exception as e:
                        log('error', "Exception in handle_spam:", e)
                elif GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage(
                        'bodyfetcher/api_response/not_spam', site,
                        post['question_id'], {
                            'post': pnb,
                            'check_if_spam': [is_spam, reason, why]
                        })

            try:
                if "answers" not in post:
                    pass
                else:
                    for answer in post["answers"]:
                        if GlobalVars.flovis is not None:
                            anb = copy.deepcopy(answer)
                            if 'body' in anb:
                                anb['body'] = 'Present, but truncated'

                        num_scanned += 1
                        answer["IsAnswer"] = True  # Necessary for Post object
                        answer["title"] = ""  # Necessary for proper Post object creation
                        answer["site"] = site  # Necessary for proper Post object creation
                        try:
                            answer['edited'] = (answer['creation_date'] != answer['last_edit_date'])
                        except KeyError:
                            answer['edited'] = False  # last_edit_date not present = not edited
                        answer_doesnt_need_scan = is_post_recently_scanned_and_unchanged(answer)
                        add_recently_scanned_post(answer)
                        if answer_doesnt_need_scan:
                            continue
                        answer_ = Post(api_response=answer, parent=post_)

                        is_spam, reason, why = check_if_spam(answer_)
                        if is_spam:
                            try:
                                if GlobalVars.flovis is not None and 'answer_id' in answer:
                                    GlobalVars.flovis.stage(
                                        'bodyfetcher/api_response/spam', site,
                                        answer['answer_id'], {
                                            'post': anb,
                                            'check_if_spam': [is_spam, reason, why]
                                        })
                                handle_spam(answer_, reasons=reason, why=why)
                            except Exception as e:
                                log('error', "Exception in handle_spam:", e)
                        elif GlobalVars.flovis is not None and 'answer_id' in answer:
                            GlobalVars.flovis.stage(
                                'bodyfetcher/api_response/not_spam', site,
                                answer['answer_id'], {
                                    'post': anb,
                                    'check_if_spam': [is_spam, reason, why]
                                })

            except Exception as e:
                log('error', "Exception handling answers:", e)

        end_time = time.time()
        scan_time = end_time - start_time
        GlobalVars.PostScanStat.add_stat(num_scanned, scan_time)
        return
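
# A standalone sketch (hypothetical IDs) of the intermediate-ID gap fill in
# make_api_call_for_site above: IDs between the previously seen maximum and the
# newest ID are fetched too, capped at the API's 100-posts-per-request limit.
from itertools import chain

new_post_ids = [100105, 100110]  # IDs received via the queue
previous_max_id = 100100         # highest ID seen in the previous call
intermediate_posts = range(previous_max_id + 1, max(new_post_ids))
# Keep at most (100 - len(new_post_ids)) of the newest intermediate IDs.
intermediate_posts = intermediate_posts[-(100 - len(new_post_ids)):]
# new_post_ids may contain edited (already-seen) posts, so merge and uniquify.
posts = list(set(chain(intermediate_posts, new_post_ids)))
# posts now covers 100101..100110 with no duplicates.
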
Example 4
GlobalVars.standby_message = "[ " + GlobalVars.chatmessage_prefix + " ] " \
                             "SmokeDetector started in [standby mode](" + \
                             "https://charcoal-se.org/smokey/SmokeDetector-Statuses#standby-mode) " + \
                             "at [rev " + \
                             GlobalVars.commit_with_author + \
                             "](" + GlobalVars.bot_repository + "/commit/" + \
                             GlobalVars.commit['id'] + \
                             ") (running on " + \
                             GlobalVars.location + \
                             ")"

GlobalVars.standby_mode = "standby" in sys.argv

chatcommunicate.init(username, password)
Tasks.periodic(Metasmoke.send_status_ping, interval=60)
Tasks.periodic(Metasmoke.check_last_pingtime, interval=30)

if GlobalVars.standby_mode:
    chatcommunicate.tell_rooms_with("debug", GlobalVars.standby_message)
    Metasmoke.send_status_ping()

    while GlobalVars.standby_mode:
        time.sleep(3)

    chatcommunicate.init(username, password)  # to rejoin rooms


# noinspection PyProtectedMember
def check_socket_connections():
    for client in chatcommunicate._clients.values():
        if client.last_activity and (datetime.utcnow() - client.last_activity).total_seconds() >= 60:
            os._exit(10)
Example 5

# noinspection PyProtectedMember
def check_socket_connections():
    for client in chatcommunicate._clients.values():
        if client.last_activity and (datetime.utcnow() - client.last_activity).total_seconds() >= 60:
            os._exit(10)


# noinspection PyProtectedMember
def restart_automatically():
    os._exit(5)


Tasks.periodic(check_socket_connections, interval=90)
Tasks.later(restart_automatically, after=21600)

log('info', '{} active'.format(GlobalVars.location))
log('info', 'MS host: {}'.format(GlobalVars.metasmoke_host))


def setup_websocket(attempt, max_attempts):
    try:
        ws = websocket.create_connection("wss://qa.sockets.stackexchange.com/")
        ws.send("155-questions-active")
        return ws
    except Exception:
        log('warning',
            'WS failed to create websocket connection. Attempt {} of {}.'.format(attempt, max_attempts))
        return None
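

# A hedged usage sketch (hypothetical; the real call site is not shown in this
# snippet, and the module's existing time import is assumed): retry the
# connection a bounded number of times before giving up.
max_attempts = 10
ws = None
for attempt in range(1, max_attempts + 1):
    ws = setup_websocket(attempt, max_attempts)
    if ws is not None:
        break
    time.sleep(attempt)  # simple linear backoff between attempts
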
Example 6
        if my_lock:
            GlobalVars.recently_scanned_posts_lock.release()


def expire_posts():
    min_retained_timestamp = time.time() - GlobalVars.recently_scanned_posts_retention_time
    with GlobalVars.recently_scanned_posts_lock:
        # A dict comprehension could do this:
        #   GlobalVars.recently_scanned_posts = {key: value for key, value in GlobalVars.recently_scanned_posts.items()
        #                                        if value['scan_timestamp'] > min_retained_timestamp}
        # But that has a notably higher memory requirement than deleting the entries in place.
        # The right trade-off between higher memory use and the potentially longer del/pop time isn't
        # clear, and will depend on the size of the dict and the memory/CPU available to the
        # particular SD instance.
        rs_posts = GlobalVars.recently_scanned_posts
        original_length = len(rs_posts)
        keys_to_delete = [
            key for key, value in rs_posts.items()
            if value['scan_timestamp'] < min_retained_timestamp
        ]
        for key in keys_to_delete:
            rs_posts.pop(key, None)
        new_length = len(rs_posts)
        log('debug',
            'Expire recently scanned posts: start: {}::  now: {}:: expired: {}'.format(
                original_length, new_length, original_length - new_length))


Tasks.periodic(expire_posts, interval=POSTS_EXPIRE_INTERVAL)
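

# A toy, self-contained illustration of the del-based expiry above
# (hypothetical data; the real entries live in GlobalVars.recently_scanned_posts):
#
#   posts = {'site/q/1': {'scan_timestamp': 100.0},
#            'site/q/2': {'scan_timestamp': 900.0}}
#   cutoff = 500.0
#   for key in [k for k, v in posts.items() if v['scan_timestamp'] < cutoff]:
#       posts.pop(key, None)
#   # posts now retains only 'site/q/2'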