def test_whitelisted_user():
    user_url = 'http://stackoverflow.com/users/2/geoff-dalgas'
    user = get_user_from_url(user_url)
    add_whitelisted_user(user)
    user_url2 = 'http://stackoverflow.com/users/0/test'
    user2 = get_user_from_url(user_url2)
    add_whitelisted_user(user2)
    post = Post(api_response={'title': '', 'body': '',
                              'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is False
    post = Post(api_response={'title': 'baba ji', 'body': '',
                              'owner': {'display_name': '', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '2', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True
    post = Post(api_response={'title': 'baba ji', 'body': '',
                              'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '3', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True
    post = Post(api_response={'title': 'test', 'body': '',
                              'owner': {'display_name': 'baba ji - muscle building',
                                        'reputation': 1, 'link': user_url2},
                              'site': 'stackoverflow.com', 'question_id': '0', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is False
    # cleanup
    os.remove("whitelistedUsers.p")
def test_whitelisted_user():
    user_url = 'http://stackoverflow.com/users/2/geoff-dalgas'
    user = get_user_from_url(user_url)
    add_whitelisted_user(user)
    user_url2 = 'http://stackoverflow.com/users/0/test'
    user2 = get_user_from_url(user_url2)
    add_whitelisted_user(user2)
    is_spam, reason, _ = check_if_spam("", "", "bagprada", user_url,
                                       "stackoverflow.com", "1", False, False,
                                       1, 0)
    assert is_spam is False
    is_spam, reason, _ = check_if_spam("baba ji", "", "", user_url,
                                       "stackoverflow.com", "2", False, False,
                                       1, 0)
    assert is_spam is True
    is_spam, reason, _ = check_if_spam("baba ji", "", "bagprada", user_url,
                                       "stackoverflow.com", "3", False, False,
                                       1, 0)
    assert is_spam is True
    is_spam, reason, _ = check_if_spam("test", "", "baba ji - muscle building",
                                       user_url2, "stackoverflow.com", "0",
                                       False, False, 1, 0)
    assert is_spam is False
    # cleanup
    os.remove("whitelistedUsers.txt")
def test_whitelisted_user():
    user_url = 'http://stackoverflow.com/users/2/geoff-dalgas'
    user = get_user_from_url(user_url)
    add_whitelisted_user(user)
    user_url2 = 'http://stackoverflow.com/users/0/test'
    user2 = get_user_from_url(user_url2)
    add_whitelisted_user(user2)
    post = Post(api_response={'title': '', 'body': '',
                              'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is False
    post = Post(api_response={'title': 'baba ji', 'body': '',
                              'owner': {'display_name': '', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '2', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True
    post = Post(api_response={'title': 'baba ji', 'body': '',
                              'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '3', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True
    post = Post(api_response={'title': 'test', 'body': '',
                              'owner': {'display_name': 'baba ji - muscle building',
                                        'reputation': 1, 'link': user_url2},
                              'site': 'stackoverflow.com', 'question_id': '0', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is False
    # cleanup
    _remove_pickle("whitelistedUsers.p")
Example no. 4
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        GlobalVars.apiquota = response["quota_remaining"]

        for post in response["items"]:
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False)
            if owner_rep <= 50 and is_spam:
                try:
                    handle_spam(title, owner_name, site, link, owner_link, q_id, reason, False)
                except:
                    print "NOP"
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    is_spam, reason = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False)
                    if owner_rep <= 50 and is_spam:
                        try:
                            handle_spam(title, owner_name, site, link, owner_link, a_id, reason, True)
                        except:
                            print "NOP"
            except:
                print "no answers"
        return
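The bare `except:` blocks above default the owner fields when the API omits them, but they also swallow unrelated errors. A minimal sketch of the same defaulting done with `dict.get`, assuming the usual API shape where `owner` is an optional sub-object (`extract_owner` is a hypothetical name, not part of the codebase):

def extract_owner(item):
    # Hypothetical helper: default missing owner fields explicitly
    # instead of catching every exception.
    owner = item.get("owner") or {}
    return (owner.get("display_name", ""),
            owner.get("link", ""),
            owner.get("reputation", 0))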
Example no. 5
def test_whitelisted_user():
    user_url = 'http://stackoverflow.com/users/2/geoff-dalgas'
    user = get_user_from_url(user_url)
    add_whitelisted_user(user)
    is_spam, reason = check_if_spam("", "", "bagprada", user_url, "stackoverflow.com", "1", False, False)
    assert is_spam is False
    is_spam, reason = check_if_spam("baba ji", "", "", user_url, "stackoverflow.com", "2", False, False)
    assert is_spam is True
    is_spam, reason = check_if_spam("baba ji", "", "bagprada", user_url, "stackoverflow.com", "3", False, False)
    assert is_spam is True
    # cleanup
    os.remove("whitelistedUsers.txt")
def test_blacklisted_user():
    user_url = 'http://stackoverflow.com/users/1/jeff-atwood'
    user = get_user_from_url(user_url)
    add_blacklisted_user(user, "", "")
    is_spam, reason, _ = check_if_spam("", "", "", user_url, "stackoverflow.com", "1", False, False, 1, 0)
    assert is_spam is True
    # cleanup
    os.remove("blacklistedUsers.txt")
def test_whitelisted_user():
    user_url = 'http://stackoverflow.com/users/2/geoff-dalgas'
    user = get_user_from_url(user_url)
    add_whitelisted_user(user)
    user_url2 = 'http://stackoverflow.com/users/0/test'
    user2 = get_user_from_url(user_url2)
    add_whitelisted_user(user2)
    is_spam, reason, _ = check_if_spam("", "", "bagprada", user_url, "stackoverflow.com", "1", False, False, 1, 0)
    assert is_spam is False
    is_spam, reason, _ = check_if_spam("baba ji", "", "", user_url, "stackoverflow.com", "2", False, False, 1, 0)
    assert is_spam is True
    is_spam, reason, _ = check_if_spam("baba ji", "", "bagprada", user_url, "stackoverflow.com", "3", False, False, 1, 0)
    assert is_spam is True
    is_spam, reason, _ = check_if_spam("test", "", "baba ji - muscle building", user_url2, "stackoverflow.com", "0", False, False, 1, 0)
    assert is_spam is False
    # cleanup
    os.remove("whitelistedUsers.txt")
Example no. 9
def test_check_if_spam(title, body, username, site, match):
    # We can't check blacklists/whitelists in tests, so these are set to their default values
    user_url = ""
    post_id = 0
    # If we want to test answers separately, this should be changed
    is_answer = False
    is_spam, reason, _ = check_if_spam(title, body, username, user_url, site, post_id, is_answer, False, 1, 0)
    print(title)
    assert match == is_spam
    def test_handle_spam_repeating_characters(cls):
        post = cls.mock_post(title='aaaaaaaaaaaaaa')
        is_spam, reasons, why = check_if_spam(post)
        handle_spam(post=post, reasons=reasons, why=why)
        chatcommunicate.tell_rooms.assert_called_once_with(
            Matcher(containing='aaaaaaaaaaaaaa', without='Potentially offensive title'),
            ANY,
            ANY,
            notify_site=ANY,
            report_data=ANY
        )
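`Matcher` here makes `assert_called_once_with` match on substrings instead of exact equality; its implementation is not shown on this page. A minimal sketch of how such a helper could work, assuming it compares via `__eq__` against the string the mock actually received (an illustration, not the project's confirmed helper):

class Matcher:
    # Hypothetical substring matcher for unittest.mock assertions:
    # equal to any string containing `containing` and not containing `without`.
    def __init__(self, containing=None, without=None):
        self.containing = containing
        self.without = without

    def __eq__(self, other):
        if not isinstance(other, str):
            return False
        if self.containing is not None and self.containing not in other:
            return False
        return self.without is None or self.without not in other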
def test_blacklisted_user():
    user_url = 'http://stackoverflow.com/users/1/jeff-atwood'
    user = get_user_from_url(user_url)
    add_blacklisted_user(user, "", "")
    # Construct a "fake" post object in API-format
    post = Post(api_response={'title': '', 'body': '',
                              'owner': {'display_name': user, 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True
    # cleanup
    os.remove("blacklistedUsers.p")
def test_blacklisted_user():
    user_url = 'http://stackoverflow.com/users/1/jeff-atwood'
    user = get_user_from_url(user_url)
    add_blacklisted_user(user, "", "")
    # Construct a "fake" post object in API-format
    post = Post(api_response={'title': '', 'body': '',
                              'owner': {'display_name': user, 'reputation': 1, 'link': user_url},
                              'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
    is_spam, reason, _ = check_if_spam(post)
    assert is_spam is True
    # cleanup
    _remove_pickle("blacklistedUsers.p")
Example no. 14
def test_handle_spam_offensive_title():
    GlobalVars.deletion_watcher = MagicMock()  # Mock the deletion watcher in test
    chatcommunicate.tell_rooms = MagicMock()
    post = mock_post(title='f**k')
    is_spam, reasons, why = check_if_spam(post)
    handle_spam(post, reasons, why)
    chatcommunicate.tell_rooms.assert_called_once_with(
        StringMatcher(containing='Potentially offensive title', without='f**k'),
        ANY,
        ANY,
        notify_site=ANY,
        report_data=ANY)
    def test_handle_spam_offensive_title(cls):
        post = cls.mock_post(title='f**k')
        is_spam, reasons, why = check_if_spam(post)
        handle_spam(post=post, reasons=reasons, why=why)
        call_a = call(
            Matcher(containing='f**k', without='potentially offensive title'),
            ANY,
            Matcher(containing='offensive-mask', without='no-offensive-mask'),
            notify_site=ANY,
            report_data=ANY
        )
        call_b = call(
            Matcher(containing='potentially offensive title', without='f**k'),
            ANY,
            Matcher(containing='no-offensive-mask', without='offensive-mask'),
            notify_site=ANY,
            report_data=ANY
        )
        chatcommunicate.tell_rooms.assert_has_calls([call_a, call_b])
def test_check_if_spam(title, body, username, site, match):
    # We can't check blacklists/whitelists in tests, so these are set to their default values

    post_dict = {
        "titleEncodedFancy": str(title),
        "bodySummary": str(body),
        "ownerDisplayName": str(username),
        "url": "TEST: No URL passed!",
        "id": "TEST: No ID passed!",
        "siteBaseHostAddress": str(site),
        "ownerUrl": "TEST: No Owner ID passed!"
    }
    json_dict = {
        "action": "155-questions-active",
        'data': json.dumps(post_dict),
        'IsAnswer': False  # If we want to test answers separately, this should be changed.
    }
    json_data = json.dumps(json_dict)
    post = Post(json_data=json_data)
    is_spam, reason, _ = check_if_spam(post)
    assert match == is_spam
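`test_check_if_spam` receives its cases as arguments and asserts the expected verdict, so it is presumably driven by a parametrized test runner. A minimal sketch of how it could be wired up with pytest; the rows are illustrative placeholders, not the project's real test data:

import pytest

@pytest.mark.parametrize("title, body, username, site, match", [
    # Hypothetical rows: (title, body, username, site, expected is_spam)
    ("baba ji - muscle building", "", "spam user", "stackoverflow.com", True),
    ("How do I parse JSON?", "a plain question body", "regular user", "stackoverflow.com", False),
])
def test_check_if_spam(title, body, username, site, match):
    ...  # body as shown above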
Example no. 18
    def make_api_call_for_site(self, site):
        if site not in self.queue:
            return

        self.queue_modify_lock.acquire()
        new_posts = self.queue.pop(site)
        store_bodyfetcher_queue()
        self.queue_modify_lock.release()

        new_post_ids = [int(k) for k in new_posts.keys()]

        if GlobalVars.flovis is not None:
            for post_id in new_post_ids:
                GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                        {'site': site, 'posts': list(new_posts.keys())})

        self.queue_timing_modify_lock.acquire()
        post_add_times = [v for k, v in new_posts.items()]
        pop_time = datetime.utcnow()

        for add_time in post_add_times:
            try:
                seconds_in_queue = (pop_time - add_time).total_seconds()
                if site in self.queue_timings:
                    self.queue_timings[site].append(seconds_in_queue)
                else:
                    self.queue_timings[site] = [seconds_in_queue]
            except KeyError:  # XXX: Any other possible exception?
                continue  # Skip to next item if we've got invalid data or missing values.

        store_queue_timings()

        self.queue_timing_modify_lock.release()
        self.max_ids_modify_lock.acquire()

        if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
            previous_max_id = self.previous_max_ids[site]
            intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

            # We don't want to go over the 100-post API cutoff, so take the last
            # (100-len(new_post_ids)) from intermediate_posts

            intermediate_posts = intermediate_posts[-(100 - len(new_post_ids)):]

            # new_post_ids could contain edited posts, so merge it back in
            combined = chain(intermediate_posts, new_post_ids)

            # Could be duplicates, so uniquify
            posts = list(set(combined))
        else:
            posts = new_post_ids

        try:
            if max(new_post_ids) > self.previous_max_ids[site]:
                self.previous_max_ids[site] = max(new_post_ids)
                store_bodyfetcher_max_ids()
        except KeyError:
            self.previous_max_ids[site] = max(new_post_ids)
            store_bodyfetcher_max_ids()

        self.max_ids_modify_lock.release()

        log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
        if len(new_post_ids) > 30:
            log('debug', "{} +{} more".format(sorted(new_post_ids)[:30], len(new_post_ids) - 30))
        else:
            log('debug', sorted(new_post_ids))
        if len(new_post_ids) == len(posts):
            log('debug', "[ *Identical* ]")
        elif len(posts) > 30:
            log('debug', "{} +{} more".format(sorted(posts)[:30], len(posts) - 30))
        else:
            log('debug', sorted(posts))

        question_modifier = ""
        pagesize_modifier = ""

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            if self.last_activity_date != 0:
                pagesize = "50"
            else:
                pagesize = "25"

            pagesize_modifier = "&pagesize={pagesize}" \
                                "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
        else:
            question_modifier = "/{0}".format(";".join([str(post) for post in posts]))

        url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
              "&filter=!*xq08dCDNr)PlxxXfaN8ntivx(BPlY_8XASyXLX-J7F-)VK*Q3KTJVkvp*&key=IAkbitmze4B8KpacUfLqkw((" \
              "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                                  optional_min_query_param=pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        GlobalVars.api_request_lock.acquire()
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            time_request_made = datetime.utcnow().strftime('%H:%M:%S')
            response = requests.get(url, timeout=20).json()
        except (requests.exceptions.Timeout, requests.ConnectionError, Exception):
            # Any failure in the request being made (timeout or otherwise) should be added back to
            # the queue.
            self.queue_modify_lock.acquire()
            if site in self.queue:
                self.queue[site].update(new_posts)
            else:
                self.queue[site] = new_posts
            self.queue_modify_lock.release()
            GlobalVars.api_request_lock.release()
            return

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
                tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                         "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                      response["quota_remaining"]))

                sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    sanitized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                    api_quota_used_per_site += sanitized_site_name + ": {0}\n".format(str(quota_used))
                api_quota_used_per_site = api_quota_used_per_site.strip()

                tell_rooms_with("debug", api_quota_used_per_site)
                clear_api_data()
            if response["quota_remaining"] == 0:
                tell_rooms_with("debug", "API reports no quota left!  May be a glitch.")
                tell_rooms_with("debug", str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                tell_rooms_with("debug", "Restart: API quota is {quota}."
                                         .format(quota=response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
            if "error_id" in response and response["error_id"] == 502:
                if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                    GlobalVars.api_backoff_time = time.time() + 12
            message_hq += " Backing off on requests for the next 12 seconds."
            message_hq += " Previous URL: `{}`".format(url)

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]

        GlobalVars.api_request_lock.release()

        if len(message_hq) > 0:
            tell_rooms_with("debug", message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            pnb = copy.deepcopy(post)
            if 'body' in pnb:
                pnb['body'] = 'Present, but truncated'
            if 'answers' in pnb:
                del pnb['answers']

            if "title" not in post or "body" not in post:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], pnb)
                continue

            post['site'] = site
            try:
                post['edited'] = (post['creation_date'] != post['last_edit_date'])
            except KeyError:
                post['edited'] = False  # last_edit_date not present = not edited

            try:
                post_ = Post(api_response=post)
            except PostParseError as err:
                log('error', 'Error {0} when parsing post: {1!r}'.format(err, post))
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], pnb)
                continue

            num_scanned += 1

            is_spam, reason, why = check_if_spam(post_)

            if is_spam:
                try:
                    if GlobalVars.flovis is not None and 'question_id' in post:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                                {'post': pnb, 'check_if_spam': [is_spam, reason, why]})
                    handle_spam(post=post_,
                                reasons=reason,
                                why=why)
                except Exception as e:
                    log('error', "Exception in handle_spam:", e)
            elif GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                        {'post': pnb, 'check_if_spam': [is_spam, reason, why]})

            try:
                if "answers" not in post:
                    pass
                else:
                    for answer in post["answers"]:
                        anb = copy.deepcopy(answer)
                        if 'body' in anb:
                            anb['body'] = 'Present, but truncated'

                        num_scanned += 1
                        answer["IsAnswer"] = True  # Necesssary for Post object
                        answer["title"] = ""  # Necessary for proper Post object creation
                        answer["site"] = site  # Necessary for proper Post object creation
                        try:
                            answer['edited'] = (answer['creation_date'] != answer['last_edit_date'])
                        except KeyError:
                            answer['edited'] = False  # last_edit_date not present = not edited
                        answer_ = Post(api_response=answer, parent=post_)

                        is_spam, reason, why = check_if_spam(answer_)
                        if is_spam:
                            try:
                                if GlobalVars.flovis is not None and 'answer_id' in answer:
                                    GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, answer['answer_id'],
                                                            {'post': anb, 'check_if_spam': [is_spam, reason, why]})
                                handle_spam(answer_,
                                            reasons=reason,
                                            why=why)
                            except Exception as e:
                                log('error', "Exception in handle_spam:", e)
                        elif GlobalVars.flovis is not None and 'answer_id' in answer:
                            GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
                                                    {'post': anb, 'check_if_spam': [is_spam, reason, why]})

            except Exception as e:
                log('error', "Exception handling answers:", e)

        end_time = time.time()
        GlobalVars.posts_scan_stats_lock.acquire()
        GlobalVars.num_posts_scanned += num_scanned
        GlobalVars.post_scan_time += end_time - start_time
        GlobalVars.posts_scan_stats_lock.release()
        return
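The request path above follows one fixed pattern: sleep out any server-issued backoff, make the call, and push the popped posts back onto the queue if anything fails. A minimal sketch of that pattern in isolation, with hypothetical names standing in for the bot's globals and locks:

import time
import requests

def fetch_with_backoff(url, backoff_until, requeue):
    # Hypothetical distillation of the retry/requeue pattern above.
    # backoff_until: epoch seconds before which the API must not be called.
    # requeue: callback restoring the popped posts for a later attempt.
    if backoff_until > time.time():
        time.sleep(backoff_until - time.time() + 2)  # 2-second margin, as above
    try:
        return requests.get(url, timeout=20).json()
    except Exception:
        requeue()  # any failure puts the work back on the queue
        return None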
    def make_api_call_for_site(self, site):
        with self.queue_lock:
            new_posts = self.queue.pop(site, None)
        if new_posts is None:
            # site was not in the queue
            return
        Tasks.do(store_bodyfetcher_queue)

        new_post_ids = [int(k) for k in new_posts.keys()]

        if GlobalVars.flovis is not None:
            for post_id in new_post_ids:
                GlobalVars.flovis.stage('bodyfetcher/api_request', site,
                                        post_id, {
                                            'site': site,
                                            'posts': list(new_posts.keys())
                                        })

        # Add queue timing data
        pop_time = datetime.utcnow()
        post_add_times = [(pop_time - v).total_seconds()
                          for k, v in new_posts.items()]
        Tasks.do(add_queue_timing_data, site, post_add_times)

        store_max_ids = False
        with self.max_ids_modify_lock:
            if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
                previous_max_id = self.previous_max_ids[site]
                intermediate_posts = range(previous_max_id + 1,
                                           max(new_post_ids))

                # We don't want to go over the 100-post API cutoff, so take the last
                # (100-len(new_post_ids)) from intermediate_posts

                intermediate_posts = intermediate_posts[-(100 - len(new_post_ids)):]

                # new_post_ids could contain edited posts, so merge it back in
                combined = chain(intermediate_posts, new_post_ids)

                # Could be duplicates, so uniquify
                posts = list(set(combined))
            else:
                posts = new_post_ids

            new_post_ids_max = max(new_post_ids)
            if new_post_ids_max > self.previous_max_ids.get(site, 0):
                self.previous_max_ids[site] = new_post_ids_max
                store_max_ids = True

        if store_max_ids:
            schedule_store_bodyfetcher_max_ids()

        log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
        if len(new_post_ids) > 30:
            log(
                'debug', "{} +{} more".format(
                    sorted(new_post_ids)[:30],
                    len(new_post_ids) - 30))
        else:
            log('debug', sorted(new_post_ids))
        if len(new_post_ids) == len(posts):
            log('debug', "[ *Identical* ]")
        elif len(posts) > 30:
            log('debug',
                "{} +{} more".format(sorted(posts)[:30],
                                     len(posts) - 30))
        else:
            log('debug', sorted(posts))

        question_modifier = ""
        pagesize_modifier = {}

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            with self.last_activity_date_lock:
                if self.last_activity_date != 0:
                    pagesize = "100"
                else:
                    pagesize = "50"

                pagesize_modifier = {
                    'pagesize': pagesize,
                    'min': str(self.last_activity_date -
                               self.ACTIVITY_DATE_EXTRA_EARLIER_MS_TO_FETCH)
                }
        else:
            question_modifier = "/{0}".format(";".join(
                [str(post) for post in posts]))

        url = "https://api.stackexchange.com/2.2/questions{}".format(
            question_modifier)
        params = {
            'filter': '!1rs)sUKylwB)8isvCRk.xNu71LnaxjnPS12*pX*CEOKbPFwVFdHNxiMa7GIVgzDAwMa',
            'key': 'IAkbitmze4B8KpacUfLqkw((',
            'site': site
        }
        params.update(pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        with GlobalVars.api_request_lock:
            # Respect backoff, if we were given one
            if GlobalVars.api_backoff_time > time.time():
                time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
            try:
                time_request_made = datetime.utcnow().strftime('%H:%M:%S')
                response = requests.get(url, params=params, timeout=20).json()
            except (requests.exceptions.Timeout, requests.ConnectionError,
                    Exception):
                # Any failure in the request being made (timeout or otherwise) should be added back to
                # the queue.
                with self.queue_lock:
                    if site in self.queue:
                        self.queue[site].update(new_posts)
                    else:
                        self.queue[site] = new_posts
                return

            with self.api_data_lock:
                add_or_update_api_data(site)

            message_hq = ""
            with GlobalVars.apiquota_rw_lock:
                if "quota_remaining" in response:
                    quota_remaining = response["quota_remaining"]
                    if quota_remaining - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0 \
                            and quota_remaining > 39980:
                        tell_rooms_with(
                            "debug",
                            "API quota rolled over with {0} requests remaining. "
                            "Current quota: {1}.".format(
                                GlobalVars.apiquota, quota_remaining))

                        sorted_calls_per_site = sorted(
                            GlobalVars.api_calls_per_site.items(),
                            key=itemgetter(1),
                            reverse=True)
                        api_quota_used_per_site = ""
                        for site_name, quota_used in sorted_calls_per_site:
                            sanitized_site_name = site_name.replace(
                                '.com', '').replace('.stackexchange', '')
                            api_quota_used_per_site += sanitized_site_name + ": {0}\n".format(
                                str(quota_used))
                        api_quota_used_per_site = api_quota_used_per_site.strip()

                        tell_rooms_with("debug", api_quota_used_per_site)
                        clear_api_data()
                    if quota_remaining == 0:
                        tell_rooms_with(
                            "debug",
                            "API reports no quota left!  May be a glitch.")
                        tell_rooms_with(
                            "debug", str(response))  # No code format for now?
                    if GlobalVars.apiquota == -1:
                        tell_rooms_with(
                            "debug", "Restart: API quota is {quota}.".format(
                                quota=quota_remaining))
                    GlobalVars.apiquota = quota_remaining
                else:
                    message_hq = "The quota_remaining property was not in the API response."

            if "error_message" in response:
                message_hq += " Error: {} at {} UTC.".format(
                    response["error_message"], time_request_made)
                if "error_id" in response and response["error_id"] == 502:
                    if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                        GlobalVars.api_backoff_time = time.time() + 12
                message_hq += " Backing off on requests for the next 12 seconds."
                message_hq += " Previous URL: `{}`".format(url)

            if "backoff" in response:
                if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                    GlobalVars.api_backoff_time = time.time() + response["backoff"]

        if len(message_hq) > 0 and "site is required" not in message_hq:
            message_hq = message_hq.strip()
            if len(message_hq) > 500:
                message_hq = "\n" + message_hq
            tell_rooms_with("debug", message_hq)

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                with self.last_activity_date_lock:
                    self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            if GlobalVars.flovis is not None:
                pnb = copy.deepcopy(post)
                if 'body' in pnb:
                    pnb['body'] = 'Present, but truncated'
                if 'answers' in pnb:
                    del pnb['answers']

            if "title" not in post or "body" not in post:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage(
                        'bodyfetcher/api_response/no_content', site,
                        post['question_id'], pnb)
                continue

            post['site'] = site
            try:
                post['edited'] = (post['creation_date'] !=
                                  post['last_edit_date'])
            except KeyError:
                post['edited'] = False  # last_edit_date not present = not edited

            question_doesnt_need_scan = is_post_recently_scanned_and_unchanged(post)
            add_recently_scanned_post(post)
            if not question_doesnt_need_scan:
                try:
                    post_ = Post(api_response=post)
                except PostParseError as err:
                    log('error', 'Error {0} when parsing post: {1!r}'.format(err, post))
                    if GlobalVars.flovis is not None and 'question_id' in post:
                        GlobalVars.flovis.stage(
                            'bodyfetcher/api_response/error', site,
                            post['question_id'], pnb)
                    continue

                num_scanned += 1

                is_spam, reason, why = check_if_spam(post_)

                if is_spam:
                    try:
                        if GlobalVars.flovis is not None and 'question_id' in post:
                            GlobalVars.flovis.stage(
                                'bodyfetcher/api_response/spam', site,
                                post['question_id'], {
                                    'post': pnb,
                                    'check_if_spam': [is_spam, reason, why]
                                })
                        handle_spam(post=post_, reasons=reason, why=why)
                    except Exception as e:
                        log('error', "Exception in handle_spam:", e)
                elif GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage(
                        'bodyfetcher/api_response/not_spam', site,
                        post['question_id'], {
                            'post': pnb,
                            'check_if_spam': [is_spam, reason, why]
                        })

            try:
                if "answers" not in post:
                    pass
                else:
                    for answer in post["answers"]:
                        if GlobalVars.flovis is not None:
                            anb = copy.deepcopy(answer)
                            if 'body' in anb:
                                anb['body'] = 'Present, but truncated'

                        num_scanned += 1
                        answer["IsAnswer"] = True  # Necesssary for Post object
                        answer[
                            "title"] = ""  # Necessary for proper Post object creation
                        answer[
                            "site"] = site  # Necessary for proper Post object creation
                        try:
                            answer['edited'] = (answer['creation_date'] !=
                                                answer['last_edit_date'])
                        except KeyError:
                            answer['edited'] = False  # last_edit_date not present = not edited
                        answer_doesnt_need_scan = is_post_recently_scanned_and_unchanged(answer)
                        add_recently_scanned_post(answer)
                        if answer_doesnt_need_scan:
                            continue
                        answer_ = Post(api_response=answer, parent=post_)

                        is_spam, reason, why = check_if_spam(answer_)
                        if is_spam:
                            try:
                                if GlobalVars.flovis is not None and 'answer_id' in answer:
                                    GlobalVars.flovis.stage(
                                        'bodyfetcher/api_response/spam', site,
                                        answer['answer_id'], {
                                            'post': anb,
                                            'check_if_spam':
                                            [is_spam, reason, why]
                                        })
                                handle_spam(answer_, reasons=reason, why=why)
                            except Exception as e:
                                log('error', "Exception in handle_spam:", e)
                        elif GlobalVars.flovis is not None and 'answer_id' in answer:
                            GlobalVars.flovis.stage(
                                'bodyfetcher/api_response/not_spam', site,
                                answer['answer_id'], {
                                    'post': anb,
                                    'check_if_spam': [is_spam, reason, why]
                                })

            except Exception as e:
                log('error', "Exception handling answers:", e)

        end_time = time.time()
        scan_time = end_time - start_time
        GlobalVars.PostScanStat.add_stat(num_scanned, scan_time)
        return
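This newer variant also skips posts via `is_post_recently_scanned_and_unchanged` and records them with `add_recently_scanned_post`, neither of which is shown on this page. A minimal sketch of one way such a cache could work, keyed on site and post ID with a body hash to detect changes (names and structure are assumptions, not the project's actual implementation):

import hashlib

_recently_scanned = {}  # hypothetical module-level cache

def _post_key(post):
    return (post.get('site'), post.get('question_id') or post.get('answer_id'))

def _body_hash(post):
    return hashlib.sha256(post.get('body', '').encode('utf-8')).hexdigest()

def is_post_recently_scanned_and_unchanged(post):
    # Unchanged means the body hashes to the same value as last time.
    return _recently_scanned.get(_post_key(post)) == _body_hash(post)

def add_recently_scanned_post(post):
    _recently_scanned[_post_key(post)] = _body_hash(post)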
Example no. 20
    def make_api_call_for_site(self, site):
        if site not in self.queue:
            return

        self.queue_modify_lock.acquire()
        posts = self.queue.pop(site)
        store_bodyfetcher_queue()
        self.queue_modify_lock.release()

        question_modifier = ""
        pagesize_modifier = ""

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            if self.last_activity_date != 0:
                pagesize = "50"
            else:
                pagesize = "25"

            pagesize_modifier = "&pagesize={pagesize}" \
                                "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
        else:
            question_modifier = "/{0}".format(";".join(
                str(post) for post in posts))

        url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
              "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((" \
              "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                                  optional_min_query_param=pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        GlobalVars.api_request_lock.acquire()
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            time_request_made = datetime.utcnow().strftime('%H:%M:%S')
            response = requests.get(url, timeout=20).json()
        except (requests.exceptions.Timeout, requests.ConnectionError,
                Exception):
            # Any failure in the request being made (timeout or otherwise) should be added back to
            # the queue.
            self.queue_modify_lock.acquire()
            if site in self.queue:
                self.queue[site].extend(posts)
            else:
                self.queue[site] = posts
            self.queue_modify_lock.release()
            return

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
                GlobalVars.charcoal_hq.send_message(
                    "API quota rolled over with {0} requests remaining. "
                    "Current quota: {1}.".format(GlobalVars.apiquota,
                                                 response["quota_remaining"]))
                sorted_calls_per_site = sorted(
                    GlobalVars.api_calls_per_site.items(),
                    key=itemgetter(1),
                    reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    sanitized_site_name = site_name.replace(
                        '.com', '').replace('.stackexchange', '')
                    api_quota_used_per_site += sanitized_site_name + ": {0}\n".format(
                        str(quota_used))
                api_quota_used_per_site = api_quota_used_per_site.strip()
                GlobalVars.charcoal_hq.send_message(api_quota_used_per_site,
                                                    False)
                clear_api_data()
            if response["quota_remaining"] == 0:
                GlobalVars.charcoal_hq.send_message(
                    "API reports no quota left!  May be a glitch.")
                GlobalVars.charcoal_hq.send_message(
                    str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                GlobalVars.charcoal_hq.send_message(
                    "Restart: API quota is {quota}.".format(
                        quota=response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {} at {} UTC.".format(
                response["error_message"], time_request_made)
            if "error_id" in response and response["error_id"] == 502:
                if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                    GlobalVars.api_backoff_time = time.time() + 12
            message_hq += " Backing off on requests for the next 12 seconds."

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]

        GlobalVars.api_request_lock.release()

        if len(message_hq) > 0:
            GlobalVars.charcoal_hq.send_message(message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue

            num_scanned += 1

            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            post_score = post["score"]
            up_vote_count = post["up_vote_count"]
            down_vote_count = post["down_vote_count"]
            try:
                owner_name = GlobalVars.parser.unescape(
                    post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason, why = check_if_spam(title=title,
                                                 body=body,
                                                 user_name=owner_name,
                                                 user_url=owner_link,
                                                 post_site=site,
                                                 post_id=q_id,
                                                 is_answer=False,
                                                 body_is_summary=False,
                                                 owner_rep=owner_rep,
                                                 post_score=post_score)
            if is_spam:
                try:
                    handle_spam(title=title,
                                body=body,
                                poster=owner_name,
                                site=site,
                                post_url=link,
                                poster_url=owner_link,
                                post_id=q_id,
                                reasons=reason,
                                is_answer=False,
                                why=why,
                                owner_rep=owner_rep,
                                post_score=post_score,
                                up_vote_count=up_vote_count,
                                down_vote_count=down_vote_count,
                                question_id=None)
                except:
                    print "NOP"
            try:
                for answer in post["answers"]:
                    num_scanned += 1
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    post_score = answer["score"]
                    up_vote_count = answer["up_vote_count"]
                    down_vote_count = answer["down_vote_count"]
                    try:
                        owner_name = GlobalVars.parser.unescape(
                            answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    is_spam, reason, why = check_if_spam(title=answer_title,
                                                         body=body,
                                                         user_name=owner_name,
                                                         user_url=owner_link,
                                                         post_site=site,
                                                         post_id=a_id,
                                                         is_answer=True,
                                                         body_is_summary=False,
                                                         owner_rep=owner_rep,
                                                         post_score=post_score)
                    if is_spam:
                        try:
                            handle_spam(title=title,
                                        body=body,
                                        poster=owner_name,
                                        site=site,
                                        post_url=link,
                                        poster_url=owner_link,
                                        post_id=a_id,
                                        reasons=reason,
                                        is_answer=True,
                                        why=why,
                                        owner_rep=owner_rep,
                                        post_score=post_score,
                                        up_vote_count=up_vote_count,
                                        down_vote_count=down_vote_count,
                                        question_id=q_id)
                        except:
                            print "NOP"
            except:
                print "no answers"

        end_time = time.time()
        GlobalVars.posts_scan_stats_lock.acquire()
        GlobalVars.num_posts_scanned += num_scanned
        GlobalVars.post_scan_time += end_time - start_time
        GlobalVars.posts_scan_stats_lock.release()
        return
Example no. 21
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)

        self.queue_store_lock.acquire()
        store_bodyfetcher_queue()
        self.queue_store_lock.release()

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            min_query = ""
            if self.last_activity_date != 0:
                min_query = "&min=" + str(self.last_activity_date)
                pagesize = "50"
            else:
                pagesize = "25"
            url = "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((&pagesize=" + pagesize + min_query
        else:
            url = "http://api.stackexchange.com/2.2/questions/" + ";".join(
                str(x) for x in posts
            ) + "?site=" + site + "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw(("

        # wait to make sure API has/updates post data
        time.sleep(3)
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 1000 and GlobalVars.apiquota >= 0:
                GlobalVars.charcoal_hq.send_message(
                    "API quota rolled over with {} requests remaining.".format(
                        GlobalVars.apiquota))
                sorted_calls_per_site = sorted(
                    GlobalVars.api_calls_per_site.items(),
                    key=itemgetter(1),
                    reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    api_quota_used_per_site = api_quota_used_per_site + site_name.replace(
                        '.com', '').replace('.stackexchange', '') + ": " + str(quota_used) + "\n"
                api_quota_used_per_site = api_quota_used_per_site.strip()
                GlobalVars.charcoal_hq.send_message(api_quota_used_per_site,
                                                    False)
                clear_api_data()
            if response["quota_remaining"] == 0:
                GlobalVars.charcoal_hq.send_message(
                    "API reports no quota left!  May be a glitch.")
                GlobalVars.charcoal_hq.send_message(
                    str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                GlobalVars.charcoal_hq.send_message(
                    "Restart: API quota is {}.".format(
                        response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq = message_hq + " Error: {}.".format(
                response["error_message"])

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]
            message_hq += "\nBackoff received of {} seconds.".format(
                response["backoff"])

        if len(message_hq) > 0:
            GlobalVars.charcoal_hq.send_message(message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            if len(response["items"]
                   ) > 0 and "last_activity_date" in response["items"][0]:
                self.last_activity_date = response["items"][0][
                    "last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            post_score = post["score"]
            up_vote_count = post["up_vote_count"]
            down_vote_count = post["down_vote_count"]
            try:
                owner_name = GlobalVars.parser.unescape(
                    post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason, why = check_if_spam(title, body, owner_name,
                                                 owner_link, site, q_id, False,
                                                 False, owner_rep, post_score)
            if is_spam:
                try:
                    handle_spam(title, body, owner_name, site, link,
                                owner_link, q_id, reason, False, why,
                                owner_rep, post_score, up_vote_count,
                                down_vote_count)
                except:
                    print "NOP"
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    post_score = answer["score"]
                    up_vote_count = answer["up_vote_count"]
                    down_vote_count = answer["down_vote_count"]
                    try:
                        owner_name = GlobalVars.parser.unescape(
                            answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    is_spam, reason, why = check_if_spam(
                        answer_title, body, owner_name, owner_link, site, a_id,
                        True, False, owner_rep, post_score)
                    if is_spam:
                        try:
                            handle_spam(title, body, owner_name, site, link,
                                        owner_link, a_id, reason, True, why,
                                        owner_rep, post_score, up_vote_count,
                                        down_vote_count)
                        except:
                            print "NOP"
            except:
                print "no answers"
        return
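
The backoff handling above (sleep until GlobalVars.api_backoff_time has passed, plus a small safety margin) recurs in every later revision of this method. A minimal standalone sketch of the same pattern; the module-level variable and function names here are hypothetical stand-ins, not project code:

import time

api_backoff_time = 0  # hypothetical stand-in for GlobalVars.api_backoff_time


def respect_backoff(padding=2):
    # Sleep until any previously received backoff window has expired;
    # `padding` mirrors the "+ 2" safety margin used above.
    now = time.time()
    if api_backoff_time > now:
        time.sleep(api_backoff_time - now + padding)


def record_backoff(backoff_seconds):
    # Extend the backoff window, never shortening an existing one, as the
    # `if GlobalVars.api_backoff_time < time.time() + backoff` guard does.
    global api_backoff_time
    api_backoff_time = max(api_backoff_time, time.time() + backoff_seconds)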
Example 22
    def handle_websocket_data(data):
        if "message" not in data:
            return

        message = data['message']
        if isinstance(message, Iterable):
            if "message" in message:
                chatcommunicate.tell_rooms_with("metasmoke", message['message'])
            elif "autoflag_fp" in message:
                event = message["autoflag_fp"]

                chatcommunicate.tell_rooms(event["message"], ("debug", "site-" + event["site"]),
                                           ("no-site-" + event["site"],), notify_site="/autoflag_fp")
            elif "exit" in message:
                os._exit(message["exit"])
            elif "blacklist" in message:
                ids = (message['blacklist']['uid'], message['blacklist']['site'])

                datahandling.add_blacklisted_user(ids, "metasmoke", message['blacklist']['post'])
                datahandling.last_feedbacked = (ids, time.time() + 60)
            elif "unblacklist" in message:
                ids = (message['unblacklist']['uid'], message['unblacklist']['site'])
                datahandling.remove_blacklisted_user(ids)
            elif "naa" in message:
                post_site_id = parsing.fetch_post_id_and_site_from_url(message["naa"]["post_link"])
                datahandling.add_ignored_post(post_site_id[0:2])
            elif "fp" in message:
                post_site_id = parsing.fetch_post_id_and_site_from_url(message["fp"]["post_link"])
                datahandling.add_false_positive(post_site_id[0:2])
            elif "report" in message:
                post_data = apigetpost.api_get_post(message["report"]["post_link"])
                if post_data is None or post_data is False:
                    return
                if datahandling.has_already_been_posted(post_data.site, post_data.post_id, post_data.title) \
                        and not datahandling.is_false_positive((post_data.post_id, post_data.site)):
                    return
                user = parsing.get_user_from_url(post_data.owner_url)
                post = classes.Post(api_response=post_data.as_dict)

                scan_spam, scan_reasons, scan_why = spamhandling.check_if_spam(post)
                if scan_spam:
                    why_append = u"This post would have also been caught for: " + \
                        u", ".join(scan_reasons).capitalize() + "\n" + scan_why
                else:
                    why_append = u"This post would not have been caught otherwise."

                # Add user to blacklist *after* post is scanned
                if user is not None:
                    datahandling.add_blacklisted_user(user, "metasmoke", post_data.post_url)

                why = u"Post manually reported by user *{}* from metasmoke.\n\n{}".format(
                    message["report"]["user"], why_append)

                spamhandling.handle_spam(post=post,
                                         reasons=["Manually reported " + post_data.post_type],
                                         why=why)
            elif "deploy_updated" in message:
                sha = message["deploy_updated"]["head_commit"]["id"]
                if sha != os.popen('git log -1 --pretty="%H"').read().strip():
                    if "autopull" in message["deploy_updated"]["head_commit"]["message"]:
                        if only_blacklists_changed(GitManager.get_remote_diff()):
                            commit_md = "[`{0}`](https://github.com/Charcoal-SE/SmokeDetector/commit/{0})" \
                                        .format(sha[:7])
                            i = []  # Currently no issues with blacklists
                            for bl_file in glob('bad_*.txt') + glob('blacklisted_*.txt'):  # Check blacklists for issues
                                with open(bl_file, 'r') as lines:
                                    seen = dict()
                                    for lineno, line in enumerate(lines, 1):
                                        if line.endswith('\r\n'):
                                            i.append("DOS line ending at `{0}:{1}` in {2}".format(bl_file, lineno,
                                                                                                  commit_md))
                                        if not line.endswith('\n'):
                                            i.append("No newline at end of `{0}` in {1}".format(bl_file, commit_md))
                                        if line == '\n':
                                            i.append("Blank line at `{0}:{1}` in {2}".format(bl_file, lineno,
                                                                                             commit_md))
                                        if line in seen:
                                            i.append("Duplicate entry of {0} at lines {1} and {2} of {3} in {4}"
                                                     .format(line.rstrip('\n'), seen[line], lineno, bl_file, commit_md))
                                        seen[line] = lineno
                            if i == []:  # No issues
                                GitManager.pull_remote()
                                load_blacklists()
                                chatcommunicate.tell_rooms_with("debug", "No code modified in {0}, only blacklists"
                                                                " reloaded.".format(commit_md))
                            else:
                                i.append("please fix before pulling.")
                                chatcommunicate.tell_rooms_with("debug", ", ".join(i))
            elif "commit_status" in message:
                c = message["commit_status"]
                sha = c["commit_sha"][:7]
                if c["commit_sha"] != os.popen('git log -1 --pretty="%H"').read():
                    if c["status"] == "success":
                        if "autopull" in c["commit_message"]:
                            s = "[CI]({ci_link}) on [`{commit_sha}`](https://github.com/Charcoal-SE/SmokeDetector/" \
                                "commit/{commit_sha})"\
                                " succeeded. Message contains 'autopull', pulling...".format(ci_link=c["ci_url"],
                                                                                             commit_sha=sha)
                            chatcommunicate.tell_rooms_with("debug", s, notify_site="/ci")
                            time.sleep(2)
                            os._exit(3)
                        else:
                            s = "[CI]({ci_link}) on [`{commit_sha}`](https://github.com/Charcoal-SE/SmokeDetector/" \
                                "commit/{commit_sha}) succeeded.".format(ci_link=c["ci_url"], commit_sha=sha)

                            chatcommunicate.tell_rooms_with("debug", s, notify_site="/ci")
                    elif c["status"] == "failure":
                        s = "[CI]({ci_link}) on [`{commit_sha}`](https://github.com/Charcoal-SE/SmokeDetector/" \
                            "commit/{commit_sha}) failed.".format(ci_link=c["ci_url"], commit_sha=sha)

                        chatcommunicate.tell_rooms_with("debug", s, notify_site="/ci")
            elif "everything_is_broken" in message:
                if message["everything_is_broken"] is True:
                    os._exit(6)
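
The blacklist validation in the `deploy_updated` branch above is self-contained enough to lift out. A sketch of the same four checks, assuming one-pattern-per-line text files; `lint_blacklist` is an illustrative name, not a project function, and `newline=''` is used so that `\r\n` endings survive reading on Python 3 (the original opens with plain 'r'):

def lint_blacklist(path):
    # Collect the problems the handler above looks for: DOS line endings,
    # a missing final newline, blank lines, and duplicate entries.
    issues = []
    seen = {}
    with open(path, 'r', newline='') as lines:
        for lineno, line in enumerate(lines, 1):
            if line.endswith('\r\n'):
                issues.append("DOS line ending at {0}:{1}".format(path, lineno))
            if not line.endswith('\n'):
                issues.append("No newline at end of {0}".format(path))
            if line == '\n':
                issues.append("Blank line at {0}:{1}".format(path, lineno))
            if line in seen:
                issues.append("Duplicate entry of {0} at lines {1} and {2}".format(
                    line.rstrip('\n'), seen[line], lineno))
            seen[line] = lineno
    return issues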
Example 23
    def make_api_call_for_site(self, site):
        self.queue_modify_lock.acquire()
        if site not in self.queue:
            GlobalVars.charcoal_hq.send_message(
                "Attempted API call to {} but there are no posts to fetch."
                .format(site))
            self.queue_modify_lock.release()
            return
        posts = self.queue.pop(site)
        store_bodyfetcher_queue()
        self.queue_modify_lock.release()

        question_modifier = ""
        pagesize_modifier = ""

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            if self.last_activity_date != 0:
                pagesize = "50"
            else:
                pagesize = "25"

            pagesize_modifier = "&pagesize={pagesize}&min={time_length}".format(
                pagesize=pagesize, time_length=str(self.last_activity_date))
        else:
            question_modifier = "/{0}".format(";".join(
                str(post) for post in posts))

        url = "http://api.stackexchange.com/2.2/questions{q_modifier}?site={site}&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw(({optional_min_query_param}".format(
            q_modifier=question_modifier,
            site=site,
            optional_min_query_param=pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if (response["quota_remaining"] - GlobalVars.apiquota >= 5000
                    and GlobalVars.apiquota >= 0):
                GlobalVars.charcoal_hq.send_message(
                    "API quota rolled over with {0} requests remaining. Current quota: {1}."
                    .format(GlobalVars.apiquota, response["quota_remaining"]))
                sorted_calls_per_site = sorted(
                    GlobalVars.api_calls_per_site.items(),
                    key=itemgetter(1),
                    reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    api_quota_used_per_site += site_name.replace(
                        '.com', '').replace('.stackexchange', '') + ": {0}\n".format(quota_used)
                api_quota_used_per_site = api_quota_used_per_site.strip()
                GlobalVars.charcoal_hq.send_message(api_quota_used_per_site,
                                                    False)
                clear_api_data()
            if response["quota_remaining"] == 0:
                GlobalVars.charcoal_hq.send_message(
                    "API reports no quota left!  May be a glitch.")
                GlobalVars.charcoal_hq.send_message(
                    str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                GlobalVars.charcoal_hq.send_message(
                    "Restart: API quota is {quota}.".format(
                        quota=response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {}.".format(response["error_message"])

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]
            match = regex.compile('/2.2/([^.]*)').search(url)
            url_part = match.group(1) if match else url
            message_hq += "\nBackoff received of {} seconds on request to `{}`".format(
                str(response["backoff"]), url_part)

        if len(message_hq) > 0:
            GlobalVars.charcoal_hq.send_message(message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                self.last_activity_date = items[0]["last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            post_score = post["score"]
            up_vote_count = post["up_vote_count"]
            down_vote_count = post["down_vote_count"]
            try:
                owner_name = GlobalVars.parser.unescape(
                    post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason, why = check_if_spam(title, body, owner_name,
                                                 owner_link, site, q_id, False,
                                                 False, owner_rep, post_score)
            if is_spam:
                try:
                    handle_spam(title, body, owner_name, site, link,
                                owner_link, q_id, reason, False, why,
                                owner_rep, post_score, up_vote_count,
                                down_vote_count, None)
                except:
                    print "NOP"
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    post_score = answer["score"]
                    up_vote_count = answer["up_vote_count"]
                    down_vote_count = answer["down_vote_count"]
                    try:
                        owner_name = GlobalVars.parser.unescape(
                            answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    is_spam, reason, why = check_if_spam(
                        answer_title, body, owner_name, owner_link, site, a_id,
                        True, False, owner_rep, post_score)
                    if is_spam:
                        try:
                            handle_spam(title, body, owner_name, site, link,
                                        owner_link, a_id, reason, True, why,
                                        owner_rep, post_score, up_vote_count,
                                        down_vote_count, q_id)
                        except:
                            print "NOP"
            except:
                print "no answers"
        return
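
The URL assembly in this revision follows one rule: Stack Overflow gets a paged recent-activity query, every other site gets explicit question IDs. A sketch of that assembly as a standalone helper (the helper name is illustrative; the filter and key strings are copied verbatim from the example above):

def build_questions_url(site, posts, last_activity_date=0):
    api_filter = "!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP"
    api_key = "IAkbitmze4B8KpacUfLqkw(("
    question_modifier = ""
    pagesize_modifier = ""
    if site == "stackoverflow.com":
        # 50 once there is a last-activity watermark to page from, else 25.
        pagesize = "50" if last_activity_date != 0 else "25"
        pagesize_modifier = "&pagesize={0}&min={1}".format(pagesize,
                                                           last_activity_date)
    else:
        question_modifier = "/{0}".format(";".join(str(post) for post in posts))
    return ("http://api.stackexchange.com/2.2/questions{0}?site={1}"
            "&filter={2}&key={3}{4}").format(question_modifier, site,
                                             api_filter, api_key,
                                             pagesize_modifier)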
Example 24
    def make_api_call_for_site(self, site):
        if site not in self.queue:
            return

        self.queue_modify_lock.acquire()
        posts = self.queue.pop(site)
        store_bodyfetcher_queue()
        self.queue_modify_lock.release()

        question_modifier = ""
        pagesize_modifier = ""

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            if self.last_activity_date != 0:
                pagesize = "50"
            else:
                pagesize = "25"

            pagesize_modifier = "&pagesize={pagesize}" \
                                "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
        else:
            question_modifier = "/{0}".format(";".join(str(post) for post in posts))

        url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
              "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((" \
              "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                                  optional_min_query_param=pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        GlobalVars.api_request_lock.acquire()
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            time_request_made = datetime.now().strftime('%H:%M:%S')
            response = requests.get(url, timeout=20).json()
        except Exception:  # includes Timeout and ConnectionError
            # Any failure in the request being made (timeout or otherwise) should be added back to
            # the queue.
            self.queue_modify_lock.acquire()
            if site in self.queue:
                self.queue[site].extend(posts)
            else:
                self.queue[site] = posts
            self.queue_modify_lock.release()
            GlobalVars.api_request_lock.release()
            return

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
                GlobalVars.charcoal_hq.send_message("API quota rolled over with {0} requests remaining. "
                                                    "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                                 response["quota_remaining"]))
                sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    sanitized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                    api_quota_used_per_site += sanitized_site_name + ": {0}\n".format(str(quota_used))
                api_quota_used_per_site = api_quota_used_per_site.strip()
                GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
                clear_api_data()
            if response["quota_remaining"] == 0:
                GlobalVars.charcoal_hq.send_message("API reports no quota left!  May be a glitch.")
                GlobalVars.charcoal_hq.send_message(str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                GlobalVars.charcoal_hq.send_message("Restart: API quota is {quota}."
                                                    .format(quota=response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
            if "error_id" in response and response["error_id"] == 502:
                if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                    GlobalVars.api_backoff_time = time.time() + 12
            message_hq += " Backing off on requests for the next 12 seconds."

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]

        GlobalVars.api_request_lock.release()

        if len(message_hq) > 0:
            GlobalVars.charcoal_hq.send_message(message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue

            num_scanned += 1

            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            post_score = post["score"]
            up_vote_count = post["up_vote_count"]
            down_vote_count = post["down_vote_count"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason, why = check_if_spam(title=title,
                                                 body=body,
                                                 user_name=owner_name,
                                                 user_url=owner_link,
                                                 post_site=site,
                                                 post_id=q_id,
                                                 is_answer=False,
                                                 body_is_summary=False,
                                                 owner_rep=owner_rep,
                                                 post_score=post_score)
            if is_spam:
                try:
                    handle_spam(title=title,
                                body=body,
                                poster=owner_name,
                                site=site,
                                post_url=link,
                                poster_url=owner_link,
                                post_id=q_id,
                                reasons=reason,
                                is_answer=False,
                                why=why,
                                owner_rep=owner_rep,
                                post_score=post_score,
                                up_vote_count=up_vote_count,
                                down_vote_count=down_vote_count,
                                question_id=None)
                except:
                    print "NOP"
            try:
                for answer in post["answers"]:
                    num_scanned += 1
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    post_score = answer["score"]
                    up_vote_count = answer["up_vote_count"]
                    down_vote_count = answer["down_vote_count"]
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    is_spam, reason, why = check_if_spam(title=answer_title,
                                                         body=body,
                                                         user_name=owner_name,
                                                         user_url=owner_link,
                                                         post_site=site,
                                                         post_id=a_id,
                                                         is_answer=True,
                                                         body_is_summary=False,
                                                         owner_rep=owner_rep,
                                                         post_score=post_score)
                    if is_spam:
                        try:
                            handle_spam(title=title,
                                        body=body,
                                        poster=owner_name,
                                        site=site,
                                        post_url=link,
                                        poster_url=owner_link,
                                        post_id=a_id,
                                        reasons=reason,
                                        is_answer=True,
                                        why=why,
                                        owner_rep=owner_rep,
                                        post_score=post_score,
                                        up_vote_count=up_vote_count,
                                        down_vote_count=down_vote_count,
                                        question_id=q_id)
                        except:
                            print "NOP"
            except:
                print "no answers"

        end_time = time.time()
        GlobalVars.posts_scan_stats_lock.acquire()
        GlobalVars.num_posts_scanned += num_scanned
        GlobalVars.post_scan_time += end_time - start_time
        GlobalVars.posts_scan_stats_lock.release()
        return
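
New in this revision: a failed request no longer drops the batch, since the posts go back onto the per-site queue in the except branch. The same idea as a standalone helper (names are illustrative), with try/finally so the lock is released even if the queue update itself raises:

def requeue_posts(queue, queue_lock, site, posts):
    # Return unfetched post IDs to the per-site queue after a failed API
    # request, merging with anything queued for the site in the meantime.
    queue_lock.acquire()
    try:
        if site in queue:
            queue[site].extend(posts)
        else:
            queue[site] = posts
    finally:
        queue_lock.release()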
Example 25
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        if "quota_remaining" in response:
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            GlobalVars.apiquota = 0
            return

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False)
            if owner_rep <= 50 and is_spam:
                try:
                    handle_spam(title, owner_name, site, link, owner_link, q_id, reason, False)
                except:
                    print "NOP"

            classified, gibberish_score = classify_gibberish(body, site)
            if classified and gibberish_score >= 65:
                GlobalVars.bayesian_testroom.send_message(
                    "[ SmokeDetector | GibberishClassifierBeta ] "
                    "Potential gibberish body (%s%%): [%s](%s) on `%s`"
                    % (gibberish_score, title, link, site)
                )
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    is_spam, reason = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False)
                    if owner_rep <= 50 and is_spam:
                        try:
                            handle_spam(title, owner_name, site, link, owner_link, a_id, reason, True)
                        except:
                            print "NOP"

                    classified, gibberish_score = classify_gibberish(body, site)
                    if classified and gibberish_score >= 65:
                        GlobalVars.bayesian_testroom.send_message(
                            "[ SmokeDetector | GibberishClassifierBeta ] "
                            "Potential gibberish answer (%s%%): [%s](%s) on `%s`"
                            % (gibberish_score, title, link, site)
                        )
            except:
                print "no answers"
        return
Example 26
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            min_query = ""
            if self.last_activity_date != 0:
                min_query = "&min=" + str(self.last_activity_date)
                pagesize = "50"
            else:
                pagesize = "25"
            url = (
                "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw((&pagesize="
                + pagesize
                + min_query
            )
        else:
            url = (
                "http://api.stackexchange.com/2.2/questions/"
                + ";".join(str(x) for x in posts)
                + "?site="
                + site
                + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
            )
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        if "quota_remaining" in response:
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            GlobalVars.apiquota = 0
            return

        if site == "stackoverflow.com":
            if len(response["items"]) > 0 and "last_activity_date" in response["items"][0]:
                self.last_activity_date = response["items"][0]["last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            if owner_rep <= 50:
                is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False)
                if is_spam:
                    try:
                        handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why)
                    except:
                        print "NOP"

            classified, gibberish_score = classify_gibberish(body, site)
            if classified and gibberish_score >= 65:
                GlobalVars.bayesian_testroom.send_message(
                    "[ SmokeDetector | GibberishClassifierBeta ] "
                    u"Potential gibberish body ({}%): [{}]({}) on `{}`".format(gibberish_score, title, link, site)
                )
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    if owner_rep <= 50:
                        is_spam, reason, why = check_if_spam(
                            answer_title, body, owner_name, owner_link, site, a_id, True, False
                        )
                        if is_spam:
                            try:
                                handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why)
                            except:
                                print "NOP"

                    classified, gibberish_score = classify_gibberish(body, site)
                    if classified and gibberish_score >= 65:
                        GlobalVars.bayesian_testroom.send_message(
                            "[ SmokeDetector | GibberishClassifierBeta ] "
                            u"Potential gibberish answer ({}%): [{}]({}) on `{}`".format(
                                gibberish_score, title, link, site
                            )
                        )
            except:
                print "no answers"
        return
Example 27
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            min_query = ""
            if self.last_activity_date != 0:
                min_query = "&min=" + str(self.last_activity_date)
                pagesize = "50"
            else:
                pagesize = "25"
            url = "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw((&pagesize=" + pagesize + min_query
        else:
            url = "http://api.stackexchange.com/2.2/questions/" + ";".join(
                str(x) for x in posts
            ) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        if "quota_remaining" in response:
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            GlobalVars.apiquota = 0
            return

        if site == "stackoverflow.com":
            if len(response["items"]
                   ) > 0 and "last_activity_date" in response["items"][0]:
                self.last_activity_date = response["items"][0][
                    "last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(
                    post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except:
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            if owner_rep <= 50:
                is_spam, reason, why = check_if_spam(title, body, owner_name,
                                                     owner_link, site, q_id,
                                                     False, False)
                if is_spam:
                    try:
                        handle_spam(title, body, owner_name, site, link,
                                    owner_link, q_id, reason, False, why)
                    except:
                        print "NOP"

            classified, gibberish_score = classify_gibberish(body, site)
            if classified and gibberish_score >= 65:
                GlobalVars.bayesian_testroom.send_message(
                    "[ SmokeDetector | GibberishClassifierBeta ] "
                    u"Potential gibberish body ({}%): [{}]({}) on `{}`".format(
                        gibberish_score, title, link, site))
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    print "got answer from owner with name " + owner_name
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(
                            answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except:
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0

                    if owner_rep <= 50:
                        is_spam, reason, why = check_if_spam(
                            answer_title, body, owner_name, owner_link, site,
                            a_id, True, False)
                        if is_spam:
                            try:
                                handle_spam(title, body, owner_name, site,
                                            link, owner_link, a_id, reason,
                                            True, why)
                            except:
                                print "NOP"

                    classified, gibberish_score = classify_gibberish(
                        body, site)
                    if classified and gibberish_score >= 65:
                        GlobalVars.bayesian_testroom.send_message(
                            "[ SmokeDetector | GibberishClassifierBeta ] "
                            u"Potential gibberish answer ({}%): [{}]({}) on `{}`"
                            .format(gibberish_score, title, link, site))
            except:
                print "no answers"
        return
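
Examples 25 through 27 all gate gibberish reports on the same 65% score. A sketch of that gate with the classifier passed in as a callable, since classify_gibberish itself is not shown in these examples; the function and parameter names are illustrative:

GIBBERISH_THRESHOLD = 65  # per-cent score used by examples 25 through 27


def report_if_gibberish(classifier, send_message, body, site, title, link,
                        kind="body"):
    # `classifier` is assumed to return (classified, score) like the
    # classify_gibberish calls above; report only when classification
    # succeeded and the score clears the threshold.
    classified, score = classifier(body, site)
    if classified and score >= GIBBERISH_THRESHOLD:
        send_message(
            "[ SmokeDetector | GibberishClassifierBeta ] "
            u"Potential gibberish {} ({}%): [{}]({}) on `{}`".format(
                kind, score, title, link, site))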
Example 28
    def make_api_call_for_site(self, site):
        if site not in self.queue:
            return

        self.queue_modify_lock.acquire()
        new_posts = self.queue.pop(site)
        store_bodyfetcher_queue()
        self.queue_modify_lock.release()

        new_post_ids = [int(k) for k, v in new_posts.items()]

        if GlobalVars.flovis is not None:
            for post_id in new_post_ids:
                GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                        {'queue': {sk: list(sq) for sk, sq in self.queue.items()},
                                         'site': site, 'posts': list(new_posts)})

        self.queue_timing_modify_lock.acquire()
        post_add_times = [v for k, v in new_posts.items()]
        pop_time = datetime.utcnow()

        for add_time in post_add_times:
            try:
                seconds_in_queue = (pop_time - add_time).total_seconds()
                if site in self.queue_timings:
                    self.queue_timings[site].append(seconds_in_queue)
                else:
                    self.queue_timings[site] = [seconds_in_queue]
            except:
                continue  # Skip to next item if we've got invalid data or missing values.

        store_queue_timings()

        self.queue_timing_modify_lock.release()
        self.max_ids_modify_lock.acquire()

        if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
            previous_max_id = self.previous_max_ids[site]
            intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

            # We don't want to go over the 100-post API cutoff, so take the last
            # (100-len(new_post_ids)) from intermediate_posts

            intermediate_posts = intermediate_posts[(len(new_post_ids) - 100):]

            # new_post_ids could contain edited posts, so merge it back in
            combined = chain(intermediate_posts, new_post_ids)

            # Could be duplicates, so uniquify
            posts = list(set(combined))
        else:
            posts = new_post_ids

        try:
            if max(new_post_ids) > self.previous_max_ids[site]:
                self.previous_max_ids[site] = max(new_post_ids)
                store_bodyfetcher_max_ids()
        except KeyError:
            self.previous_max_ids[site] = max(new_post_ids)
            store_bodyfetcher_max_ids()

        self.max_ids_modify_lock.release()

        log('debug', "New IDs / Hybrid Intermediate IDs for {0}:".format(site))
        log('debug', sorted(new_post_ids))
        log('debug', sorted(posts))

        question_modifier = ""
        pagesize_modifier = ""

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            if self.last_activity_date != 0:
                pagesize = "50"
            else:
                pagesize = "25"

            pagesize_modifier = "&pagesize={pagesize}" \
                                "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
        else:
            question_modifier = "/{0}".format(";".join(str(post) for post in posts))

        url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
              "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((" \
              "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                                  optional_min_query_param=pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        GlobalVars.api_request_lock.acquire()
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            time_request_made = datetime.now().strftime('%H:%M:%S')
            response = requests.get(url, timeout=20).json()
        except Exception:  # includes Timeout and ConnectionError
            # Any failure in the request being made (timeout or otherwise) should be added back to
            # the queue.
            self.queue_modify_lock.acquire()
            if site in self.queue:
                self.queue[site].update(new_posts)
            else:
                self.queue[site] = new_posts
            self.queue_modify_lock.release()
            GlobalVars.api_request_lock.release()
            return

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
                tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                         "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                      response["quota_remaining"]))

                sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    sanitized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                    api_quota_used_per_site += sanitized_site_name + ": {0}\n".format(str(quota_used))
                api_quota_used_per_site = api_quota_used_per_site.strip()

                tell_rooms_with("debug", api_quota_used_per_site)
                clear_api_data()
            if response["quota_remaining"] == 0:
                tell_rooms_with("debug", "API reports no quota left!  May be a glitch.")
                tell_rooms_with("debug", str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                tell_rooms_with("debug", "Restart: API quota is {quota}."
                                         .format(quota=response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
            if "error_id" in response and response["error_id"] == 502:
                if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                    GlobalVars.api_backoff_time = time.time() + 12
            message_hq += " Backing off on requests for the next 12 seconds."
            message_hq += " Previous URL: `{}`".format(url)

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]

        GlobalVars.api_request_lock.release()

        if len(message_hq) > 0:
            tell_rooms_with("debug", message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], post)
                continue

            post['site'] = site
            try:
                post_ = Post(api_response=post)
            except PostParseError as err:
                log('error', 'Error {0} when parsing post: {1!r}'.format(err, post))
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], post)
                continue

            num_scanned += 1

            is_spam, reason, why = check_if_spam(post_)

            if is_spam:
                try:
                    if GlobalVars.flovis is not None and 'question_id' in post:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                                {'post': post, 'check_if_spam': [is_spam, reason, why]})
                    handle_spam(post=post_,
                                reasons=reason,
                                why=why)
                except Exception as e:
                    log('error', "Exception in handle_spam:", e)
            elif GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                        {'post': post, 'check_if_spam': [is_spam, reason, why]})

            try:
                if "answers" not in post:
                    pass
                else:
                    for answer in post["answers"]:
                        num_scanned += 1
                        answer["IsAnswer"] = True  # Necesssary for Post object
                        answer["title"] = ""  # Necessary for proper Post object creation
                        answer["site"] = site  # Necessary for proper Post object creation
                        answer_ = Post(api_response=answer, parent=post_)

                        is_spam, reason, why = check_if_spam(answer_)
                        if is_spam:
                            try:
                                if GlobalVars.flovis is not None and 'answer_id' in answer:
                                    GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, answer['answer_id'],
                                                            {'post': answer, 'check_if_spam': [is_spam, reason, why]})
                                handle_spam(answer_,
                                            reasons=reason,
                                            why=why)
                            except Exception as e:
                                log('error', "Exception in handle_spam:", e)
                        elif GlobalVars.flovis is not None and 'answer_id' in answer:
                            GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
                                                    {'post': answer, 'check_if_spam': [is_spam, reason, why]})

            except Exception as e:
                log('error', "Exception handling answers:", e)

        end_time = time.time()
        GlobalVars.posts_scan_stats_lock.acquire()
        GlobalVars.num_posts_scanned += num_scanned
        GlobalVars.post_scan_time += end_time - start_time
        GlobalVars.posts_scan_stats_lock.release()
        return
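
The subtle part of this revision is the hybrid ID logic near the top: besides the IDs that arrived over the websocket, it sweeps the unseen gap since the previous maximum ID, capped so the combined request stays within the API's 100-ID limit. A compact sketch of just that computation (the function name is illustrative):

def ids_to_fetch(new_post_ids, previous_max_id, limit=100):
    # Sweep the gap (previous_max_id, max(new_post_ids)) for posts the
    # realtime feed may have skipped, keeping only the newest gap IDs so
    # the combined set stays within `limit`.
    newest = max(new_post_ids)
    if previous_max_id is not None and newest > previous_max_id:
        gap = range(previous_max_id + 1, newest)
        gap = gap[(len(new_post_ids) - limit):]  # last (limit - len) IDs
        # new_post_ids may contain edited, already-seen posts; merge and
        # de-duplicate before building the request.
        return list(set(gap) | set(new_post_ids))
    return list(new_post_ids)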
Example 29
    def make_api_call_for_site(self, site):
        self.queue_modify_lock.acquire()
        if site not in self.queue:
            GlobalVars.charcoal_hq.send_message("Attempted API call to {} but there are no posts to fetch.".format(site))
            self.queue_modify_lock.release()
            return
        posts = self.queue.pop(site)
        store_bodyfetcher_queue()
        self.queue_modify_lock.release()

        question_modifier = ""
        pagesize_modifier = ""

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            if self.last_activity_date != 0:
                pagesize = "50"
            else:
                pagesize = "25"

            pagesize_modifier = "&pagesize={pagesize}&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
        else:
            question_modifier = "/{0}".format(";".join(str(post) for post in posts))

        url = "http://api.stackexchange.com/2.2/questions{q_modifier}?site={site}&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw(({optional_min_query_param}".format(q_modifier=question_modifier, site=site, optional_min_query_param=pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
                GlobalVars.charcoal_hq.send_message("API quota rolled over with {0} requests remaining. Current quota: {1}.".format(GlobalVars.apiquota, response["quota_remaining"]))
                sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    api_quota_used_per_site += site_name.replace('.com', '').replace('.stackexchange', '') + ": {0}\n".format(str(quota_used))
                api_quota_used_per_site = api_quota_used_per_site.strip()
                GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
                clear_api_data()
            if response["quota_remaining"] == 0:
                GlobalVars.charcoal_hq.send_message("API reports no quota left!  May be a glitch.")
                GlobalVars.charcoal_hq.send_message(str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                GlobalVars.charcoal_hq.send_message("Restart: API quota is {quota}.".format(quota=response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {}.".format(response["error_message"])

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]
            match = regex.compile('/2.2/([^.]*)').search(url)
            url_part = match.group(1) if match else url
            message_hq += "\nBackoff received of {} seconds on request to `{}`".format(str(response["backoff"]), url_part)

        if len(message_hq) > 0:
            GlobalVars.charcoal_hq.send_message(message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                self.last_activity_date = items[0]["last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            post_score = post["score"]
            up_vote_count = post["up_vote_count"]
            down_vote_count = post["down_vote_count"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except KeyError:  # owner info may be missing, e.g. for deleted users
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False, owner_rep, post_score)
            if is_spam:
                try:
                    handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why, owner_rep, post_score, up_vote_count, down_vote_count, None)
                except Exception:
                    print("NOP")  # swallow the error so one bad post doesn't abort the scan
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    post_score = answer["score"]
                    up_vote_count = answer["up_vote_count"]
                    down_vote_count = answer["down_vote_count"]
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except KeyError:  # owner info may be missing, e.g. for deleted users
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0
                    print("got answer from owner with name " + owner_name)  # log after parsing, so this is the answer's owner

                    is_spam, reason, why = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False, owner_rep, post_score)
                    if is_spam:
                        try:
                            handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why, owner_rep, post_score, up_vote_count, down_vote_count, q_id)
                        except Exception:
                            print("NOP")  # swallow the error so one bad answer doesn't abort the scan
            except KeyError:
                print("no answers")  # the question has no "answers" key
        return
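Example #29's most reusable piece is its backoff handling: sleep out any backoff the API previously requested before issuing the call, then record any new backoff the response carries. A standalone sketch of that pattern, with a module-level api_backoff_time standing in for GlobalVars.api_backoff_time:

import time
import requests

api_backoff_time = 0.0  # epoch seconds before which we must not call the API

def backoff_get(url):
    global api_backoff_time
    # Sleep out any previously requested backoff, plus a small safety margin.
    if api_backoff_time > time.time():
        time.sleep(api_backoff_time - time.time() + 2)
    response = requests.get(url, timeout=20).json()
    # If this response carries a backoff, remember the furthest-out deadline.
    if "backoff" in response:
        api_backoff_time = max(api_backoff_time, time.time() + response["backoff"])
    return response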
Example #30
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)
        store_bodyfetcher_queue()

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            min_query = ""
            if self.last_activity_date != 0:
                min_query = "&min=" + str(self.last_activity_date)
                pagesize = "50"
            else:
                pagesize = "25"
            url = "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw((&pagesize=" + pagesize + min_query
        else:
            url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!4y_-sca-)pfAwlmP_1FxC6e5yzutRIcQvonAiP&key=IAkbitmze4B8KpacUfLqkw(("
        # wait to make sure API has/updates post data
        time.sleep(60)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        add_or_update_api_data(site)

        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 1000 and GlobalVars.apiquota >= 0:
                GlobalVars.charcoal_hq.send_message("API quota rolled over with {} requests remaining.".format(GlobalVars.apiquota))
                sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    api_quota_used_per_site += site_name.replace('.com', '').replace('.stackexchange', '') + ": " + str(quota_used) + "\n"
                api_quota_used_per_site = api_quota_used_per_site.strip()
                GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
                clear_api_data()

            elif response["quota_remaining"] == 0:
                GlobalVars.charcoal_hq.send_message("API reports no quota left!  May be a glitch.")
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            GlobalVars.charcoal_hq.send_message("The quota_remaining property was not in the API response.")

        if site == "stackoverflow.com":
            if len(response["items"]) > 0 and "last_activity_date" in response["items"][0]:
                self.last_activity_date = response["items"][0]["last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except KeyError:  # owner info may be missing, e.g. for deleted users
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False, owner_rep)
            if is_spam:
                try:
                    handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why)
                except Exception:
                    print("NOP")  # swallow the error so one bad post doesn't abort the scan
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except KeyError:  # owner info may be missing, e.g. for deleted users
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0
                    print("got answer from owner with name " + owner_name)  # log after parsing, so this is the answer's owner

                    is_spam, reason, why = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False, owner_rep)
                    if is_spam:
                        try:
                            handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why)
                        except Exception:
                            print("NOP")  # swallow the error so one bad answer doesn't abort the scan
            except KeyError:
                print("no answers")  # the question has no "answers" key
        return
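Unlike Example #29, this version pops the queue without taking queue_modify_lock first. A minimal sketch of the guarded check-and-pop, using a `with` block so the lock is released on every path, including the early return (queue and queue_modify_lock mirror the examples; the helper itself is hypothetical):

import threading

queue = {}
queue_modify_lock = threading.Lock()

def pop_site_posts(site):
    # Holding the lock across the whole check-and-pop avoids racing with the
    # producer that appends posts between the membership test and the pop.
    with queue_modify_lock:
        if site not in queue:
            return None
        return queue.pop(site)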
Example #31
    def make_api_call_for_site(self, site):
        posts = self.queue.pop(site)

        self.queue_store_lock.acquire()
        store_bodyfetcher_queue()
        self.queue_store_lock.release()

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            min_query = ""
            if self.last_activity_date != 0:
                min_query = "&min=" + str(self.last_activity_date)
                pagesize = "50"
            else:
                pagesize = "25"
            url = "http://api.stackexchange.com/2.2/questions?site=stackoverflow&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((&pagesize=" + pagesize + min_query
        else:
            url = "http://api.stackexchange.com/2.2/questions/" + ";".join(str(x) for x in posts) + "?site=" + site + "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw(("

        # wait to make sure API has/updates post data
        time.sleep(3)
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            response = requests.get(url, timeout=20).json()
        except requests.exceptions.Timeout:
            return  # could add some retrying logic here, but eh.

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            if response["quota_remaining"] - GlobalVars.apiquota >= 1000 and GlobalVars.apiquota >= 0:
                GlobalVars.charcoal_hq.send_message("API quota rolled over with {} requests remaining.".format(GlobalVars.apiquota))
                sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    api_quota_used_per_site += site_name.replace('.com', '').replace('.stackexchange', '') + ": " + str(quota_used) + "\n"
                api_quota_used_per_site = api_quota_used_per_site.strip()
                GlobalVars.charcoal_hq.send_message(api_quota_used_per_site, False)
                clear_api_data()
            if response["quota_remaining"] == 0:
                GlobalVars.charcoal_hq.send_message("API reports no quota left!  May be a glitch.")
                GlobalVars.charcoal_hq.send_message(str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                GlobalVars.charcoal_hq.send_message("Restart: API quota is {}.".format(response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq = message_hq + " Error: {}.".format(response["error_message"])

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]
            message_hq = message_hq + "\n" + "Backoff received of " + str(response["backoff"]) + " seconds."

        if len(message_hq) > 0:
            GlobalVars.charcoal_hq.send_message(message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            if len(response["items"]) > 0 and "last_activity_date" in response["items"][0]:
                self.last_activity_date = response["items"][0]["last_activity_date"]

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                continue
            title = GlobalVars.parser.unescape(post["title"])
            body = GlobalVars.parser.unescape(post["body"])
            link = post["link"]
            post_score = post["score"]
            up_vote_count = post["up_vote_count"]
            down_vote_count = post["down_vote_count"]
            try:
                owner_name = GlobalVars.parser.unescape(post["owner"]["display_name"])
                owner_link = post["owner"]["link"]
                owner_rep = post["owner"]["reputation"]
            except KeyError:  # owner info may be missing, e.g. for deleted users
                owner_name = ""
                owner_link = ""
                owner_rep = 0
            q_id = str(post["question_id"])

            is_spam, reason, why = check_if_spam(title, body, owner_name, owner_link, site, q_id, False, False, owner_rep, post_score)
            if is_spam:
                try:
                    handle_spam(title, body, owner_name, site, link, owner_link, q_id, reason, False, why, owner_rep, post_score, up_vote_count, down_vote_count)
                except Exception:
                    print("NOP")  # swallow the error so one bad post doesn't abort the scan
            try:
                for answer in post["answers"]:
                    answer_title = ""
                    body = answer["body"]
                    link = answer["link"]
                    a_id = str(answer["answer_id"])
                    post_score = answer["score"]
                    up_vote_count = answer["up_vote_count"]
                    down_vote_count = answer["down_vote_count"]
                    try:
                        owner_name = GlobalVars.parser.unescape(answer["owner"]["display_name"])
                        owner_link = answer["owner"]["link"]
                        owner_rep = answer["owner"]["reputation"]
                    except KeyError:  # owner info may be missing, e.g. for deleted users
                        owner_name = ""
                        owner_link = ""
                        owner_rep = 0
                    print("got answer from owner with name " + owner_name)  # log after parsing, so this is the answer's owner

                    is_spam, reason, why = check_if_spam(answer_title, body, owner_name, owner_link, site, a_id, True, False, owner_rep, post_score)
                    if is_spam:
                        try:
                            handle_spam(title, body, owner_name, site, link, owner_link, a_id, reason, True, why, owner_rep, post_score, up_vote_count, down_vote_count)
                        except Exception:
                            print("NOP")  # swallow the error so one bad answer doesn't abort the scan
            except KeyError:
                print("no answers")  # the question has no "answers" key
        return
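All three make_api_call_for_site variants detect the daily quota reset the same way: quota_remaining jumps well above the cached value from the previous response. A standalone sketch of that check, with a module variable standing in for GlobalVars.apiquota (the 1000-request threshold matches Examples #30 and #31):

cached_quota = -1  # -1 means "no quota seen yet", as right after a restart

def quota_rolled_over(quota_remaining, threshold=1000):
    global cached_quota
    # An upward jump of at least `threshold` over a valid cached value means
    # the daily quota reset since the last response we saw.
    rolled = cached_quota >= 0 and quota_remaining - cached_quota >= threshold
    cached_quota = quota_remaining
    return rolled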
Example #32
    def handle_websocket_data(data):
        if "message" not in data:
            return
        message = data['message']
        if not isinstance(message, Iterable):
            return

        if "message" in message:
            chatcommunicate.tell_rooms_with("metasmoke", message['message'])
        elif "autoflag_fp" in message:
            event = message["autoflag_fp"]

            chatcommunicate.tell_rooms(event["message"],
                                       ("debug", "site-" + event["site"]),
                                       ("no-site-" + event["site"], ),
                                       notify_site="/autoflag_fp")
        elif "exit" in message:
            os._exit(message["exit"])
        elif "blacklist" in message:
            ids = (message['blacklist']['uid'], message['blacklist']['site'])

            datahandling.add_blacklisted_user(ids, "metasmoke",
                                              message['blacklist']['post'])
            datahandling.last_feedbacked = (ids, time.time() + 60)
        elif "unblacklist" in message:
            ids = (message['unblacklist']['uid'],
                   message['unblacklist']['site'])
            datahandling.remove_blacklisted_user(ids)
        elif "naa" in message:
            post_site_id = parsing.fetch_post_id_and_site_from_url(
                message["naa"]["post_link"])
            datahandling.add_ignored_post(post_site_id[0:2])
        elif "fp" in message:
            post_site_id = parsing.fetch_post_id_and_site_from_url(
                message["fp"]["post_link"])
            datahandling.add_false_positive(post_site_id[0:2])
        elif "report" in message:
            import chatcommands  # imported here rather than at module top to avoid a circular import
            chatcommands.report_posts([message["report"]["post_link"]],
                                      "the metasmoke API", None,
                                      "the metasmoke API")
            return
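            # NOTE: everything below in this branch is unreachable; it is kept
            # but disabled in favour of the report_posts call above.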
            post_data = apigetpost.api_get_post(message["report"]["post_link"])
            if post_data is None or post_data is False:
                return
            if datahandling.has_already_been_posted(post_data.site, post_data.post_id, post_data.title) \
                    and not datahandling.is_false_positive((post_data.post_id, post_data.site)):
                return
            user = parsing.get_user_from_url(post_data.owner_url)
            post = classes.Post(api_response=post_data.as_dict)

            scan_spam, scan_reasons, scan_why = spamhandling.check_if_spam(
                post)
            if scan_spam:
                why_append = u"This post would have also been caught for: " + \
                    u", ".join(scan_reasons).capitalize() + "\n" + scan_why
            else:
                why_append = u"This post would not have been caught otherwise."

            # Add user to blacklist *after* post is scanned
            if user is not None:
                datahandling.add_blacklisted_user(user, "metasmoke",
                                                  post_data.post_url)

            why = u"Post manually reported by user *{}* from metasmoke.\n\n{}".format(
                message["report"]["user"], why_append)

            spamhandling.handle_spam(
                post=post,
                reasons=["Manually reported " + post_data.post_type],
                why=why)
        elif "deploy_updated" in message:
            return  # Disabled
            sha = message["deploy_updated"]["head_commit"]["id"]
            if sha != os.popen('git log -1 --pretty="%H"').read().strip():
                if "autopull" in message["deploy_updated"]["head_commit"][
                        "message"]:
                    if only_blacklists_changed(GitManager.get_remote_diff()):
                        commit_md = "[`{0}`](https://github.com/{1}/commit/{0})" \
                                    .format(sha[:7], GlobalVars.bot_repo_slug)
                        integrity = blacklist_integrity_check()
                        if len(integrity) == 0:  # No issues
                            GitManager.pull_remote()
                            findspam.reload_blacklists()
                            chatcommunicate.tell_rooms_with(
                                "debug",
                                "No code modified in {0}, only blacklists"
                                " reloaded.".format(commit_md))
                        else:
                            integrity.append("please fix before pulling.")
                            chatcommunicate.tell_rooms_with(
                                "debug", ", ".join(integrity))
        elif "commit_status" in message:
            c = message["commit_status"]
            sha = c["commit_sha"][:7]
            if c["commit_sha"] == sp.check_output(
                ["git", "log", "-1", "--pretty=%H"]).decode('utf-8').strip():
                return

            if c["status"] == "success":
                if "autopull" in c["commit_message"]:
                    s = "[CI]({ci_link}) on [`{commit_sha}`](https://github.com/{repo}/" \
                        "commit/{commit_sha}) succeeded. Message contains 'autopull', pulling...".format(
                            ci_link=c["ci_url"], repo=GlobalVars.bot_repo_slug, commit_sha=sha)
                    remote_diff = GitManager.get_remote_diff()
                    if only_blacklists_changed(remote_diff):
                        GitManager.pull_remote()
                        if not GlobalVars.on_master:
                            # Restart if HEAD detached
                            log('warning',
                                "Pulling remote with HEAD detached, checkout deploy",
                                f=True)
                            os._exit(8)
                        GlobalVars.reload()
                        findspam.FindSpam.reload_blacklists()
                        chatcommunicate.tell_rooms_with(
                            'debug', GlobalVars.s_norestart)
                    elif only_modules_changed(remote_diff):
                        GitManager.pull_remote()
                        if not GlobalVars.on_master:
                            # Restart if HEAD detached
                            log('warning',
                                "Pulling remote with HEAD detached, checkout deploy",
                                f=True)
                            os._exit(8)
                        GlobalVars.reload()
                        reload_modules()
                        chatcommunicate.tell_rooms_with(
                            'debug', GlobalVars.s_norestart2)
                    else:
                        chatcommunicate.tell_rooms_with('debug',
                                                        s,
                                                        notify_site="/ci")
                        os._exit(3)
                else:
                    s = "[CI]({ci_link}) on [`{commit_sha}`](https://github.com/{repo}/commit/{commit_sha}) " \
                        "succeeded.".format(ci_link=c["ci_url"], repo=GlobalVars.bot_repo_slug, commit_sha=sha)

                    chatcommunicate.tell_rooms_with("debug",
                                                    s,
                                                    notify_site="/ci")
            elif c["status"] == "failure":
                s = "[CI]({ci_link}) on [`{commit_sha}`](https://github.com/{repo}/commit/{commit_sha}) " \
                    "failed.".format(ci_link=c["ci_url"], repo=GlobalVars.bot_repo_slug, commit_sha=sha)

                chatcommunicate.tell_rooms_with("debug", s, notify_site="/ci")
        elif "everything_is_broken" in message:
            if message["everything_is_broken"] is True:
                os._exit(6)