def test_whitelisted_user():
    """Check check_if_spam() results for posts owned by whitelisted users.

    Two users are whitelisted, then four fake API-format posts are scanned and
    the expected verdict is asserted for each. The whitelist pickle is removed
    afterwards even when an assertion fails, so a failing run does not leave
    state behind for later tests.
    """
    user_url = 'http://stackoverflow.com/users/2/geoff-dalgas'
    user = get_user_from_url(user_url)
    add_whitelisted_user(user)
    try:
        user_url2 = 'http://stackoverflow.com/users/0/test'
        user2 = get_user_from_url(user_url2)
        add_whitelisted_user(user2)

        # Empty title, display name 'bagprada', first whitelisted user: expected clean.
        post = Post(api_response={'title': '', 'body': '',
                                  'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                                  'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
        is_spam, reason, _ = check_if_spam(post)
        assert is_spam is False

        # Title 'baba ji', empty display name, first whitelisted user: expected spam.
        post = Post(api_response={'title': 'baba ji', 'body': '',
                                  'owner': {'display_name': '', 'reputation': 1, 'link': user_url},
                                  'site': 'stackoverflow.com', 'question_id': '2', 'IsAnswer': False, 'score': 0})
        is_spam, reason, _ = check_if_spam(post)
        assert is_spam is True

        # Title 'baba ji' and display name 'bagprada', first whitelisted user: expected spam.
        post = Post(api_response={'title': 'baba ji', 'body': '',
                                  'owner': {'display_name': 'bagprada', 'reputation': 1, 'link': user_url},
                                  'site': 'stackoverflow.com', 'question_id': '3', 'IsAnswer': False, 'score': 0})
        is_spam, reason, _ = check_if_spam(post)
        assert is_spam is True

        # Title 'test', display name 'baba ji - muscle building', second whitelisted user: expected clean.
        post = Post(api_response={'title': 'test', 'body': '',
                                  'owner': {'display_name': 'baba ji - muscle building',
                                            'reputation': 1, 'link': user_url2},
                                  'site': 'stackoverflow.com', 'question_id': '0', 'IsAnswer': False, 'score': 0})
        is_spam, reason, _ = check_if_spam(post)
        assert is_spam is False
    finally:
        # cleanup (runs on assertion failure too)
        _remove_pickle("whitelistedUsers.p")
Beispiel #2
0
def check_blacklist(string_to_test, is_username, is_watchlist):
    """Return the detection reasons that already fire for ``string_to_test``.

    The string is placed either in the owner's display name (``is_username``
    true) or in the post body, once as a question and once as an answer, and
    both fake posts are run through ``FindSpam.test_post``.

    :param string_to_test: the text to check
    :param is_username: put the text in the display name instead of the body
    :param is_watchlist: when false, reasons containing
        "potentially bad keyword" are dropped from the result
    :return: list of de-duplicated reason strings
    """
    # Test the string and provide a warning message if it is already caught.
    if is_username:
        display_name, body = string_to_test, 'Valid body'
    else:
        display_name, body = "Valid username", string_to_test

    def build_post(as_answer):
        # Same fake post both times; only the IsAnswer flag differs.
        return Post(api_response={'title': 'Valid title', 'body': body,
                                  'owner': {'display_name': display_name, 'reputation': 1, 'link': ''},
                                  'site': "", 'IsAnswer': as_answer, 'score': 0})

    question = build_post(False)
    answer = build_post(True)

    question_reasons, _ = FindSpam.test_post(question)
    answer_reasons, _ = FindSpam.test_post(answer)

    # Filter out duplicates
    reasons = list(set(question_reasons) | set(answer_reasons))

    if is_watchlist:
        return reasons

    # Filter out watchlist results
    return [reason for reason in reasons if "potentially bad keyword" not in reason]
Beispiel #3
0
def updater(bot, domain, last_id):
    """Send every post of ``domain`` whose id is greater than ``last_id``.

    Posts returned by ``get_data(domain)`` are walked in reverse order. Each
    post with an id above ``last_id`` is wrapped in a ``Post``, generated,
    sent through ``send_post`` and recorded via ``update_parameter``, whose
    return value becomes the new ``last_id``. There is a 5 second pause after
    each send.

    :param bot: passed through to ``send_post``
    :param domain: source identifier passed to ``get_data``, ``Post``,
        ``send_post`` and ``update_parameter``
    :param last_id: id of the last post already handled
    :return: None
    """
    log.info('[VK] Проверка на наличие новых постов в {0} с последним ID {1}'.format(domain, last_id))
    posts = get_data(domain)
    # Track the last post visited explicitly: the loop variable itself is
    # never bound when `posts` is empty, which used to raise NameError below.
    last_seen = None
    for post in reversed(posts):
        last_seen = post
        if post['id'] > last_id:
            log.info("[VK] Обнаружен новый пост с ID {0}".format(post['id']))
            new_post = Post(post, domain)
            new_post.generate_post()
            send_post(bot, domain, new_post)
            last_id = update_parameter(domain, 'last_id', post['id'])
            time.sleep(5)
    if last_seen is not None and last_seen['id'] == last_id:
        log.info('[VK] Новых постов больше не обнаружено')
    log.info('[VK] Проверка завершена, last_id = {0}.'.format(last_id))
Beispiel #4
0
def delete_post(postid):
    """Delete the post identified by ``postid`` and redirect to the profile page.

    The post's user id, movie id and comment text are looked up first and used
    to build the ``Post`` object whose ``delete_post_from_db()`` does the
    deletion. If no row matches ``postid`` nothing is deleted; previously
    that case raised ``TypeError`` when indexing ``None``.

    :param postid: value matched against ``POSTS.POST_ID``
    :return: redirect response to ``page.profile_page``
    """
    with dbapi2._connect(current_app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = """SELECT u.ID, m.MOVIEID, COMMENTS FROM
                            USERS u INNER JOIN POSTS p ON (u.ID = p.USER_ID)
                            INNER JOIN MOVIES m ON (m.MOVIEID = p.MOVIE_ID)
                        WHERE (p.POST_ID = %s)"""

        cursor.execute(query, (postid, ))

        row = cursor.fetchone()
        # fetchone() yields None when the query matched no row.
        if row is not None:
            post_to_delete = Post(row[0], row[1], row[2])
            post_to_delete.delete_post_from_db()
        connection.commit()
        return redirect(url_for('page.profile_page'))
Beispiel #5
0
def test_regexes(title, body, username, site, body_is_summary, is_answer,
                 match):
    """Assert that FindSpam flags the described post exactly when ``match`` is true.

    A fake API-format post is built from the arguments and passed to
    ``FindSpam.test_post``. The post counts as spam when the first element of
    the returned value is non-empty.
    """
    # If we want to test answers separately, this should be changed
    # is_answer = False
    owner = {'display_name': username, 'reputation': 1, 'link': ''}
    api_response = {
        'title': title,
        'body': body,
        'owner': owner,
        'site': site,
        'question_id': '1',
        'IsAnswer': is_answer,
        'BodyIsSummary': body_is_summary,
        'score': 0,
    }
    post = Post(api_response=api_response)
    result = FindSpam.test_post(post)[0]
    log('info', title)
    log('info', "Result:", result)
    isspam = len(result) > 0
    if match != isspam:
        # Show the offending input before the assertion fails.
        print((body, match))
    assert match == isspam
 def mock_post(cls,
               title='',
               body='',
               site='stackoverflow.com',
               link='https://stackoverflow.com/q/1732454',
               owner={'link': 'https://stackoverflow.com/users/102937/robert-harvey'},
               post_id=1732454,
               is_question=True,
               increment_auto_post_id=True):
     """Build a Post from a minimal fake API response.

     When ``increment_auto_post_id`` is true, ``cls.auto_post_id`` is
     incremented and the new value replaces both ``post_id`` and ``link``
     (``/q/<id>`` for questions, ``/a/<id>`` for answers). The id is stored
     under ``question_id`` or ``answer_id`` depending on ``is_question``.
     """
     if increment_auto_post_id:
         cls.auto_post_id += 1
         post_id = cls.auto_post_id
         link = 'https://stackoverflow.com/{}/{}'.format('q' if is_question else 'a', post_id)
     id_key = 'question_id' if is_question else 'answer_id'
     api_response = {
         "title": title,
         "body": body,
         "site": site,
         "link": link,
         "owner": owner,
         id_key: post_id,
     }
     return Post(api_response=api_response)
Beispiel #7
0
    def new_post( self, user_id=None, date_post=None, status=None, posting=None, type=None, visibility=None ):
        """Create a UserPost with an attached Post, add it to the session and commit.

        ``user_id``, ``date_post`` and ``status`` are stored on the UserPost;
        ``posting``, ``type`` and ``visibility`` are stored on the Post, which
        is assigned to the UserPost's ``post`` attribute.

        :return: the committed UserPost
        """
        entry = UserPost()
        entry.id_user, entry.date_post, entry.status = user_id, date_post, status

        content = Post()
        content.post, content.type, content.visibility = posting, type, visibility

        entry.post = content

        session = self.session
        session.add( entry )
        session.commit()

        return entry
Beispiel #8
0
def test(content, alias_used="test"):
    """
    Report whether the given text would be automatically caught.

    Depending on the alias, the text is placed in the body of a question
    ("test-q"), the body of an answer ("test-a"), the username ("test-u"),
    the title ("test-t"), or in title, body and username at once (any other
    alias).
    :param content: The text to test
    :param alias_used: The command alias that selects where the text goes
    :return: A string
    """
    # alias -> (suffix of the "not caught" message, title, body, display name, IsAnswer)
    variants = {
        "test-q": (" question.", 'Valid title', content, "Valid username", False),
        "test-a": ("n answer.", 'Valid title', content, "Valid username", True),
        "test-u": (" username.", 'Valid title', "Valid question body", content, False),
        "test-t": (" title.", content, "Valid question body", "Valid username", False),
    }
    fallback = (" post, title or username.", content, content, content, False)
    kind, title, body, display_name, is_answer = variants.get(alias_used, fallback)

    fakepost = Post(api_response={'title': title, 'body': body,
                                  'owner': {'display_name': display_name, 'reputation': 1, 'link': ''},
                                  'site': "", 'IsAnswer': is_answer, 'score': 0})

    reasons, why_response = FindSpam.test_post(fakepost)

    prefix = "> "
    if len(reasons) == 0:
        return prefix + "Would not be caught as a{}".format(kind)

    result = prefix + ", ".join(reasons).capitalize()
    if why_response is not None and len(why_response) > 0:
        # Append the explanation below a separator line.
        result += "\n----------\n" + why_response
    return result
Beispiel #9
0
def admFeed():
    """Render the admin feed page and, on POST, store a new feed post.

    The current user is looked up in the ``usuarios`` collection with
    ``administrador == "S"``. If no such document exists a plain message is
    returned instead of the page. On POST a ``Post`` is filled from the form
    fields ``feedInTitulo`` and ``feedInMensagem``, timestamped with the
    current time and saved through ``salvarMongoDb``.

    :return: the rendered ``admFeed.html`` template, or an error string
    """
    usuarioAtivo = current_user.get_id()

    db = utilitariosDB.getDb()
    sideBar = Markup(getSideBar())

    usuario = db['usuarios'].find_one({
        "_id": ObjectId(usuarioAtivo),
        "administrador": "S"
    })
    if usuario is None:
        return "usuários não autorizado"

    navbar = Markup(getNavBar(usuario))

    if request.method == 'POST':
        post = Post()
        post.titulo = request.form.get("feedInTitulo", '')
        post.descricao = request.form.get("feedInMensagem", '')
        post.momento = datetime.now()

        # Save to the database
        db = utilitariosDB.getDb()
        post.salvarMongoDb(db)

    # GET and POST render the same page.
    return render_template("admFeed.html",
                           sideBarWS=sideBar,
                           navbarWS=navbar)
Beispiel #10
0
def get_user_posts(username: str, next_max_id: str = None) -> dict:
    """
    Get a list of posts

    :param username: the user
    :param next_max_id: used for requiring next posts
    :return: ``{"post_list": [Post, ...]}`` plus ``"next_max_id"`` when the
        response body contains one. When the request does not come back with
        status code 200, ``response["status"]`` is returned instead.
    """
    url = "user_posts"

    querystring = {"username": username}

    if next_max_id is not None:
        querystring["next_max_id"] = next_max_id

    response = _request(url, json.dumps(querystring))

    # Only a successful response is parsed. The previous check (`!= 404`)
    # was inverted: it returned early on success and tried to parse the body
    # of a 404.
    if response["statusCode"] != 200:
        return response["status"]

    posts_json = response["body"]["items"]

    posts = []

    for post_json in posts_json:
        try:
            source = post_json["video_versions"][0]["url"]
            views = post_json["view_count"]
            is_video = True
        except (KeyError, IndexError, TypeError):
            # No usable video data: treat the item as an image post.
            source = post_json["image_versions2"]["candidates"][0]["url"]
            views = None
            is_video = False

        caption = post_json["caption"]["text"] if post_json["caption"] is not None else ""

        taken_at = datetime.fromtimestamp(post_json["taken_at"])

        new_post = Post(
            post_json["code"],
            source,
            post_json["like_count"],
            post_json["comment_count"],
            caption,
            is_video,
            views,
            taken_at
        )

        posts.append(new_post)

    to_return = {"post_list": posts}

    if "next_max_id" in response["body"]:
        to_return["next_max_id"] = response["body"]["next_max_id"]

    return to_return
Beispiel #11
0
def test_post_parse_errors():
    """Post() without any initializer must raise PostParseError."""
    from classes import Post, PostParseError
    failure = None
    try:
        failure = Post()
    except PostParseError:
        pass
    else:
        # Raise explicitly instead of asserting `'text' is False`: the identity
        # test on a literal triggers a SyntaxWarning and hides the message.
        raise AssertionError('Post with no initializer did not fail')
    assert failure is None
Beispiel #12
0
def test_post_parse_errors():
    """Constructing Post() with no arguments is expected to raise PostParseError."""
    from classes import Post, PostParseError
    failure = None
    try:
        failure = Post()
    except PostParseError:
        # Expected outcome: nothing was assigned to `failure`.
        pass
    else:
        # Reached only when no exception was raised; the comparison is always false.
        assert 'Post with no initializer did not fail.' == 'An exception should have been generated and caught.'
    assert failure is None
Beispiel #13
0
def check_if_spam_json(json_data):
    """Parse ``json_data`` into a Post and run check_if_spam() on it.

    :param json_data: passed to ``Post(json_data=...)``
    :return: ``(is_spam, reason, why)`` from ``check_if_spam``; when the data
        cannot be parsed the error is logged and ``(False, '', '')`` is returned
    """
    try:
        post = Post(json_data=json_data)
    except PostParseError as err:
        message = 'Parse error {0} when parsing json_data {1!r}'.format(
            err, json_data)
        log('error', message)
        return False, '', ''
    else:
        is_spam, reason, why = check_if_spam(post)
        return is_spam, reason, why
Beispiel #14
0
def feedInsert():
    """Show the feed insert form, or on POST store the submitted post.

    On POST a ``Post`` is filled from the form fields ``feedInTitulo``,
    ``feedInMensagem`` and ``fileBase64`` (media type fixed to "imgB64"),
    timestamped, saved through ``salvarMongoDb`` and the client is redirected
    to ``/feed``.
    """
    if request.method != 'POST':
        return render_template("feedInsert.html")

    form = request.form
    post = Post()
    post.titulo = form.get("feedInTitulo", '')
    post.descricao = form.get("feedInMensagem", '')
    post.midiaTipo = "imgB64"
    post.midia = form.get("fileBase64", '')
    post.momento = datetime.now()

    # Save to the database
    database = utilitariosDB.getDb()
    post.salvarMongoDb(database)

    return redirect("/feed")
def test_blacklisted_user():
    """A post owned by a blacklisted user is expected to be reported as spam.

    The blacklist pickle is removed afterwards even when the assertion fails,
    so a failing run does not leave state behind for later tests.
    """
    user_url = 'http://stackoverflow.com/users/1/jeff-atwood'
    user = get_user_from_url(user_url)
    add_blacklisted_user(user, "", "")
    try:
        # Construct a "fake" post object in API-format
        post = Post(api_response={'title': '', 'body': '',
                                  'owner': {'display_name': user, 'reputation': 1, 'link': user_url},
                                  'site': 'stackoverflow.com', 'question_id': '1', 'IsAnswer': False, 'score': 0})
        is_spam, reason, _ = check_if_spam(post)
        assert is_spam is True
    finally:
        # cleanup (runs on assertion failure too)
        _remove_pickle("blacklistedUsers.p")
Beispiel #16
0
def test_findspam(title, body, username, site, body_is_summary, is_answer, expected_spam):
    """Assert that FindSpam's verdict for the described post equals ``expected_spam``.

    The post counts as spam when the first element returned by
    ``FindSpam.test_post`` is non-empty.
    """
    owner = {'display_name': username, 'reputation': 1, 'link': ''}
    api_response = {'title': title, 'body': body,
                    'owner': owner,
                    'site': site, 'question_id': '1', 'IsAnswer': is_answer,
                    'BodyIsSummary': body_is_summary, 'score': 0}
    post = Post(api_response=api_response)
    result = FindSpam.test_post(post)[0]
    log('info', title)
    log('info', "Result:", result)
    scan_spam = len(result) > 0
    if scan_spam != expected_spam:
        # Print the mismatching input before the assertion fails.
        print("Expected {1} on {0}".format(body, expected_spam))
    assert scan_spam == expected_spam
Beispiel #17
0
def mock_post(
        title='',
        body='',
        site='stackoverflow.com',
        link='https://stackoverflow.com/a/1732454',
        owner={'link':
               'https://stackoverflow.com/users/102937/robert-harvey'}):
    """Build a Post from a fake API response holding just the given five fields."""
    return Post(api_response=dict(title=title, body=body, site=site, link=link, owner=owner))
def test_check_if_spam(title, body, username, site, match):
    """Assert that check_if_spam() returns ``match`` for a post built from JSON data.

    The inner post dict is serialized into the ``data`` field of an outer
    dict, which is serialized again and handed to ``Post(json_data=...)``.
    """
    # We can't check blacklists/whitelists in tests, so these are set to their default values
    inner_payload = json.dumps({
        "titleEncodedFancy": str(title),
        "bodySummary": str(body),
        "ownerDisplayName": str(username),
        "url": "TEST: No URL passed!",
        "id": "TEST: No ID passed!",
        "siteBaseHostAddress": str(site),
        "ownerUrl": "TEST: No Owner ID passed!"
    })
    envelope = {
        "action": "155-questions-active",
        'data': inner_payload,
        'IsAnswer': False  # If we want to test answers separately, this should be changed.
    }
    post = Post(json_data=json.dumps(envelope))
    is_spam, _reason, _ = check_if_spam(post)
    assert match == is_spam
Beispiel #19
0
def movies_page():
    """Show the movies form (GET) or record a rating and comment (POST).

    POST flow:

    * the score must be an integer from 1 to 10, otherwise the user is sent
      back to the form (a non-numeric score is now handled the same way
      instead of raising ``ValueError``);
    * anonymous users are sent to the login page;
    * if the movie is not in the database it is verified through
      ``verify_movie_from_api()``, added, linked to the user and the comment
      is saved;
    * if the movie exists and the user already rated it, the rating and the
      movie's average are updated (same score: nothing changes);
    * if the movie exists and the user has not rated it, the pair is added,
      the average and vote count are updated and the comment is saved.

    :return: a rendered template or a redirect response
    """
    if request.method != "POST":
        if current_user.get_id() is not None:
            return render_template('movies.html')
        flash("Please log in to MovieShake")
        return redirect(url_for('page.login_page'))

    movie = Movie(request.form['title'].title(), "", "", "", "")
    score = request.form['score']
    comments = request.form['comment']

    # Form input is untrusted: a value that is not an integer is rejected the
    # same way as an out-of-range one.
    try:
        score_value = int(score)
    except ValueError:
        score_value = None
    if score_value is None or score_value < 1 or score_value > 10:
        flash("Your rating to the movie should be between 1 and 10.")
        return redirect(url_for('page.movies_page'))

    # checks if user is logged in
    if current_user.get_id() is None:
        flash("Please log in to MovieShake")
        return redirect(url_for('page.login_page'))

    # Look the movie up once; -1 means it is not in the database.
    movieId = movie.search_movie_in_db()

    if movieId == -1:
        # Verify once; -1 means the lookup found nothing.
        movieToAdd = movie.verify_movie_from_api()
        if movieToAdd == -1:
            flash("There is no such movie")
            return redirect(url_for('page.home_page'))

        movieToAdd.score = score
        movieToAdd.add_movie_to_db()

        flash(
            movieToAdd.title + " (" + movieToAdd.year +
            ") is added to your watched list and your post has been saved."
        )

        movieId = movieToAdd.search_movie_in_db()
        userMoviePair = WatchedList(current_user.username, movieId, score)
        userMoviePair.add_movie_user_pair()

        post = Post(current_user.get_user_id(), movieId, comments)
        post.add_post_to_db()

        return redirect(url_for('page.home_page'))

    userMoviePair = WatchedList(current_user.username, movieId, score)
    oldscore = userMoviePair.existsInWatchedList()

    if oldscore != -1:
        # The user rated this movie before.
        oldscore = oldscore[0]
        if int(oldscore) == score_value:
            flash("You have already added " + movie.title + ".")
            return redirect(url_for('page.home_page'))

        userMoviePair.updateScoreOfWatchedMovie()

        oldScoreMoviesTable = int(movie.getscore_in_movie_db(movieId)[0])
        totalVotes = int(movie.getvotes_in_movie_db(movieId)[0])

        # Replace the user's old score in the average; vote count is unchanged.
        newscore = ((oldScoreMoviesTable * totalVotes) -
                    int(oldscore) + score_value) / (totalVotes)

        movie.update_votes_and_score(movieId, newscore, totalVotes)

        flash("You score to " + movie.title +
              " is updated as " + score + ".")
        return redirect(url_for('page.home_page'))

    # First rating of this movie by this user.
    userMoviePair.add_movie_user_pair()

    # score and vote need to be updated on movies table
    oldscore = int(movie.getscore_in_movie_db(movieId)[0])
    totalVotes = int(movie.getvotes_in_movie_db(movieId)[0])

    newscore = ((oldscore * totalVotes) + score_value) / (totalVotes + 1)
    totalVotes = totalVotes + 1

    movie.update_votes_and_score(movieId, newscore, totalVotes)

    post = Post(current_user.get_user_id(), movieId, comments)
    post.add_post_to_db()

    flash(
        movie.title +
        " is added to your watched list and your post has been saved."
    )
    return redirect(url_for('page.home_page'))
Beispiel #20
0
from classes import Post, Collection
from selenium import webdriver

# Open browser
driver = webdriver.Chrome()

# # Create sample post
# post = Post(65313)
# post.get_text(driver)
# print(post.text)

try:
    # Create collection
    collection = Collection()
    # Walk the tags from 65799 down to 0, fetching each post's text.
    for tag in range(65799, -1, -1):
        post = Post(tag)
        post.get_text(driver)
        print("Tag:", tag, '-', post.has_text())
        collection.insert(post)

        if tag % 100 == 0:
            # Save collection: one CSV and one XLSX file per block of 100 tags,
            # then start a fresh collection for the next block.
            collection.to_df()
            collection.to_csv('collection' + str(tag // 100) + '.csv')
            collection.to_excel('collection' + str(tag // 100) + '.xlsx')
            collection = Collection()
finally:
    # Close browser, also when the loop above fails part-way through.
    driver.close()
Beispiel #21
0
def add_post(post_ch, post_txt, post_user, post_time):
    """Build a Post from the given fields and broadcast it as an "add_new_post" event."""
    created = Post(post_ch, post_txt, post_user, post_time)
    payload = created.get_post_dict(post_ch)
    emit("add_new_post", payload, broadcast=True)
Beispiel #22
0
    def make_api_call_for_site(self, site):
        """
        Fetch the queued posts of one site from the Stack Exchange API and scan them for spam.

        Steps, in order: pop the site's entries from ``self.queue``; record how long
        each entry waited in ``self.queue_timings``; work out which question IDs to
        request (queued IDs plus IDs between the previous maximum and the new
        maximum); issue the API request while holding ``GlobalVars.api_request_lock``;
        update quota and backoff bookkeeping; then run ``check_if_spam`` on every
        returned question and each of its answers, calling ``handle_spam`` for hits.
        If the request itself fails, the popped entries are put back into the queue.

        :param site: key of ``self.queue`` whose queued posts should be fetched
        :return: None
        """
        if site not in self.queue:
            return

        self.queue_modify_lock.acquire()
        new_posts = self.queue.pop(site)
        store_bodyfetcher_queue()
        self.queue_modify_lock.release()

        new_post_ids = [int(k) for k, v in new_posts.items()]

        if GlobalVars.flovis is not None:
            for post_id in new_post_ids:
                GlobalVars.flovis.stage('bodyfetcher/api_request', site, post_id,
                                        {'queue':
                                         dict([[sk, [k for k, v in sq.items()]] for sk, sq in self.queue.items()]),
                                         'site': site, 'posts': [k for k, v in new_posts.items()]})

        self.queue_timing_modify_lock.acquire()
        post_add_times = [v for k, v in new_posts.items()]
        pop_time = datetime.utcnow()

        for add_time in post_add_times:
            try:
                seconds_in_queue = (pop_time - add_time).total_seconds()
                if site in self.queue_timings:
                    self.queue_timings[site].append(seconds_in_queue)
                else:
                    self.queue_timings[site] = [seconds_in_queue]
            except Exception:
                continue  # Skip to next item if we've got invalid data or missing values.

        store_queue_timings()

        self.queue_timing_modify_lock.release()
        self.max_ids_modify_lock.acquire()

        if site in self.previous_max_ids and max(new_post_ids) > self.previous_max_ids[site]:
            previous_max_id = self.previous_max_ids[site]
            intermediate_posts = range(previous_max_id + 1, max(new_post_ids))

            # We don't want to go over the 100-post API cutoff, so take the last
            # (100-len(new_post_ids)) from intermediate_posts. "The last N" needs a
            # negative slice start; N == 0 is handled separately because [-0:] would
            # keep everything.
            room_for_intermediate = max(0, 100 - len(new_post_ids))
            if room_for_intermediate:
                intermediate_posts = intermediate_posts[-room_for_intermediate:]
            else:
                intermediate_posts = []

            # new_post_ids could contain edited posts, so merge it back in
            combined = chain(intermediate_posts, new_post_ids)

            # Could be duplicates, so uniquify
            posts = list(set(combined))
        else:
            posts = new_post_ids

        try:
            if max(new_post_ids) > self.previous_max_ids[site]:
                self.previous_max_ids[site] = max(new_post_ids)
                store_bodyfetcher_max_ids()
        except KeyError:
            # First time this site is seen: no previous maximum recorded yet.
            self.previous_max_ids[site] = max(new_post_ids)
            store_bodyfetcher_max_ids()

        self.max_ids_modify_lock.release()

        log('debug', "New IDs / Hybrid Intermediate IDs for {0}:".format(site))
        log('debug', sorted(new_post_ids))
        log('debug', sorted(posts))

        question_modifier = ""
        pagesize_modifier = ""

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            if self.last_activity_date != 0:
                pagesize = "50"
            else:
                pagesize = "25"

            pagesize_modifier = "&pagesize={pagesize}" \
                                "&min={time_length}".format(pagesize=pagesize, time_length=str(self.last_activity_date))
        else:
            question_modifier = "/{0}".format(";".join(str(post) for post in posts))

        url = "https://api.stackexchange.com/2.2/questions{q_modifier}?site={site}" \
              "&filter=!)E0g*ODaEZ(SgULQhYvCYbu09*ss(bKFdnTrGmGUxnqPptuHP&key=IAkbitmze4B8KpacUfLqkw((" \
              "{optional_min_query_param}".format(q_modifier=question_modifier, site=site,
                                                  optional_min_query_param=pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        GlobalVars.api_request_lock.acquire()
        # Respect backoff, if we were given one
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        try:
            time_request_made = datetime.now().strftime('%H:%M:%S')
            response = requests.get(url, timeout=20).json()
        except Exception:
            # Any failure in the request being made (timeout or otherwise) should be added back to
            # the queue.
            self.queue_modify_lock.acquire()
            if site in self.queue:
                self.queue[site].update(new_posts)
            else:
                self.queue[site] = new_posts
            self.queue_modify_lock.release()
            GlobalVars.api_request_lock.release()
            return

        self.api_data_lock.acquire()
        add_or_update_api_data(site)
        self.api_data_lock.release()

        message_hq = ""
        if "quota_remaining" in response:
            # A jump of at least 5000 over the last known quota is treated as a rollover.
            if response["quota_remaining"] - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0:
                tell_rooms_with("debug", "API quota rolled over with {0} requests remaining. "
                                         "Current quota: {1}.".format(GlobalVars.apiquota,
                                                                      response["quota_remaining"]))

                sorted_calls_per_site = sorted(GlobalVars.api_calls_per_site.items(), key=itemgetter(1), reverse=True)
                api_quota_used_per_site = ""
                for site_name, quota_used in sorted_calls_per_site:
                    sanatized_site_name = site_name.replace('.com', '').replace('.stackexchange', '')
                    api_quota_used_per_site += sanatized_site_name + ": {0}\n".format(str(quota_used))
                api_quota_used_per_site = api_quota_used_per_site.strip()

                tell_rooms_with("debug", api_quota_used_per_site)
                clear_api_data()
            if response["quota_remaining"] == 0:
                tell_rooms_with("debug", "API reports no quota left!  May be a glitch.")
                tell_rooms_with("debug", str(response))  # No code format for now?
            if GlobalVars.apiquota == -1:
                tell_rooms_with("debug", "Restart: API quota is {quota}."
                                         .format(quota=response["quota_remaining"]))
            GlobalVars.apiquota = response["quota_remaining"]
        else:
            message_hq = "The quota_remaining property was not in the API response."

        if "error_message" in response:
            message_hq += " Error: {} at {} UTC.".format(response["error_message"], time_request_made)
            if "error_id" in response and response["error_id"] == 502:
                if GlobalVars.api_backoff_time < time.time() + 12:  # Add a backoff of 10 + 2 seconds as a default
                    GlobalVars.api_backoff_time = time.time() + 12
            # NOTE(review): this sentence is appended for every error, not only for
            # error_id 502 where the backoff is actually set - confirm that is intended.
            message_hq += " Backing off on requests for the next 12 seconds."
            message_hq += " Previous URL: `{}`".format(url)

        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]

        GlobalVars.api_request_lock.release()

        if len(message_hq) > 0:
            tell_rooms_with("debug", message_hq.strip())

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            if "title" not in post or "body" not in post:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/no_content', site, post['question_id'], post)
                continue

            post['site'] = site
            try:
                post_ = Post(api_response=post)
            except PostParseError as err:
                # Log the raw API dict: `post_` is not (re)bound when Post() raises, so
                # referencing it here was a NameError or showed the previous post.
                log('error', 'Error {0} when parsing post: {1!r}'.format(err, post))
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage('bodyfetcher/api_response/error', site, post['question_id'], post)
                continue

            num_scanned += 1

            is_spam, reason, why = check_if_spam(post_)

            if is_spam:
                try:
                    if GlobalVars.flovis is not None and 'question_id' in post:
                        GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, post['question_id'],
                                                {'post': post, 'check_if_spam': [is_spam, reason, why]})
                    handle_spam(post=post_,
                                reasons=reason,
                                why=why)
                except Exception as e:
                    log('error', "Exception in handle_spam:", e)
            elif GlobalVars.flovis is not None and 'question_id' in post:
                GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, post['question_id'],
                                        {'post': post, 'check_if_spam': [is_spam, reason, why]})

            try:
                if "answers" not in post:
                    pass
                else:
                    for answer in post["answers"]:
                        num_scanned += 1
                        answer["IsAnswer"] = True  # Necessary for Post object
                        answer["title"] = ""  # Necessary for proper Post object creation
                        answer["site"] = site  # Necessary for proper Post object creation
                        answer_ = Post(api_response=answer, parent=post_)

                        is_spam, reason, why = check_if_spam(answer_)
                        if is_spam:
                            try:
                                if GlobalVars.flovis is not None and 'answer_id' in answer:
                                    GlobalVars.flovis.stage('bodyfetcher/api_response/spam', site, answer['answer_id'],
                                                            {'post': answer, 'check_if_spam': [is_spam, reason, why]})
                                handle_spam(answer_,
                                            reasons=reason,
                                            why=why)
                            except Exception as e:
                                log('error', "Exception in handle_spam:", e)
                        elif GlobalVars.flovis is not None and 'answer_id' in answer:
                            GlobalVars.flovis.stage('bodyfetcher/api_response/not_spam', site, answer['answer_id'],
                                                    {'post': answer, 'check_if_spam': [is_spam, reason, why]})

            except Exception as e:
                log('error', "Exception handling answers:", e)

        end_time = time.time()
        GlobalVars.posts_scan_stats_lock.acquire()
        GlobalVars.num_posts_scanned += num_scanned
        GlobalVars.post_scan_time += end_time - start_time
        GlobalVars.posts_scan_stats_lock.release()
        return
Beispiel #23
0
def _allspam_api_get(request_url):
    """
    GET a Stack Exchange API URL while honouring the shared API backoff.

    The request is made while holding GlobalVars.api_request_lock. The lock is released
    even when the request or the JSON decoding raises, so a failed call cannot leave
    the lock held.
    :param request_url: Complete API URL to fetch
    :return: The decoded JSON body of the response
    """
    with GlobalVars.api_request_lock:
        # Respect backoffs etc
        if GlobalVars.api_backoff_time > time.time():
            time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
        response = requests.get(request_url).json()
        # Only ever move the backoff deadline later, never earlier
        if "backoff" in response:
            if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                GlobalVars.api_backoff_time = time.time() + response["backoff"]
    return response


def allspam(msg, url):
    """
    Reports all of a user's posts as spam
    :param msg: The chat message that issued the command; its owner, room, id and client host are read
    :param url: A user profile URL
    :return: None
    :raises CmdException: if the caller is rate-limited, the URL is not a user URL, the user does not
                          exist or has no posts, or one of the sanity limits is exceeded (more
                          associated accounts than one API page, reputation above 100, more than 15 posts)
    """
    crn, wait = can_report_now(msg.owner.id, msg._client.host)
    if not crn:
        raise CmdException("You can execute the !!/allspam command again in {} seconds. "
                           "To avoid one user sending lots of reports in a few commands and "
                           "slowing SmokeDetector down due to rate-limiting, you have to "
                           "wait 30 seconds after you've reported multiple posts in "
                           "one go.".format(wait))
    user = get_user_from_url(url)
    if user is None:
        raise CmdException("That doesn't look like a valid user URL.")
    user_sites = []
    user_posts = []
    # Detect whether link is to network profile or site profile
    if user[1] == 'stackexchange.com':
        # Fetch sites
        api_filter = "!6Pbp)--cWmv(1"
        request_url = "http://api.stackexchange.com/2.2/users/{}/associated?filter={}&key=IAkbitmze4B8KpacUfLqkw((" \
            .format(user[0], api_filter)
        res = _allspam_api_get(request_url)
        if 'items' not in res or len(res['items']) == 0:
            raise CmdException("The specified user does not appear to exist.")
        if res['has_more']:
            raise CmdException("The specified user has an abnormally high number of accounts. Please consider flagging "
                               "for moderator attention, otherwise use !!/report on the user's posts individually.")
        # Add accounts with posts
        for site in res['items']:
            if site['question_count'] > 0 or site['answer_count'] > 0:
                user_sites.append((site['user_id'], get_api_sitename_from_url(site['site_url'])))
    else:
        user_sites.append((user[0], get_api_sitename_from_url(user[1])))
    # Fetch posts
    for u_id, u_site in user_sites:
        api_filter = "!)Q4RrMH0DC96Y4g9yVzuwUrW"
        request_url = "http://api.stackexchange.com/2.2/users/{}/posts?site={}&filter={}&key=IAkbitmze4B8KpacUfLqkw((" \
            .format(u_id, u_site, api_filter)
        res = _allspam_api_get(request_url)
        if 'items' not in res or len(res['items']) == 0:
            raise CmdException("The specified user has no posts on this site.")
        posts = res['items']
        if posts[0]['owner']['reputation'] > 100:
            raise CmdException("The specified user's reputation is abnormally high. Please consider flagging for "
                               "moderator attention, otherwise use !!/report on the posts individually.")
        # Add blacklisted user, taking the link from the lowest-scored post.
        # NOTE(review): the value passed is that post's *owner* link, not the post's own link - confirm intent.
        message_url = "https://chat.{}/transcript/{}?m={}".format(msg._client.host, msg.room.id, msg.id)
        add_blacklisted_user(user, message_url, min(posts, key=lambda x: x['score'])['owner']['link'])
        # TODO: Postdata refactor, figure out a better way to use apigetpost
        for post in posts:
            shortlink = url_to_shortlink(post['link'])
            post_data = PostData()
            post_data.post_id = post['post_id']
            post_data.post_url = shortlink
            *discard, post_data.site, post_data.post_type = fetch_post_id_and_site_from_url(shortlink)
            post_data.title = unescape(post['title'])
            post_data.owner_name = unescape(post['owner']['display_name'])
            post_data.owner_url = post['owner']['link']
            post_data.owner_rep = post['owner']['reputation']
            post_data.body = post['body']
            post_data.score = post['score']
            post_data.up_vote_count = post['up_vote_count']
            post_data.down_vote_count = post['down_vote_count']
            if post_data.post_type == "answer":
                # Annoyingly we have to make another request to get the question ID, since it is only returned by the
                # /answers route
                answer_filter = "!*Jxb9s5EOrE51WK*"
                req_url = "http://api.stackexchange.com/2.2/answers/{}?site={}&filter={}&key=IAkbitmze4B8KpacUfLqkw((" \
                    .format(post['post_id'], u_site, answer_filter)
                # The helper applies any backoff carried by *this* response
                answer_res = _allspam_api_get(req_url)
                # Finally, set the attribute
                post_data.question_id = answer_res['items'][0]['question_id']
                post_data.is_answer = True
            user_posts.append(post_data)
    if len(user_posts) == 0:
        raise CmdException("The specified user hasn't posted anything.")
    if len(user_posts) > 15:
        raise CmdException("The specified user has an abnormally high number of spam posts. Please consider flagging "
                           "for moderator attention, otherwise use !!/report on the posts individually.")
    why_info = u"User manually reported by *{}* in room *{}*.\n".format(msg.owner.name, msg.room.name)
    # Handle all posts
    for index, post in enumerate(user_posts, start=1):
        batch = ""
        if len(user_posts) > 1:
            batch = " (batch report: post {} out of {})".format(index, len(user_posts))
        handle_spam(post=Post(api_response=post.as_dict),
                    reasons=["Manually reported " + post.post_type + batch],
                    why=why_info)
        time.sleep(2)  # Should this be implemented differently?
    # Reporting more than two posts in one go triggers the multiple-reporter cooldown
    if len(user_posts) > 2:
        add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())
    def make_api_call_for_site(self, site):
        """
        Pop the queued posts for ``site``, fetch them from the SE API and scan what comes back.

        Steps, in order:
          * remove the site's entry from self.queue (returns immediately if there is none);
          * record queue timing data and update self.previous_max_ids;
          * request the questions (by ID, or by recent activity for stackoverflow.com) while holding
            GlobalVars.api_request_lock; if the request fails the popped posts are put back in the queue;
          * update the stored API quota and backoff, sending any diagnostics with tell_rooms_with("debug", ...);
          * run check_if_spam on every returned question and answer that was not recently scanned
            unchanged, calling handle_spam for those flagged;
          * add the number of scanned posts and the scan time to GlobalVars.PostScanStat.

        :param site: Key into self.queue; also sent as the ``site`` parameter of the API request
        :return: None
        """
        with self.queue_lock:
            new_posts = self.queue.pop(site, None)
        if new_posts is None:
            # site was not in the queue
            return
        Tasks.do(store_bodyfetcher_queue)

        new_post_ids = [int(k) for k in new_posts.keys()]

        if GlobalVars.flovis is not None:
            for post_id in new_post_ids:
                GlobalVars.flovis.stage('bodyfetcher/api_request', site,
                                        post_id, {
                                            'site': site,
                                            'posts': list(new_posts.keys())
                                        })

        # Add queue timing data
        pop_time = datetime.utcnow()
        post_add_times = [(pop_time - added).total_seconds()
                          for added in new_posts.values()]
        Tasks.do(add_queue_timing_data, site, post_add_times)

        store_max_ids = False
        with self.max_ids_modify_lock:
            new_post_ids_max = max(new_post_ids)
            if site in self.previous_max_ids and new_post_ids_max > self.previous_max_ids[site]:
                previous_max_id = self.previous_max_ids[site]
                intermediate_posts = range(previous_max_id + 1, new_post_ids_max)

                # We don't want to go over the 100-post API cutoff, so take the last
                # (100-len(new_post_ids)) from intermediate_posts.
                # A plain [-room_left:] slice is wrong when no room is left: [-0:] is the *whole*
                # range and a negative room_left slices from the front, so handle that case explicitly.
                room_left = 100 - len(new_post_ids)
                if room_left > 0:
                    intermediate_posts = intermediate_posts[-room_left:]
                else:
                    intermediate_posts = []

                # new_post_ids could contain edited posts, so merge it back in
                combined = chain(intermediate_posts, new_post_ids)

                # Could be duplicates, so uniquify
                posts = list(set(combined))
            else:
                posts = new_post_ids

            if new_post_ids_max > self.previous_max_ids.get(site, 0):
                self.previous_max_ids[site] = new_post_ids_max
                store_max_ids = True

        if store_max_ids:
            schedule_store_bodyfetcher_max_ids()

        log('debug', "New IDs / Hybrid Intermediate IDs for {}:".format(site))
        if len(new_post_ids) > 30:
            log(
                'debug', "{} +{} more".format(
                    sorted(new_post_ids)[:30],
                    len(new_post_ids) - 30))
        else:
            log('debug', sorted(new_post_ids))
        if len(new_post_ids) == len(posts):
            log('debug', "[ *Identical* ]")
        elif len(posts) > 30:
            log('debug',
                "{} +{} more".format(sorted(posts)[:30],
                                     len(posts) - 30))
        else:
            log('debug', sorted(posts))

        question_modifier = ""
        pagesize_modifier = {}

        if site == "stackoverflow.com":
            # Not all SO questions are shown in the realtime feed. We now
            # fetch all recently modified SO questions to work around that.
            with self.last_activity_date_lock:
                if self.last_activity_date != 0:
                    pagesize = "100"
                else:
                    pagesize = "50"

                pagesize_modifier = {
                    'pagesize': pagesize,
                    'min': str(self.last_activity_date -
                               self.ACTIVITY_DATE_EXTRA_EARLIER_MS_TO_FETCH)
                }
        else:
            question_modifier = "/{0}".format(";".join(
                [str(post) for post in posts]))

        url = "https://api.stackexchange.com/2.2/questions{}".format(
            question_modifier)
        params = {
            'filter':
            '!1rs)sUKylwB)8isvCRk.xNu71LnaxjnPS12*pX*CEOKbPFwVFdHNxiMa7GIVgzDAwMa',
            'key': 'IAkbitmze4B8KpacUfLqkw((',
            'site': site
        }
        params.update(pagesize_modifier)

        # wait to make sure API has/updates post data
        time.sleep(3)

        with GlobalVars.api_request_lock:
            # Respect backoff, if we were given one
            if GlobalVars.api_backoff_time > time.time():
                time.sleep(GlobalVars.api_backoff_time - time.time() + 2)
            try:
                time_request_made = datetime.utcnow().strftime('%H:%M:%S')
                response = requests.get(url, params=params, timeout=20).json()
            except Exception:
                # Any failure in the request being made (timeout or otherwise) should be added back to
                # the queue.
                with self.queue_lock:
                    if site in self.queue:
                        self.queue[site].update(new_posts)
                    else:
                        self.queue[site] = new_posts
                return

            with self.api_data_lock:
                add_or_update_api_data(site)

            message_hq = ""
            with GlobalVars.apiquota_rw_lock:
                if "quota_remaining" in response:
                    quota_remaining = response["quota_remaining"]
                    # A jump of at least 5000 to a value above 39980 is treated as a quota rollover
                    if quota_remaining - GlobalVars.apiquota >= 5000 and GlobalVars.apiquota >= 0 \
                            and quota_remaining > 39980:
                        tell_rooms_with(
                            "debug",
                            "API quota rolled over with {0} requests remaining. "
                            "Current quota: {1}.".format(
                                GlobalVars.apiquota, quota_remaining))

                        sorted_calls_per_site = sorted(
                            GlobalVars.api_calls_per_site.items(),
                            key=itemgetter(1),
                            reverse=True)
                        # One "<site>: <calls>" line per site, most-used first
                        api_quota_used_per_site = "\n".join(
                            "{0}: {1}".format(
                                site_name.replace('.com', '').replace('.stackexchange', ''),
                                str(quota_used))
                            for site_name, quota_used in sorted_calls_per_site).strip()

                        tell_rooms_with("debug", api_quota_used_per_site)
                        clear_api_data()
                    if quota_remaining == 0:
                        tell_rooms_with(
                            "debug",
                            "API reports no quota left!  May be a glitch.")
                        tell_rooms_with(
                            "debug", str(response))  # No code format for now?
                    # -1 is the value GlobalVars.apiquota holds before the first response is seen
                    if GlobalVars.apiquota == -1:
                        tell_rooms_with(
                            "debug", "Restart: API quota is {quota}.".format(
                                quota=quota_remaining))
                    GlobalVars.apiquota = quota_remaining
                else:
                    message_hq = "The quota_remaining property was not in the API response."

            if "error_message" in response:
                message_hq += " Error: {} at {} UTC.".format(
                    response["error_message"], time_request_made)
                if "error_id" in response and response["error_id"] == 502:
                    # Add a backoff of 10 + 2 seconds as a default
                    if GlobalVars.api_backoff_time < time.time() + 12:
                        GlobalVars.api_backoff_time = time.time() + 12
                # NOTE(review): this sentence is appended for every error, although the 12 second
                # backoff above is only applied for error_id 502 - confirm whether that is intended.
                message_hq += " Backing off on requests for the next 12 seconds."
                message_hq += " Previous URL: `{}`".format(url)

            if "backoff" in response:
                if GlobalVars.api_backoff_time < time.time() + response["backoff"]:
                    GlobalVars.api_backoff_time = time.time() + response["backoff"]

        if len(message_hq) > 0 and "site is required" not in message_hq:
            message_hq = message_hq.strip()
            if len(message_hq) > 500:
                message_hq = "\n" + message_hq
            tell_rooms_with("debug", message_hq)

        if "items" not in response:
            return

        if site == "stackoverflow.com":
            items = response["items"]
            if len(items) > 0 and "last_activity_date" in items[0]:
                with self.last_activity_date_lock:
                    self.last_activity_date = items[0]["last_activity_date"]

        num_scanned = 0
        start_time = time.time()

        for post in response["items"]:
            if GlobalVars.flovis is not None:
                # Copy of the question without body text and answers, used only for flovis staging
                pnb = copy.deepcopy(post)
                if 'body' in pnb:
                    pnb['body'] = 'Present, but truncated'
                if 'answers' in pnb:
                    del pnb['answers']

            if "title" not in post or "body" not in post:
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage(
                        'bodyfetcher/api_response/no_content', site,
                        post['question_id'], pnb)
                continue

            post['site'] = site
            try:
                post['edited'] = (post['creation_date'] !=
                                  post['last_edit_date'])
            except KeyError:
                post['edited'] = False  # last_edit_date not present = not edited

            question_doesnt_need_scan = is_post_recently_scanned_and_unchanged(
                post)
            add_recently_scanned_post(post)

            # Build the Post object even when the question itself will not be scanned again:
            # the answers below need *this* question as their parent.
            try:
                post_ = Post(api_response=post)
            except PostParseError as err:
                # Log the raw API item; no Post object exists when parsing failed.
                log(
                    'error', 'Error {0} when parsing post: {1!r}'.format(
                        err, post))
                if GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage(
                        'bodyfetcher/api_response/error', site,
                        post['question_id'], pnb)
                continue

            if not question_doesnt_need_scan:
                num_scanned += 1

                is_spam, reason, why = check_if_spam(post_)

                if is_spam:
                    try:
                        if GlobalVars.flovis is not None and 'question_id' in post:
                            GlobalVars.flovis.stage(
                                'bodyfetcher/api_response/spam', site,
                                post['question_id'], {
                                    'post': pnb,
                                    'check_if_spam': [is_spam, reason, why]
                                })
                        handle_spam(post=post_, reasons=reason, why=why)
                    except Exception as e:
                        log('error', "Exception in handle_spam:", e)
                elif GlobalVars.flovis is not None and 'question_id' in post:
                    GlobalVars.flovis.stage(
                        'bodyfetcher/api_response/not_spam', site,
                        post['question_id'], {
                            'post': pnb,
                            'check_if_spam': [is_spam, reason, why]
                        })

            try:
                for answer in post.get("answers", []):
                    if GlobalVars.flovis is not None:
                        # Copy of the answer without body text, used only for flovis staging
                        anb = copy.deepcopy(answer)
                        if 'body' in anb:
                            anb['body'] = 'Present, but truncated'

                    answer["IsAnswer"] = True  # Necessary for Post object
                    answer["title"] = ""  # Necessary for proper Post object creation
                    answer["site"] = site  # Necessary for proper Post object creation
                    try:
                        answer['edited'] = (answer['creation_date'] !=
                                            answer['last_edit_date'])
                    except KeyError:
                        answer['edited'] = False  # last_edit_date not present = not edited
                    answer_doesnt_need_scan = is_post_recently_scanned_and_unchanged(
                        answer)
                    add_recently_scanned_post(answer)
                    if answer_doesnt_need_scan:
                        continue

                    # Count only answers that are actually scanned, as is done for questions
                    num_scanned += 1
                    answer_ = Post(api_response=answer, parent=post_)

                    is_spam, reason, why = check_if_spam(answer_)
                    if is_spam:
                        try:
                            if GlobalVars.flovis is not None and 'answer_id' in answer:
                                GlobalVars.flovis.stage(
                                    'bodyfetcher/api_response/spam', site,
                                    answer['answer_id'], {
                                        'post': anb,
                                        'check_if_spam':
                                        [is_spam, reason, why]
                                    })
                            handle_spam(answer_, reasons=reason, why=why)
                        except Exception as e:
                            log('error', "Exception in handle_spam:", e)
                    elif GlobalVars.flovis is not None and 'answer_id' in answer:
                        GlobalVars.flovis.stage(
                            'bodyfetcher/api_response/not_spam', site,
                            answer['answer_id'], {
                                'post': anb,
                                'check_if_spam': [is_spam, reason, why]
                            })

            except Exception as e:
                # Best effort: a failure in one question's answers must not stop the remaining items
                log('error', "Exception handling answers:", e)

        end_time = time.time()
        scan_time = end_time - start_time
        GlobalVars.PostScanStat.add_stat(num_scanned, scan_time)
        return
Beispiel #25
0
def report(msg, urls):
    """
    Report a post (or posts)
    :param msg: The chat message that issued the command; its owner, room, id and client host are read
    :param urls: Whitespace-separated post URLs (at most 5 distinct ones)
    :return: A string with one line per URL that was *not* reported, or None if every URL was reported
    :raises CmdException: if the caller is rate-limited or more than 5 distinct URLs were given
    """
    crn, wait = can_report_now(msg.owner.id, msg._client.host)
    if not crn:
        raise CmdException("You can execute the !!/report command again in {} seconds. "
                           "To avoid one user sending lots of reports in a few commands and "
                           "slowing SmokeDetector down due to rate-limiting, you have to "
                           "wait 30 seconds after you've reported multiple posts in "
                           "one go.".format(wait))

    output = []
    # Drop duplicate URLs but keep the order they were typed in, so that the "Post N" numbers in the
    # reply and in the batch reasons refer to the N-th distinct URL of the command.
    urls = list(dict.fromkeys(urls.split()))

    if len(urls) > 5:
        raise CmdException("To avoid SmokeDetector reporting posts too slowly, you can "
                           "report at most 5 posts at a time. This is to avoid "
                           "SmokeDetector's chat messages getting rate-limited too much, "
                           "which would slow down reports.")

    for index, url in enumerate(urls, start=1):
        post_data = api_get_post(url)

        # api_get_post signals "not a post URL" with None and "no API data" with False
        if post_data is None:
            output.append("Post {}: That does not look like a valid post URL.".format(index))
            continue

        if post_data is False:
            output.append("Post {}: Could not find data for this post in the API. "
                          "It may already have been deleted.".format(index))
            continue

        if has_already_been_posted(post_data.site, post_data.post_id, post_data.title) and not is_false_positive(
                (post_data.post_id, post_data.site)):
            # Don't re-report if the post wasn't marked as a false positive. If it was marked as a false positive,
            # this re-report might be attempting to correct that/fix a mistake/etc.

            if GlobalVars.metasmoke_key is not None:
                se_link = to_protocol_relative(post_data.post_url)
                ms_link = "https://m.erwaysoftware.com/posts/by-url?url={}".format(se_link)
                output.append("Post {}: Already recently reported [ [MS]({}) ]".format(index, ms_link))
            else:
                output.append("Post {}: Already recently reported".format(index))
            continue

        post_data.is_answer = (post_data.post_type == "answer")
        post = Post(api_response=post_data.as_dict)
        user = get_user_from_url(post_data.owner_url)

        if user is not None:
            message_url = "https://chat.{}/transcript/{}?m={}".format(msg._client.host, msg.room.id, msg.id)
            add_blacklisted_user(user, message_url, post_data.post_url)

        why_info = u"Post manually reported by user *{}* in room *{}*.\n".format(msg.owner.name, msg.room.name)
        batch = ""
        if len(urls) > 1:
            batch = " (batch report: post {} out of {})".format(index, len(urls))

        handle_spam(post=post,
                    reasons=["Manually reported " + post_data.post_type + batch],
                    why=why_info)

    # Every rejected URL added exactly one line to output, so len(urls) > len(output) means at least
    # one post was really reported. Only multi-URL commands start the reporting cooldown.
    if len(urls) > 1 and len(urls) > len(output):
        add_or_update_multiple_reporter(msg.owner.id, msg._client.host, time.time())

    if output:
        return os.linesep.join(output)
    return None