コード例 #1
0
class GetSubsites:
    """Download the full subsite lists and upsert every entry into ``subsites``."""

    @dataclass
    class Stats:
        """Progress counters; created in ``__init__`` (not updated by this class)."""
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Build the API client, the DB wrapper and the counters from the config."""
        settings = ConfigLoader.load()
        self.api = OchobaApiWrapper(settings["api"])
        self.db = DataBaseWrapper(settings["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """Store the "sections" list, then the "companies" list, and commit once."""
        started_at = datetime.now().strftime("%H:%M:%S")
        print("Started at " + started_at)
        for kind in ("sections", "companies"):
            self.__get_subsites_list(kind)
        self.db.commit()

    def __get_subsites_list(self, subsite_type):
        """Fetch ``subsites_list/<subsite_type>`` and upsert each returned entry.

        Every entry is printed and stored as ``(entry["id"], <entry as JSON>)``.
        """
        payload = self.api.execute("subsites_list/" + subsite_type).json()
        for entry in payload["result"]:
            print(entry)
            row = (entry["id"], json.dumps(entry))
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, row)
class GetSubsites:
    """Back-fill ``subsites`` for user ids that were recorded with a 404 error."""

    @dataclass
    class Stats:
        """Progress counters for the current run."""
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Build the API client, the DB wrapper and the counters from the config."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """Fetch every 404-flagged user id that has no row in ``subsites`` yet."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))

        errors = self.db.execute_select(
            """
                select user_id
                from user_errors
                where status_code = 404
            """, None)

        # Tolerate a wrapper that yields None instead of an empty result set.
        for error in errors or ():
            user_id = error[0]
            known = self.db.execute_select_one(
                """
                    select id
                    from subsites
                    where id = %s
                """, (user_id, ))
            if known is None:
                self.__get_subsite(user_id)

    def __get_subsite(self, subsite_id):
        """Request ``subsite/<subsite_id>`` and upsert the result on HTTP 200.

        A 429 answer is retried after a 60 second pause. The retry is a loop
        rather than recursion so a long throttling period cannot exhaust the
        call stack. Counters are updated for every processed (non-429)
        request so the 429 log line reports a real number.
        """
        while True:
            response = self.api.execute("subsite/" + str(subsite_id))
            if response.status_code != 429:
                break
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        # Parse the body once; it is needed for both the log and the insert.
        payload = response.json()
        print(
            str(subsite_id) + ": " + str(response.status_code) + ": " +
            str(payload))
        if response.status_code == 200:
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (subsite_id, json.dumps(payload["result"])))
            self.db.commit()
        else:
            self.stats.error_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
コード例 #3
0
class GetSubsiteTimeline:
    """Page through one subsite's "new" timeline and upsert its posts."""

    @dataclass
    class Stats:
        """Progress counters for the current run."""
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    # Keys of a parsed post, in the order of the insert statement's placeholders.
    __COLUMNS = ('entry_id', 'commentsCount', 'favoritesCount', 'hitsCount',
                 'likesCount', 'date_created', 'subsite_id', 'is_show_thanks',
                 'is_filled_by_editors', 'isEditorial')

    def __init__(self):
        """Build the API client, DB wrapper and counters; set the paging state."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()
        self.offset = 1
        self.count = 50
        self.subsite_id = 203796

    @staticmethod
    def __time():
        """Return the current wall-clock time as ``HH:MM:SS``."""
        return datetime.now().strftime("%H:%M:%S")

    def get_posts(self):
        """Fetch pages of ``self.count`` posts until a page comes back empty.

        Commits every 10 processed pages and once more at the end.
        """
        print(f"Started at {self.__time()}")
        timeline = self.__get_timeline(self.subsite_id, 'new', self.count,
                                       self.offset)
        while timeline:
            print(f'{len(timeline)}/{self.offset}')
            parsed_timeline = self.__parse_timeline(timeline)
            self.__db_insert(parsed_timeline)

            if self.stats.request_count % 10 == 0:
                self.db.commit()
                print(
                    f'{self.__time()}: {self.stats.request_count} requests processed ({self.stats.post_count} posts, {self.stats.error_count} errors)'
                )
            self.offset += self.count
            timeline = self.__get_timeline(self.subsite_id, 'new', self.count,
                                           self.offset)

        self.db.commit()

    def __db_insert(self, parsed_timeline: list):
        """Upsert one page of parsed posts and count the page as one request."""
        for post in parsed_timeline:
            # Select values by key so the placeholder order never depends on
            # the dict's insertion order.
            values = [post[key] for key in self.__COLUMNS]
            self.db.execute_insert(
                """
                    insert into posts (id, commentscount, favoritescount, hitscount, likescount, date_created, subsite_id, is_show_thanks, is_filled_by_editors, iseditorial)
                        values (%s, %s, %s, %s, %s, to_timestamp(%s), %s, %s, %s, %s)
                    on conflict (id)
                        do update set date_created = excluded.date_created;
                """, values)
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1

    def __parse_timeline(self, timeline: list) -> list:
        """Convert raw timeline entries into row dicts, skipping reposts.

        A missing or null ``likes`` object yields ``likesCount == 0`` instead
        of raising ``AttributeError``.
        """
        parsed = []
        for post in timeline:
            if post.get('isRepost'):
                continue
            likes = post.get('likes') or {}
            parsed.append(
                dict(entry_id=post.get('id'),
                     commentsCount=post.get('commentsCount'),
                     favoritesCount=post.get('favoritesCount'),
                     hitsCount=post.get('hitsCount'),
                     likesCount=likes.get('count', 0),
                     date_created=post.get('date'),
                     subsite_id=self.subsite_id,
                     is_show_thanks=post.get('is_show_thanks'),
                     is_filled_by_editors=post.get('is_filled_by_editors'),
                     isEditorial=post.get('isEditorial')))
        return parsed

    def __get_timeline(self,
                       subsite: int,
                       sorting: str = 'new',
                       count: int = 50,
                       offset: int = 0) -> list:
        """Return the ``result`` of one timeline page (``None`` if absent).

        A 429 answer is retried after a 60 second pause, in a loop rather
        than by recursion so long throttling cannot exhaust the call stack.
        """
        while True:
            response = self.api.execute(
                f"subsite/{subsite}/timeline/{sorting}?count={count}&offset={offset}"
            )
            if response.status_code != 429:
                break
            print(
                f'{self.__time()}: 429 Too Many Requests. Requests processed since last 429 error: {self.stats.requests_since_last_429}'
            )
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        response_json = response.json()
        print(f"__get_timeline:{response.status_code}: {self.__time()}")

        return response_json.get('result')
コード例 #4
0
class GetPosts:
    """Crawl entries by sequential id and store each answer or error in the DB."""

    @dataclass
    class Stats:
        """Progress counters for the current run."""
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Build the API client, the DB wrapper and the counters from the config."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_posts(self, start_id=1, stop_id=165000):
        """Fetch every entry id in ``range(start_id, stop_id)``.

        Args:
            start_id: first entry id to request (inclusive).
            stop_id: id at which to stop (exclusive, as in ``range``).

        The defaults reproduce the previously hard-coded range. Progress is
        committed and printed whenever the request counter is a multiple of
        100, and a final commit always runs, even when an exception escapes.
        """
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        try:
            for post_id in range(start_id, stop_id):
                if self.stats.request_count % 100 == 0:
                    self.db.commit()
                    print(
                        "{0}: {1} requests processed ({2} posts, {3} errors)".
                        format(datetime.now().strftime("%H:%M:%S"),
                               self.stats.request_count, self.stats.post_count,
                               self.stats.error_count))

                self.__get_post(post_id)

        except Exception:
            print("Exception!")
            raise

        finally:
            self.db.commit()

    def __get_post(self, post_id):
        """Request ``entry/<post_id>`` and store the post or the error answer.

        A 429 answer is retried after a 60 second pause, in a loop rather
        than by recursion so long throttling cannot exhaust the call stack.
        """
        while True:
            response = self.api.execute("entry/" + str(post_id))
            if response.status_code != 429:
                break
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        response_json = response.json()
        print(str(response.status_code) + ": " + str(response_json))

        if "error" in response_json:
            self.db.execute_insert(
                """
                    insert into post_errors (post_id, status_code, response)
                        values (%s, %s, %s);
                """,
                (post_id, response.status_code, json.dumps(response_json)))
            self.stats.error_count += 1

        else:
            self.db.execute_insert(
                """
                    insert into posts (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (post_id, json.dumps(response_json)))
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
コード例 #5
0
import json
import time
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper

# Entry whose state is recorded, and the pause between two polls (minutes).
post_id = 220958
request_interval_minutes = 30

config = ConfigLoader.load()
api = OchobaApiWrapper(config["api"])
db = DataBaseWrapper(config["db"])

print(f'{datetime.now().strftime("%H:%M:%S")}: Started')

# Poll forever: each successful answer becomes one timestamped history row.
while True:
    response = api.execute(f"entry/{post_id}")
    print(f'{datetime.now().strftime("%H:%M:%S")}: Got {response.status_code}')
    if response.status_code == 200:
        row = (post_id, datetime.now(), json.dumps(response.json()))
        db.execute_insert(
            """
                insert into post_history (post_id, request_time, json)
                    values (%s, %s, %s);
            """, row)
        db.commit()

    time.sleep(request_interval_minutes * 60)
コード例 #6
0
class GetPosts:
    """Watch subsites/users for new posts, snapshot their stats and score them.

    NOTE(review): the SQL here uses printf-style placeholders (``%d``, quoted
    ``'%s'``, even a column name passed through ``%s``) and single parameters
    are passed as bare values rather than tuples, so the DB wrapper presumably
    renders statements with Python ``%`` formatting — confirm against
    ``GetDataBaseWrapper``. Statements and parameter shapes are kept
    compatible with that assumption.
    """

    class Stats:
        """Counters used by ``__get_post``."""

        def __init__(self):
            self.request_count = 0
            self.post_count = 0
            self.error_count = 0
            self.requests_since_last_429 = 0

    # (age in seconds that must be exceeded, next ``statsCalculated`` value,
    # snapshot column written in the per-metric tables); oldest stage first.
    __STAGES = (
        (43000, 5, "now"),
        (12000, 4, "now"),
        (10800, 3, "count3hr"),
        (7200, 2, "count2hr"),
        (3600, 1, "count1hr"),
    )

    def __init__(self):
        """Load the config, build the API/DB wrappers and prepare the schema."""
        config = ConfigLoader.load()
        self.categories = config["parse_targets"]["categories"]
        self.users = config["parse_targets"]["users"]
        self.api = OchobaApiWrapper(config["api"])
        self.db = GetDataBaseWrapper(config["db"])
        self.middle_rating = {}
        # ``__get_post`` updates these counters; without this attribute it
        # would fail with AttributeError.
        self.stats = self.Stats()

        self.__init_database()

    def scan(self):
        """Run one pass: refresh categories, collect new posts, update stats."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))

        print("update categories")
        self.__update_categories()

        print("check subsites")
        for category in self.categories:
            self.__scan_category(category)

        posts = self.db.execute_select(
            "select * from posts where statsCalculated < 5 and published=0 order by publication_time asc",
            [])
        if posts is None:
            return

        print("update posts")
        for post in posts:
            self.__refresh_post_stats(post)

    def __scan_category(self, category):
        """Store the new non-editorial posts of one watched category."""
        time.sleep(self.api.min_delay)

        if category["is_user"] > 0:
            timeline = self.api.execute("user/%s/entries" %
                                        category["userid"])
        else:
            timeline = self.api.execute("subsite/%s/timeline/new" %
                                        category["userid"])

        if self.__is_error(timeline):
            return

        entries = timeline.json()["result"]
        if not entries:
            return

        for post in entries:
            # Stop at the first entry older than the last one already seen.
            if post["date"] < category["last_post_time"]:
                break
            if post["isEditorial"] > 0:
                continue

            self.add_post_to_database(post)

        first_post_time = entries[0]["date"]
        if first_post_time > category["last_post_time"]:
            category["last_post_time"] = first_post_time
            self.db.execute_update(
                "update categories set last_post_time=%d where id='%s'",
                (first_post_time, category["id"]))
        self.db.commit()

    def __refresh_post_stats(self, post):
        """Snapshot the counters of one tracked post and advance its state.

        ``post`` is a row of ``posts``: index 0 is ``post_id``, 3 is
        ``publication_time`` and 6 is ``statsCalculated``.
        """
        post_id = post[0]
        state = post[6]
        age = time.time() - post[3]

        # The next snapshot for this state is not due yet.
        if state == 0 and age < 3600:
            return
        if state == 1 and age < 7200:
            return
        if state == 2 and age < 10800:
            return

        stage = next(((next_state, column)
                      for min_age, next_state, column in self.__STAGES
                      if age > min_age), None)
        if stage is None:
            # No stage applies yet (age <= 3600 s). Previously this reused the
            # values of the preceding post or raised NameError.
            return
        next_state, column = stage

        time.sleep(self.api.min_delay)
        response = self.api.execute("entry/%d" % post_id)
        if self.__is_error(response):
            # Only forget posts that are really gone; a throttled or failed
            # request must not drop a tracked post.
            if response.status_code == 404:
                self.db.execute("delete from posts where post_id=%d" % post_id)
            return

        post_stats = response.json()["result"]
        metrics = (
            ("posts_views", post_stats["hitsCount"]),
            ("posts_bookmarks", post_stats["favoritesCount"]),
            ("posts_comments", post_stats["commentsCount"]),
            ("posts_rating", post_stats["likes"]["summ"]),
        )

        if state == 0:
            # First snapshot: make sure every per-metric table has a row.
            for table, _ in metrics:
                if self.db.execute_select_one(
                        "select * from " + table + " where post_id=%d",
                    (post_id)) is None:
                    self.db.execute_insert(
                        "insert into " + table +
                        " (post_id, count1hr, count2hr, count3hr) values (%d, 0, 0, 0)",
                        (post_id))

        for table, value in metrics:
            self.db.execute_update(
                "update " + table + " set %s=%d, now=%d where post_id=%d",
                (column, value, value, post_id))
        self.db.execute_update(
            "update posts set statsCalculated=%d where post_id='%d'",
            (next_state, post_id))

        self.publish(post, post_stats)
        if next_state >= 4:
            self.db.execute_update(
                "update posts set published=1 where post_id='%d'",
                (post_id))

        self.db.commit()

    def __init_database(self):
        """Create the tables if needed and seed the configured categories/users."""
        self.db.execute("""
                    create table if not exists categories (
                        id integer primary key autoincrement not null,
                        userid integer not null,
                        name varchar,
                        is_user integer not null,
                        subscribers integer,
                        last_post_time integer not null,
                        watch integer not null
                    );
            """)
        self.db.execute("""
                    create table if not exists posts (
                        post_id integer primary key not null,
                        cat_id integer not null,
                        name varchar not null,
                        publication_time integer not null,
                        words integer not null,
                        media integer not null,
                        statsCalculated integer not null,
                        published integer not null,
                        score integer
                    );
            """)
        # The four per-metric tables share one layout.
        for table in ("posts_views", "posts_rating", "posts_comments",
                      "posts_bookmarks"):
            self.db.execute("""
                    create table if not exists %s (
                        post_id integer not null,
                        count1hr integer,
                        count2hr integer,
                        count3hr integer,
                        now integer
                    );
            """ % table)

        # New watch entries start two days in the past.
        last_post_time = time.time() - 2 * 60 * 60 * 24

        for is_user, ids in ((0, self.categories), (1, self.users)):
            for userid in ids:
                if self.db.execute_select_one(
                        "select userid from categories where userid='%s'",
                    (userid)) is None:
                    self.db.execute_insert(
                        """
                            insert into categories (userid, is_user, last_post_time, watch)
                                values (%d, %d, %d, 1);
                        """, (userid, is_user, last_post_time))

        self.db.commit()

    def __update_categories(self):
        """Refresh name/subscribers of all categories and reload the watch list.

        Afterwards ``self.categories`` is a list of dicts built from the rows
        with ``watch=1``.
        """
        rows = self.db.execute_select(
            "select id, userid, is_user from categories;", [])
        for row in rows or []:
            time.sleep(self.api.min_delay)

            info = self.__get_category(row[1], row[2])
            if info is None:
                continue

            self.db.execute_update(
                "update categories set subscribers='%s', name='%s' where id='%s'",
                (info["subscribers_count"], self.__strip_quotes(info["name"]),
                 row[0]))

        watched = self.db.execute_select(
            "select * from categories where watch=1;", [])
        self.categories = [{
            "id": i[0],
            "userid": i[1],
            "name": i[2],
            "is_user": i[3],
            "subscribers": i[4],
            "last_post_time": i[5]
        } for i in watched or []]
        self.db.commit()

    @staticmethod
    def __strip_quotes(value):
        """Drop quote characters, as is done for post titles, before a value
        is placed inside a quoted SQL literal."""
        return re.sub(r"[\"\']", "", str(value))

    def __get_category(self, userid, isuser):
        """Return the API ``result`` for a user or subsite, or None on error."""
        if isuser > 0:
            response = self.api.execute('user/' + str(userid))
        else:
            response = self.api.execute('subsite/' + str(userid))

        if self.__is_error(response):
            return None

        return response.json()["result"]

    def __is_error(self, response):
        """Report whether ``response`` is an HTTP error, logging it if so.

        429 and 404 keep their dedicated messages; any other status >= 400
        is also treated as an error so callers do not try to read ``result``
        from a failed answer.
        """
        status = response.status_code
        if status == 429:
            label = "429 Too Many Requests "
        elif status == 404:
            label = "404 Not Found "
        elif status >= 400:
            label = "%d Error " % status
        else:
            return False

        traceback.print_stack()
        print(datetime.now().strftime("%H:%M:%S") + ": " + label)
        return True

    def add_post_to_database(self, post):
        """Insert ``post`` into ``posts`` unless it is already stored.

        Returns the dict of stored values, or None when the post already
        exists. The word count covers all text-bearing blocks plus the
        title; ``post`` itself is left unmodified.
        """
        if self.db.execute_select_one("select * from posts where post_id=%d",
                                      (post["id"])) is not None:
            return None

        data = {}
        data["post_id"] = post["id"]
        data["cat_id"] = post["subsite"]["id"]
        data["name"] = re.sub(r"[\"\']", "", post["title"])
        data["media"] = 0
        data["words"] = 0
        data["publication_time"] = post["date"]

        # Count the title as a text block without appending to the caller's list.
        title_block = {"type": "text", "data": {"text": data["name"]}}
        for block in list(post["blocks"]) + [title_block]:
            text, media = self.__block_text_and_media(block)
            data["media"] += media
            data["words"] += self.__count_words(text)

        self.db.execute_insert(
            """
                insert into posts (post_id, cat_id, name, publication_time, words, media, statsCalculated, published)
                values (%d, %d, "%s", %d, %d, %d, 0, 0);
            """, (data["post_id"], data["cat_id"], str(data["name"]),
                  data["publication_time"], data["words"], data["media"]))

        return data

    @staticmethod
    def __block_text_and_media(block):
        """Return ``(text, media_count)`` contributed by one content block."""
        blocktype = block["type"]

        if blocktype in ("text", "header", "quote", "incut"):
            return block['data']['text'], 0
        if blocktype == "media":
            items = block['data']['items']
            # A missing or null title contributes no words.
            titles = [(item.get('title') or "") for item in items]
            return " ".join(titles), len(items)
        if blocktype == "list":
            return " ".join(block['data']['items']), 0
        if blocktype == "tweet":
            return block['data']['tweet']['data']['tweet_data'][
                'full_text'], 1
        if blocktype in ("video", "audio", "link"):
            return "", 1
        return "", 0

    @staticmethod
    def __count_words(text):
        """Count whitespace-separated words after removing link targets,
        em dashes and hashtags."""
        text = re.sub(r']\([^\(\)]+\)', '', text)
        text = re.sub(r'—', '', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'\#\w+', '', text)
        return len(text.split())

    def publish(self, post, post_stats):
        """Compute and store the score of ``post``; announce it when > 50.

        ``post`` is a ``posts`` row (index 0 is the id); ``post_stats`` is the
        API ``result`` for the entry and is read only when the category's
        subscriber count is not stored yet.
        """
        post_data = self.db.execute_select_one(
            """
                select posts.post_id, categories.name, posts.name, posts.publication_time, posts.words, posts.media,
                    posts_views.now, posts_bookmarks.now, posts_rating.now, posts_comments.now, categories.subscribers
                from posts
                left join categories on posts.cat_id=categories.userid
                left join posts_views on posts.post_id=posts_views.post_id
                left join posts_bookmarks on posts.post_id=posts_bookmarks.post_id
                left join posts_rating on posts.post_id=posts_rating.post_id
                left join posts_comments on posts.post_id=posts_comments.post_id
                where posts.post_id=%d
            """, (post[0]))

        if post_data is None:
            return

        (post_id, category, title, publication_time, words, media, views,
         bookmarks, rating, comments, category_subs) = post_data[:11]

        if views is None or bookmarks is None or rating is None or comments is None:
            return

        if category_subs is None:
            is_user = post_stats["subsite"]["type"] == 1
            info = self.__get_category(post_stats["subsite"]["id"], is_user)
            time.sleep(self.api.min_delay)
            if info is None:
                # Category lookup failed; scoring is retried on a later pass.
                return
            category = info['name']
            category_subs = info["subscribers_count"]
            self.db.execute_insert(
                """
                    insert into categories (userid, name, is_user, subscribers, last_post_time, watch)
                        values (%s, '%s', %d, %d, 0, 0);
                """, (info["id"], self.__strip_quotes(info['name']), is_user,
                      info["subscribers_count"]))

        # Cap large audiences; keep at least 1 so the expectations below are
        # never zero (a category without subscribers used to divide by zero).
        category_subs = max(1, min(category_subs, 50000))

        expected_views = pow(category_subs * 2.3, 0.71)
        expected_likes = pow(category_subs * 3.5, 0.38)
        expected_bookmarks = pow(category_subs * 3.5, 0.38)
        expected_comments = pow(category_subs * 3.5, 0.38)

        score = words / 500.0 + media / 10.0 + views / expected_views + bookmarks / expected_bookmarks + rating / expected_likes + comments / expected_comments
        score = int(score * 10)
        self.db.execute_update("update posts set score=%d where post_id='%d'",
                               (score, post[0]))

        if score > 50:
            # Epoch + 3 h rendered with gmtime gives Moscow time on any host;
            # localtime would add the host's own UTC offset on top.
            print(
                "Recommended: [https://dtf.ru/%d][%s][Score: %s] \"%s\" From %s / %d Words / %d Mediafiles / %d Views / %d Bookmarks / %d Rating / %d Comments"
                % (post_id,
                   time.strftime('%H:%M %m-%d-%YMSK',
                                 time.gmtime(publication_time + 10800)),
                   str(score).zfill(4), title, category, words, media, views,
                   bookmarks, rating, comments))
            self.db.execute_update(
                "update posts set published=1 where post_id='%d'", (post[0]))

    def __get_post(self, post_id):
        """Request ``entry/<post_id>`` and store the raw answer or the error.

        NOTE(review): nothing in this class calls this method, and the
        ``post_errors`` table and the ``posts (id, json)`` columns it writes
        are not created by ``__init_database`` — confirm whether it is still
        needed.
        """
        response = self.api.execute("entry/" + str(post_id))

        response_json = response.json()
        print(str(response.status_code) + ": " + str(response_json))

        if "error" in response_json:
            self.db.execute_insert(
                """
                    insert into post_errors (post_id, status_code, response)
                        values (%s, %s, %s);
                """,
                (post_id, response.status_code, json.dumps(response_json)))
            self.stats.error_count += 1

        else:
            self.db.execute_insert(
                """
                    insert into posts (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (post_id, json.dumps(response_json)))
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1

    def print_post_info(self, id):
        """Re-import the post ``id`` from the API and print the stored values."""
        self.db.execute("delete from posts where post_id=%d" % id)

        post = self.api.execute("entry/%d" % id)
        if self.__is_error(post):
            print("error")
            # There is no usable payload to parse after an error answer.
            return

        print(self.add_post_to_database(post.json()['result']))