import matplotlib.pyplot as plot

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper


class SqlPlot:
    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def show(self, sql_queries, title, x_label, y_label):
        show_legend = False
        for sql_query in sql_queries:
            data = self.db.execute_select(sql_query.get('query'), None)
            x = []
            y = []
            for row in data:
                x.append(row[0])
                y.append(row[1])

            label = sql_query.get('label')
            plot.plot(x, y, label=label)
            show_legend |= label is not None

        plot.title(title)
        plot.xlabel(x_label)
        plot.xticks(rotation=45)
        plot.ylabel(y_label)
        plot.grid(True)

        if show_legend:
            plot.legend()

        plot.show()
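
# Usage sketch for SqlPlot: show() expects a list of dicts, each holding a 'query'
# that returns (x, y) rows and an optional 'label' for the legend. The daily_stats
# table and its columns below are hypothetical.
SqlPlot().show(
    [
        {"query": "select day, posts_count from daily_stats order by day", "label": "posts"},
        {"query": "select day, comments_count from daily_stats order by day", "label": "comments"},
    ],
    title="Posts and comments per day",
    x_label="day",
    y_label="count",
)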

import json

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper


class ParseUserData:
    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def parse(self):
        page_size = 500
        page = 0
        while True:
            print("Fetch page #{0} ({1})".format(page, page_size * page))
            result = self.db.execute_select(
                """
                    select json from users
                        order by id
                        limit %s offset %s
                """, (page_size, page_size * page))
            if len(result) == 0:
                break

            for row in result:
                user_data = json.loads(row[0])["result"]
                self.db.execute_update(
                    """
                        update users
                            set
                                created = to_timestamp(%s),
                                name = %s,
                                type = %s,
                                karma = %s,
                                is_plus = %s,
                                is_verified = %s,
                                is_available_for_messenger = %s,
                                entries_count = %s,
                                comments_count = %s,
                                favorites_count = %s,
                                subscribers_count = %s
                            where id = %s
                    """, (user_data["created"], user_data["name"],
                          user_data["type"], user_data["karma"],
                          user_data["is_plus"], user_data["is_verified"],
                          user_data["isAvailableForMessenger"],
                          user_data["counters"]["entries"],
                          user_data["counters"]["comments"],
                          user_data["counters"]["favorites"],
                          user_data["subscribers_count"], user_data["id"]))

            page += 1
            self.db.commit()
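
# Sketch of the raw payload parse() expects in users.json; the key names mirror
# the fields read above, the values are illustrative only:
# {
#     "result": {
#         "id": 1, "created": 1600000000, "name": "example", "type": 1,
#         "karma": 0, "is_plus": false, "is_verified": false,
#         "isAvailableForMessenger": true, "subscribers_count": 0,
#         "counters": {"entries": 0, "comments": 0, "favorites": 0}
#     }
# }
ParseUserData().parse()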

import json
import time
from dataclasses import dataclass
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
# Import path assumed by analogy with the other src.common wrappers.
from src.common.ochoba_api_wrapper import OchobaApiWrapper


class GetSubsites:
    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        print("Started at " + datetime.now().strftime("%H:%M:%S"))

        errors = self.db.execute_select(
            """
                select user_id
                from user_errors
                where status_code = 404
            """, None)

        for error in errors:
            user_id = error[0]
            subsite_data = self.db.execute_select_one(
                """
                    select id
                    from subsites
                    where id = %s
                """, (user_id, ))
            if subsite_data is None:
                self.__get_subsite(user_id)

    def __get_subsite(self, subsite_id):
        response = self.api.execute("subsite/" + str(subsite_id))
        if response.status_code == 429:
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
            self.__get_subsite(subsite_id)
            return

        print(
            str(subsite_id) + ": " + str(response.status_code) + ": " +
            str(response.json()))
        if response.status_code == 200:
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (subsite_id, json.dumps(response.json()["result"])))
            self.db.commit()
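
# Usage sketch: re-fetches subsite profiles for ids that earlier failed with 404
# as users, sleeping 60 seconds and retrying whenever the API returns 429.
GetSubsites().get_subsites()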

import json
import re
import urllib.parse

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper


class ParsePostData:
    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])
        self.tag_regex = re.compile(config["api"]["tag_regex"])

    def __parse_tags(self, post_id, text):
        search_index = 0
        while True:
            match = self.tag_regex.search(text, search_index)
            if match is None:
                break

            parsed_tag = urllib.parse.unquote(match.group(0))
            if len(parsed_tag) >= 3 and not parsed_tag[1:].isdigit():
                self.db.execute_insert(
                    """
                        insert into post_tags (post_id, value, source)
                            values (%s, %s, %s)
                    """,
                    (post_id, parsed_tag.lower(), text)
                )
            search_index = match.end(0)

    def parse(self):
        offset_base = 0
        page_size = 500
        page = 0
        while True:
            offset = offset_base + page_size * page
            print(f"Fetch page #{page} ({offset})")
            result = self.db.execute_select(
                """
                    select id, json from posts
                        order by id
                        limit %s offset %s
                """,
                (page_size, offset)
            )
            if len(result) == 0:
                break

            for row in result:
                post_id = row[0]
                try:
                    post_data = json.loads(row[1])["result"]
                    if "blocks" in post_data:
                        blocks = post_data["blocks"]
                        for block in blocks:
                            block_type = block["type"]
                            block_data = block["data"]
                            text_length = 0
                            if "text" in block_data:
                                text_length = len(block_data["text"])
                                self.__parse_tags(post_id, block_data["text"])
                            if block_type == "list":
                                for item in block_data["items"]:
                                    text_length += len(item)
                                    self.__parse_tags(post_id, item)

                            self.db.execute_insert(
                                """
                                    insert into post_blocks (post_id, type, data, text_length)
                                        values (%s, %s, %s, %s)
                                """,
                                (post_id, block_type, json.dumps(block_data), text_length)
                            )

                    co_author_id = None
                    co_author_name = None
                    if "co_author" in post_data:
                        co_author_id = post_data["co_author"]["id"]
                        co_author_name = post_data["co_author"]["name"]

                    self.db.execute_update(
                        """
                            update posts
                                set
                                    created = to_timestamp(%s),
                                    type = %s,
                                    subsite_id = %s,
                                    subsite_name = %s,
                                    subsite_type = %s,
                                    author_id = %s,
                                    author_name = %s,
                                    co_author_id = %s,
                                    co_author_name = %s,
                                    title = %s,
                                    is_enabled_comments = %s,
                                    is_enabled_likes = %s,
                                    is_repost = %s,
                                    is_show_thanks = %s,
                                    is_filled_by_editors = %s,
                                    is_editorial = %s,
                                    hotness = %s,
                                    comments_count = %s,
                                    favorites_count = %s,
                                    hits_count = %s,
                                    likes_count = %s,
                                    likes_sum = %s
                                where id = %s
                        """,
                        (
                            post_data["date"],
                            post_data["type"],
                            post_data["subsite"]["id"],
                            post_data["subsite"]["name"],
                            post_data["subsite"]["type"],
                            post_data["author"]["id"],
                            post_data["author"]["name"],
                            co_author_id,
                            co_author_name,
                            post_data["title"],
                            post_data["isEnabledComments"],
                            post_data["isEnabledLikes"],
                            post_data["isRepost"],
                            post_data.get("is_show_thanks"),
                            post_data.get("is_filled_by_editors"),
                            post_data.get("isEditorial"),
                            post_data.get("hotness"),
                            post_data["commentsCount"],
                            post_data["favoritesCount"],
                            post_data["hitsCount"],
                            post_data["likes"]["count"],
                            post_data["likes"]["summ"],
                            post_id
                        )
                    )

                except Exception:
                    print(f"Exception for post #{post_id}")
                    raise

            page += 1
            self.db.commit()
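
# Sketch of the block entries parse() walks in post_data["blocks"]; only the keys
# read above are shown, the values are illustrative:
# {"type": "text", "data": {"text": "paragraph body"}}
# {"type": "list", "data": {"items": ["first item", "second item"]}}
ParsePostData().parse()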

import json

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])

offset_base = 0
page_size = 500
page = 0
while True:
    offset = offset_base + page_size * page
    result = db.execute_select(
        """
            select id, json from post_history
                where hits is null
                order by id
                limit %s offset %s
        """,
        (page_size, offset)
    )
    if len(result) == 0:
        break

    for row in result:
        record_id = row[0]
        print("parsing " + str(record_id))

        post_data = json.loads(row[1])["result"]
        db.execute_update(
            """
                update post_history

import json

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])

offset_base = 0
page_size = 500
page = 0
while True:
    offset = offset_base + page_size * page
    print(f"Fetch page #{page} ({offset})")
    result = db.execute_select(
        """
            select id, json from subsites
                order by id
                limit %s offset %s
        """, (page_size, offset))
    if len(result) == 0:
        break

    for row in result:
        subsite_id = row[0]
        subsite_data = json.loads(row[1])
        print(subsite_data)
        db.execute_update(
            """
                update subsites
                    set
                        created = to_timestamp(%s),
                        name = %s,