class GetSubsites:
    """Downloads the full subsite catalogue (sections and companies) from the
    API and upserts each entry as raw JSON into the ``subsites`` table."""

    @dataclass
    class Stats:
        # Progress counters shared in style with the other scraper classes.
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """Fetch both subsite listings and commit them in one transaction."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        for listing in ("sections", "companies"):
            self.__get_subsites_list(listing)
        self.db.commit()

    def __get_subsites_list(self, subsite_type):
        """Fetch one listing and upsert every returned subsite row."""
        entries = self.api.execute("subsites_list/" + subsite_type).json()["result"]
        for entry in entries:
            print(entry)
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (entry["id"], json.dumps(entry)))
 def __init__(self):
     # NOTE(review): orphaned fragment — the one-space indentation means it is
     # not attached to any visible class; presumably a stray duplicate of
     # GetSubsiteTimeline.__init__ below. Confirm before relying on it.
     config = ConfigLoader.load()
     self.api = OchobaApiWrapper(config["api"])
     self.db = DataBaseWrapper(config["db"])
     self.stats = self.Stats()
     self.offset = 1  # starting pagination offset
     self.count = 50  # posts requested per API call
     self.subsite_id = 203796  # hard-coded target subsite id
class ParseUserData:
    """Expands the raw user JSON stored in ``users`` into the table's typed
    columns, walking the table in fixed-size pages."""

    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def parse(self):
        """Page over ``users`` ordered by id, parse each row's JSON payload
        and write the structured fields back; commits once per page."""
        page_size = 500
        page = 0
        while True:
            print("Fetch page #{0} ({1})".format(page, page_size * page))
            rows = self.db.execute_select(
                """
                    select json from users
                        order by id
                        limit %s offset %s
                """, (page_size, page_size * page))
            if not rows:
                break

            for (raw_json,) in rows:
                # The stored payload is the whole API response; the user
                # object lives under "result".
                user_data = json.loads(raw_json)["result"]
                self.db.execute_update(
                    """
                        update users
                            set
                                created = to_timestamp(%s),
                                name = %s,
                                type = %s,
                                karma = %s,
                                is_plus = %s,
                                is_verified = %s,
                                is_available_for_messenger = %s,
                                entries_count = %s,
                                comments_count = %s,
                                favorites_count = %s,
                                subscribers_count = %s
                            where id = %s
                    """, (user_data["created"], user_data["name"],
                          user_data["type"], user_data["karma"],
                          user_data["is_plus"], user_data["is_verified"],
                          user_data["isAvailableForMessenger"],
                          user_data["counters"]["entries"],
                          user_data["counters"]["comments"],
                          user_data["counters"]["favorites"],
                          user_data["subscribers_count"], user_data["id"]))

            page += 1
            # Per-page commits keep transactions small.
            self.db.commit()
class SqlPlot:
    """Small helper that renders one or more two-column SQL result sets as
    line plots on a shared axis."""

    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])

    def show(self, sql_queries, title, x_label, y_label):
        """Run each query and plot its rows as (x, y) pairs, then display.

        ``sql_queries`` is a list of dicts with a ``query`` key and an
        optional ``label``; the legend is drawn only if at least one
        query supplied a label.
        """
        show_legend = False
        for entry in sql_queries:
            rows = self.db.execute_select(entry.get('query'), None)
            xs = [record[0] for record in rows]
            ys = [record[1] for record in rows]

            label = entry.get('label')
            plot.plot(xs, ys, label=label)
            if label is not None:
                show_legend = True

        plot.title(title)
        plot.xlabel(x_label)
        plot.xticks(rotation=45)
        plot.ylabel(y_label)
        plot.grid(True)

        if show_legend:
            plot.legend()

        plot.show()
Esempio n. 5
0
import matplotlib.pyplot as plt

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

# Histogram: number of UGC subsite posts per publication hour, H1 2020.
cfg = ConfigLoader.load()
database = DataBaseWrapper(cfg["db"])

x, y = database.fetch_data(
    """
        select extract(hour from created) as hour, count(*)
        from posts
        where type = 1 and created between '2020-01-01' and '2020-07-01' 
            and not is_editorial and subsite_type = 2
        group by hour
        order by hour
    """, None)

plt.bar(x, y, color='green')
plt.grid(True, axis='y')
plt.title("Количество постов (январь-июнь 2020, UGC, подсайты)")
plt.xlabel("Время (Москва, GMT+3)")
plt.ylabel("Количество постов")
plt.xticks(x)

plt.show()
Esempio n. 6
0
 def __init__(self):
     # NOTE(review): orphaned fragment (one-space indent, no enclosing class
     # visible); it matches ParsePostData.__init__ below — presumably a stray
     # duplicate. Confirm before relying on it.
     config = ConfigLoader.load()
     self.db = DataBaseWrapper(config["db"])
     self.tag_regex = re.compile(config["api"]["tag_regex"])
Esempio n. 7
0
class ParsePostData:
    """Expands raw post JSON stored in ``posts`` into structured columns,
    per-block rows in ``post_blocks`` and extracted tags in ``post_tags``.

    The raw payload for each post is the whole API response; the post
    object itself lives under its "result" key.
    """

    def __init__(self):
        config = ConfigLoader.load()
        self.db = DataBaseWrapper(config["db"])
        # Pattern for recognising tags inside block text; the exact regex
        # comes from configuration, so its shape is not visible here.
        self.tag_regex = re.compile(config["api"]["tag_regex"])

    def __parse_tags(self, post_id, text):
        """Scan ``text`` for tag matches and insert each into ``post_tags``.

        Matches shorter than 3 characters, or whose remainder after the
        first character is purely numeric, are skipped. The full source
        text is stored alongside each tag.
        """
        search_index = 0
        while True:
            # Resume scanning just past the previous match.
            match = self.tag_regex.search(text, search_index)
            if match is None:
                break

            # Tags may be percent-encoded (e.g. Cyrillic in URLs).
            parsed_tag = urllib.parse.unquote(match.group(0))
            if len(parsed_tag) >= 3 and not parsed_tag[1:].isdigit():
                self.db.execute_insert(
                    """
                        insert into post_tags (post_id, value, source)
                            values (%s, %s, %s)
                    """,
                    (post_id, parsed_tag.lower(), text)
                )
            search_index = match.end(0)

    def parse(self):
        """Page over ``posts`` ordered by id; for each post store its content
        blocks and tags, then write the parsed scalar fields back to the
        ``posts`` row. Commits once per page.
        """
        offset_base = 0
        page_size = 500
        page = 0
        while True:
            offset = offset_base + page_size * page
            print(f"Fetch page #{page} ({offset})")
            result = self.db.execute_select(
                """
                    select id, json from posts
                        order by id
                        limit %s offset %s
                """,
                (page_size, offset)
            )
            if len(result) == 0:
                break

            for row in result:
                post_id = row[0]
                try:
                    post_data = json.loads(row[1])["result"]
                    if "blocks" in post_data:
                        blocks = post_data["blocks"]
                        for block in blocks:
                            block_type = block["type"]
                            block_data = block["data"]
                            text_length = 0
                            if "text" in block_data:
                                text_length = len(block_data["text"])
                                self.__parse_tags(post_id, block_data["text"])
                            # List blocks carry their text in "items";
                            # their lengths add onto any "text" length.
                            if block_type == "list":
                                for item in block_data["items"]:
                                    text_length += len(item)
                                    self.__parse_tags(post_id, item)

                            self.db.execute_insert(
                                """
                                    insert into post_blocks (post_id, type, data, text_length)
                                        values (%s, %s, %s, %s)
                                """,
                                (post_id, block_type, json.dumps(block_data), text_length)
                            )

                    # Co-author is optional; keep NULLs when absent.
                    co_author_id = None
                    co_author_name = None
                    if "co_author" in post_data:
                        co_author_id = post_data["co_author"]["id"]
                        co_author_name = post_data["co_author"]["name"]

                    # Tuple order below must match the column order in the
                    # update statement exactly.
                    self.db.execute_update(
                        """
                            update posts
                                set
                                    created = to_timestamp(%s),
                                    type = %s,
                                    subsite_id = %s,
                                    subsite_name = %s,
                                    subsite_type = %s,
                                    author_id = %s,
                                    author_name = %s,
                                    co_author_id = %s,
                                    co_author_name = %s,
                                    title = %s,
                                    is_enabled_comments = %s,
                                    is_enabled_likes = %s,
                                    is_repost = %s,
                                    is_show_thanks = %s,
                                    is_filled_by_editors = %s,
                                    is_editorial = %s,
                                    hotness = %s,
                                    comments_count = %s,
                                    favorites_count = %s,
                                    hits_count = %s,
                                    likes_count = %s,
                                    likes_sum = %s
                                where id = %s
                        """,
                        (
                            post_data["date"],
                            post_data["type"],
                            post_data["subsite"]["id"],
                            post_data["subsite"]["name"],
                            post_data["subsite"]["type"],
                            post_data["author"]["id"],
                            post_data["author"]["name"],
                            co_author_id,
                            co_author_name,
                            post_data["title"],
                            post_data["isEnabledComments"],
                            post_data["isEnabledLikes"],
                            post_data["isRepost"],
                            # .get(): these keys are optional in the payload.
                            post_data.get("is_show_thanks"),
                            post_data.get("is_filled_by_editors"),
                            post_data.get("isEditorial"),
                            post_data.get("hotness"),
                            post_data["commentsCount"],
                            post_data["favoritesCount"],
                            post_data["hitsCount"],
                            post_data["likes"]["count"],
                            post_data["likes"]["summ"],
                            post_id
                        )
                    )

                except Exception:
                    # Identify the failing post, then let the error propagate.
                    print(f"Exception for post #{post_id}")
                    raise

            page += 1
            # One commit per page keeps transactions bounded.
            self.db.commit()
class GetSubsiteTimeline:
    """Pages through one subsite's timeline via the API and upserts the
    posts' engagement counters into the ``posts`` table."""

    @dataclass
    class Stats:
        # Running counters used for periodic progress logging.
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()
        self.offset = 1  # starting pagination offset
        self.count = 50  # posts requested per API call
        self.subsite_id = 203796  # hard-coded target subsite id

    @staticmethod
    def __time():
        """Current wall-clock time as HH:MM:SS, for log lines."""
        return datetime.now().strftime("%H:%M:%S")

    def get_posts(self):
        """Fetch the timeline page by page until the API returns an empty
        result, committing every 10 requests and once more at the end."""
        print(f"Started at {self.__time()}")
        timeline = self.__get_timeline(self.subsite_id, 'new', self.count,
                                       self.offset)
        while timeline:
            print(f'{len(timeline)}/{self.offset}')
            parsed_timeline = self.__parse_timeline(timeline)
            self.__db_insert(parsed_timeline)

            if self.stats.request_count % 10 == 0:
                self.db.commit()
                print(
                    f'{self.__time()}: {self.stats.request_count} requests processed ({self.stats.post_count} posts, {self.stats.error_count} errors)'
                )
            self.offset += self.count
            timeline = self.__get_timeline(self.subsite_id, 'new', self.count,
                                           self.offset)

        self.db.commit()

    def __db_insert(self, parsed_timeline: list):
        """Upsert each parsed post row. The dict insertion order produced by
        __parse_timeline must match the column list in the SQL below."""
        for post in parsed_timeline:
            self.db.execute_insert(
                """
                    insert into posts (id, commentscount, favoritescount, hitscount, likescount, date_created, subsite_id, is_show_thanks, is_filled_by_editors, iseditorial)
                        values (%s, %s, %s, %s, %s, to_timestamp(%s), %s, %s, %s, %s)
                    on conflict (id)
                        do update set date_created = excluded.date_created;
                """, ([*post.values()]))
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1

    def __parse_timeline(self, timeline: list) -> list:
        """Map raw API post dicts to flat rows, skipping reposts.

        Fix: ``post.get('likes')`` can be None or absent, so chaining
        ``.get('count', 0)`` on it raised AttributeError; missing likes
        now fall back to a count of 0.
        """
        parsed = [
            dict(entry_id=post.get('id'),
                 commentsCount=post.get('commentsCount'),
                 favoritesCount=post.get('favoritesCount'),
                 hitsCount=post.get('hitsCount'),
                 likesCount=(post.get('likes') or {}).get('count', 0),
                 date_created=post.get('date'),
                 subsite_id=self.subsite_id,
                 is_show_thanks=post.get('is_show_thanks'),
                 is_filled_by_editors=post.get('is_filled_by_editors'),
                 isEditorial=post.get('isEditorial')) for post in timeline
            if not post.get('isRepost')
        ]
        return parsed

    def __get_timeline(self,
                       subsite: int,
                       sorting: str = 'new',
                       count: int = 50,
                       offset: int = 0) -> list:
        """Fetch one timeline page; on HTTP 429 reset the streak counter,
        sleep 60 seconds and retry (recursively) with the same arguments."""
        response = self.api.execute(
            f"subsite/{subsite}/timeline/{sorting}?count={count}&offset={offset}"
        )
        if response.status_code == 429:
            print(
                f'{self.__time()}: 429 Too Many Requests. Requests processed since last 429 error: {self.stats.requests_since_last_429}'
            )
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
            return self.__get_timeline(subsite, sorting, count, offset)

        response_json = response.json()
        print(f"__get_timeline:{response.status_code}: {self.__time()}")

        return response_json.get('result')
class GetSubsites:
    """Re-fetches subsite data for user ids that previously produced 404
    errors, skipping ids already present in the ``subsites`` table."""

    @dataclass
    class Stats:
        # Progress counters; only requests_since_last_429 is used here.
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """For every 404 recorded in ``user_errors``, fetch the subsite
        unless a row with that id is already stored."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))

        error_rows = self.db.execute_select(
            """
                select user_id
                from user_errors
                where status_code = 404
            """, None)

        for (user_id,) in error_rows:
            already_stored = self.db.execute_select_one(
                """
                    select id
                    from subsites
                    where id = %s
                """, (user_id, ))
            if already_stored is None:
                self.__get_subsite(user_id)

    def __get_subsite(self, subsite_id):
        """Fetch one subsite; on HTTP 429 sleep 60 seconds and retry, and on
        success upsert the payload and commit immediately."""
        response = self.api.execute("subsite/" + str(subsite_id))
        if response.status_code == 429:
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
            self.__get_subsite(subsite_id)
            return

        print(
            str(subsite_id) + ": " + str(response.status_code) + ": " +
            str(response.json()))
        if response.status_code == 200:
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (subsite_id, json.dumps(response.json()["result"])))
            self.db.commit()
Esempio n. 10
0
import plotly.graph_objects as go

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

# https://plotly.com/python/filled-area-plots/

config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])

fig = go.Figure()

subsite_ids, subsite_names = db.fetch_data(
    """
        select subsite_id, subsite_name
        from posts
        where type = 1 and subsite_type = 2
        group by subsite_id, subsite_name
        order by count(id) desc
        limit 19
    """,
    None
)

for (subsite_id, subsite_name) in zip(subsite_ids, subsite_names):
    x, y = db.fetch_data(
        """
            with time_scale as (
                select date_trunc('week', generate_series) as time_window
                from generate_series('2018-06-06'::timestamp, '2020-06-22'::timestamp, '1 week'::interval)
                order by time_window
Esempio n. 11
0
import json
import time
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper

# Poll a single post forever, snapshotting its full JSON every N minutes.
post_id = 220958
request_interval_minutes = 30

config = ConfigLoader.load()
api = OchobaApiWrapper(config["api"])
db = DataBaseWrapper(config["db"])

print(datetime.now().strftime("%H:%M:%S") + ": Started")

while True:
    resp = api.execute("entry/" + str(post_id))
    print(datetime.now().strftime("%H:%M:%S") + ": Got " +
          str(resp.status_code))
    # Store a timestamped snapshot only on success; other statuses are
    # merely logged and the loop keeps polling.
    if resp.status_code == 200:
        db.execute_insert(
            """
                insert into post_history (post_id, request_time, json)
                    values (%s, %s, %s);
            """, (post_id, datetime.now(), json.dumps(resp.json())))
        db.commit()

    time.sleep(request_interval_minutes * 60)
import json

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])

offset_base = 0
page_size = 500
page = 0
while True:
    offset = offset_base + page_size * page
    result = db.execute_select(
        """
            select id, json from post_history
                where hits is null
                order by id
                limit %s offset %s
        """,
        (page_size, offset)
    )
    if len(result) == 0:
        break

    for row in result:
        record_id = row[0]
        print("parsing " + str(record_id))

        post_data = json.loads(row[1])["result"]
        db.execute_update(
Esempio n. 13
0
import matplotlib.pyplot as plt

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

# Histogram: average view count of posts by publication hour, H1 2020.
cfg = ConfigLoader.load()
database = DataBaseWrapper(cfg["db"])

x, y = database.fetch_data(
    """
        select extract(hour from created) as hour, avg(hits_count)
        from posts
        where type = 1 and created between '2020-01-01' and '2020-07-01'
        group by hour
        order by hour
    """, None
)

plt.bar(x, y, color='green')
plt.grid(True, axis='y')
plt.title("Среднее количество просмотров (январь-июнь 2020)")
plt.xlabel("Время публикации (Москва, GMT+3)")
plt.ylabel("Среднее количество просмотров")
plt.xticks(x)

plt.show()
 def __init__(self):
     # NOTE(review): orphaned fragment (one-space indent, no enclosing class
     # visible); matches the DB-only constructors such as
     # ParseUserData.__init__ — presumably a stray duplicate. Confirm.
     config = ConfigLoader.load()
     self.db = DataBaseWrapper(config["db"])
import matplotlib.pyplot as plt

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

# Histogram: average view count of "long-read" tagged posts per ISO day of
# week, H1 2020.
cfg = ConfigLoader.load()
database = DataBaseWrapper(cfg["db"])

x, y = database.fetch_data(
    """
        select extract(isodow from created) as hour, avg(hits_count)
        from posts
        join post_tags
            on posts.id = post_tags.post_id
                and posts.type = 1 and created between '2020-01-01' and '2020-07-01'
                and not posts.is_editorial
                and post_tags.value in ('#лонг', '#лонгрид', '#long', '#longread')
        group by hour
        order by hour
    """, None)

plt.bar(x, y, color='green')
plt.grid(True, axis='y')
plt.title("Среднее количество просмотров лонгов (январь-июнь 2020)")
plt.xlabel("День недели (Москва, GMT+3)")
plt.ylabel("Среднее количество просмотров лонгов")
plt.xticks(x)

plt.show()
Esempio n. 16
0
class GetPosts:
    """Crawls post ids 1..164999 through the API, storing each post's raw
    JSON in ``posts`` and failed lookups in ``post_errors``."""

    @dataclass
    class Stats:
        # Progress counters, reported every 100 requests.
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_posts(self):
        """Iterate over the fixed id range, committing every 100 requests
        and always once more on exit — even after an exception."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        try:
            for post_id in range(1, 165000):
                if self.stats.request_count % 100 == 0:
                    self.db.commit()
                    progress = (
                        "{0}: {1} requests processed ({2} posts, {3} errors)".
                        format(datetime.now().strftime("%H:%M:%S"),
                               self.stats.request_count, self.stats.post_count,
                               self.stats.error_count))
                    print(progress)

                self.__get_post(post_id)

        except Exception:
            print("Exception!")
            raise

        finally:
            # Persist whatever was fetched before any failure.
            self.db.commit()

    def __get_post(self, post_id):
        """Fetch one post; on HTTP 429 sleep 60 seconds and retry, then
        route the payload to ``posts`` or ``post_errors``."""
        response = self.api.execute("entry/" + str(post_id))
        if response.status_code == 429:
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
            self.__get_post(post_id)
            return

        payload = response.json()
        print(str(response.status_code) + ": " + str(payload))

        if "error" not in payload:
            self.db.execute_insert(
                """
                    insert into posts (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (post_id, json.dumps(payload)))
            self.stats.post_count += 1
        else:
            self.db.execute_insert(
                """
                    insert into post_errors (post_id, status_code, response)
                        values (%s, %s, %s);
                """,
                (post_id, response.status_code, json.dumps(payload)))
            self.stats.error_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
Esempio n. 17
0
from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.sql_plot import SqlPlot

percentiles = [0.75, 0.5, 0.25]

config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])

queries = []
for percentile in percentiles:
    queries.append({
        'query': f"""
            with length_data as (
                select
                       posts.id as post_id,
                       posts.created as created,
                       sum(coalesce(blocks.text_length, 0)) as text_length
                from posts
                join post_blocks blocks
                    on posts.id = blocks.post_id
                        and posts.type = 1
                group by posts.id
            )
            select
                date_trunc('week', created) as time_window,
                percentile_disc({percentile}) within group (order by text_length) as percentile
            from length_data
            where created between '2016-06-01' and '2020-07-19'
            group by time_window
            order by time_window
Esempio n. 18
0
from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.sql_plot import SqlPlot

# Plot weekly new-post counts for each subsite id in this list.
subsite_ids = [64954]

config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])

queries = []
for subsite_id in subsite_ids:
    # Resolve the human-readable name once; it becomes the legend label.
    subsite_name = db.execute_select_one(
        "select name from subsites where id = %s", (subsite_id, ))[0]
    queries.append({
        'label': subsite_name,
        'query': f"""
            select date_trunc('week', created) as time_window, count(*)
            from posts
            where type = 1 and subsite_id = {subsite_id}
                and created between '2018-06-01' and '2020-07-20'
            group by time_window
            order by time_window
        """,
    })

SqlPlot().show(sql_queries=queries,
               title="Количество новых постов за неделю",
               x_label="Время",
               y_label="Новые посты за неделю")
 def __init__(self):
     # NOTE(review): orphaned fragment (one-space indent, no enclosing class
     # visible); matches the api+db constructors such as GetPosts.__init__ —
     # presumably a stray duplicate. Confirm before relying on it.
     config = ConfigLoader.load()
     self.api = OchobaApiWrapper(config["api"])
     self.db = DataBaseWrapper(config["db"])
     self.stats = self.Stats()
import json

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper

config = ConfigLoader.load()
db = DataBaseWrapper(config["db"])

offset_base = 0
page_size = 500
page = 0
while True:
    offset = offset_base + page_size * page
    print(f"Fetch page #{page} ({offset})")
    result = db.execute_select(
        """
            select id, json from subsites
                order by id
                limit %s offset %s
        """, (page_size, offset))
    if len(result) == 0:
        break

    for row in result:
        subsite_id = row[0]
        subsite_data = json.loads(row[1])
        print(subsite_data)
        db.execute_update(
            """
                update subsites
                    set