class GetSubsites:
    """Import the "sections" and "companies" subsite lists into the ``subsites`` table.

    Every entry returned by the ``subsites_list/<type>`` endpoint is upserted
    by id, with the whole entry stored as a JSON string.
    """

    @dataclass
    class Stats:
        # Counters kept for parity with the other scrapers; this class only
        # instantiates them and never updates them.
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Load the configuration and build the API and database wrappers."""
        settings = ConfigLoader.load()
        self.api = OchobaApiWrapper(settings["api"])
        self.db = DataBaseWrapper(settings["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """Fetch both subsite lists, then commit once at the end."""
        print(f"Started at {datetime.now():%H:%M:%S}")
        for subsite_type in ("sections", "companies"):
            self.__get_subsites_list(subsite_type)
        self.db.commit()

    def __get_subsites_list(self, subsite_type):
        """Upsert every entry of the ``subsites_list/<subsite_type>`` response.

        The response status is not checked; a body without a ``"result"``
        key raises ``KeyError``.
        """
        payload = self.api.execute("subsites_list/" + subsite_type).json()
        for entry in payload["result"]:
            print(entry)
            row = (entry["id"], json.dumps(entry))
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, row)
class GetSubsites:
    """Fetch subsites for ids that were recorded as 404 user errors.

    For every ``user_errors`` row with status 404 whose id is not yet in
    ``subsites``, the ``subsite/<id>`` endpoint is queried and a successful
    response is upserted into ``subsites``.
    """

    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        # Requests completed since the last HTTP 429; reported in the 429 log line.
        requests_since_last_429: int = 0

    def __init__(self):
        """Load the configuration and build the API and database wrappers."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_subsites(self):
        """Request every 404-flagged id that has no ``subsites`` row yet."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))

        errors = self.db.execute_select(
            """
                select user_id
                from user_errors
                where status_code = 404
            """, None)

        for error in errors:
            user_id = error[0]
            known = self.db.execute_select_one(
                """
                    select id
                    from subsites
                    where id = %s
                """, (user_id, ))
            if known is None:
                self.__get_subsite(user_id)

    def __get_subsite(self, subsite_id):
        """Fetch one subsite and store it when the API answers 200.

        On HTTP 429 the request is repeated after a 60 second pause. The
        retry is a loop rather than recursion, so a long rate-limit streak
        cannot exhaust the call stack.
        """
        while True:
            response = self.api.execute("subsite/" + str(subsite_id))
            if response.status_code != 429:
                break
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        print(
            str(subsite_id) + ": " + str(response.status_code) + ": " +
            str(response.json()))

        if response.status_code == 200:
            self.db.execute_insert(
                """
                    insert into subsites (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (subsite_id, json.dumps(response.json()["result"])))
            self.db.commit()

        # Previously the counters were never advanced, so the 429 message
        # always reported "0 requests processed".
        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
class GetSubsiteTimeline:
    """Page through one subsite's "new" timeline and upsert its posts.

    Pages of ``self.count`` entries are requested starting at ``self.offset``
    until the API returns an empty (or missing) ``result``.
    """

    @dataclass
    class Stats():
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Load the configuration and set the paging parameters."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()
        # NOTE(review): paging starts at offset 1 while __get_timeline defaults
        # to 0 - confirm whether the first timeline entry is skipped on purpose.
        self.offset = 1
        self.count = 50
        self.subsite_id = 203796

    @staticmethod
    def __time():
        """Return the current local time formatted as HH:MM:SS."""
        return datetime.now().strftime("%H:%M:%S")

    def get_posts(self):
        """Download the timeline page by page and store every parsed post.

        Commits and prints progress whenever the request counter is a
        multiple of 10, and commits once more when paging ends.
        """
        print(f"Started at {self.__time()}")
        timeline = self.__get_timeline(self.subsite_id, 'new', self.count,
                                       self.offset)
        while timeline:
            print(f'{len(timeline)}/{self.offset}')
            parsed_timeline = self.__parse_timeline(timeline)
            self.__db_insert(parsed_timeline)
            if self.stats.request_count % 10 == 0:
                self.db.commit()
                print(
                    f'{self.__time()}: {self.stats.request_count} requests processed ({self.stats.post_count} posts, {self.stats.error_count} errors)'
                )
            self.offset += self.count
            timeline = self.__get_timeline(self.subsite_id, 'new', self.count,
                                           self.offset)
        self.db.commit()

    def __db_insert(self, parsed_timeline: list):
        """Upsert one parsed page and advance the counters.

        Each dict's values are passed positionally, so the key order produced
        by ``__parse_timeline`` must match the column list below.
        """
        for post in parsed_timeline:
            self.db.execute_insert(
                """
                    insert into posts (id, commentscount, favoritescount, hitscount, likescount, date_created, subsite_id, is_show_thanks, is_filled_by_editors, iseditorial)
                        values (%s, %s, %s, %s, %s, to_timestamp(%s), %s, %s, %s, %s)
                    on conflict (id)
                        do update set date_created = excluded.date_created;
                """, list(post.values()))
            self.stats.post_count += 1
        # One page corresponds to one API request.
        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1

    def __parse_timeline(self, timeline: list) -> list:
        """Convert raw timeline entries into row dicts, skipping reposts.

        A missing or null ``likes`` object yields ``likesCount == 0`` instead
        of raising ``AttributeError``.
        """
        parsed = []
        for post in timeline:
            if post.get('isRepost'):
                continue
            likes = post.get('likes') or {}
            parsed.append(
                dict(entry_id=post.get('id'),
                     commentsCount=post.get('commentsCount'),
                     favoritesCount=post.get('favoritesCount'),
                     hitsCount=post.get('hitsCount'),
                     likesCount=likes.get('count', 0),
                     date_created=post.get('date'),
                     subsite_id=self.subsite_id,
                     is_show_thanks=post.get('is_show_thanks'),
                     is_filled_by_editors=post.get('is_filled_by_editors'),
                     isEditorial=post.get('isEditorial')))
        return parsed

    def __get_timeline(self,
                       subsite: int,
                       sorting: str = 'new',
                       count: int = 50,
                       offset: int = 0) -> list:
        """Request one timeline page and return its ``result`` value.

        HTTP 429 is retried after a 60 second pause, in a loop rather than by
        recursion. Returns ``None`` when the response has no ``result`` key,
        which ends the paging loop in ``get_posts``.
        """
        while True:
            response = self.api.execute(
                f"subsite/{subsite}/timeline/{sorting}?count={count}&offset={offset}"
            )
            if response.status_code != 429:
                break
            print(
                f'{self.__time()}: 429 Too Many Requests. Requests processed since last 429 error: {self.stats.requests_since_last_429}'
            )
            self.stats.requests_since_last_429 = 0
            time.sleep(60)
        response_json = response.json()
        print(f"__get_timeline:{response.status_code}: {self.__time()}")
        return response_json.get('result')
class GetPosts:
    """Crawl entries by sequential id and store each response.

    Responses containing an ``"error"`` key go to ``post_errors``; all other
    responses are upserted into ``posts`` as JSON.
    """

    @dataclass
    class Stats:
        request_count: int = 0
        post_count: int = 0
        error_count: int = 0
        requests_since_last_429: int = 0

    def __init__(self):
        """Load the configuration and build the API and database wrappers."""
        config = ConfigLoader.load()
        self.api = OchobaApiWrapper(config["api"])
        self.db = DataBaseWrapper(config["db"])
        self.stats = self.Stats()

    def get_posts(self, start_id=1, stop_id=165000):
        """Request every entry id in ``range(start_id, stop_id)``.

        Args:
            start_id: First id to request (inclusive). Defaults to the
                previously hard-coded 1.
            stop_id: Id to stop before (exclusive). Defaults to the
                previously hard-coded 165000.

        Commits and prints progress whenever the request counter is a
        multiple of 100. A final commit always runs, even when an exception
        is re-raised.
        """
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        try:
            for post_id in range(start_id, stop_id):
                if self.stats.request_count % 100 == 0:
                    self.db.commit()
                    now = datetime.now().strftime("%H:%M:%S")
                    print(
                        f"{now}: {self.stats.request_count} requests processed "
                        f"({self.stats.post_count} posts, {self.stats.error_count} errors)"
                    )
                self.__get_post(post_id)
        except Exception:
            print("Exception!")
            raise
        finally:
            self.db.commit()

    def __get_post(self, post_id):
        """Fetch one entry and record it as either a post or an error.

        HTTP 429 is retried after a 60 second pause, in a loop rather than by
        recursion, so a long rate-limit streak cannot exhaust the call stack.
        """
        while True:
            response = self.api.execute("entry/" + str(post_id))
            if response.status_code != 429:
                break
            # Too Many Requests
            print(
                datetime.now().strftime("%H:%M:%S") +
                ": 429 Too Many Requests. Requests processed since last 429 error: "
                + str(self.stats.requests_since_last_429) +
                ". Wait for 60 seconds and repeat")
            self.stats.requests_since_last_429 = 0
            time.sleep(60)

        response_json = response.json()
        print(str(response.status_code) + ": " + str(response_json))

        if "error" in response_json:
            self.db.execute_insert(
                """
                    insert into post_errors (post_id, status_code, response)
                        values (%s, %s, %s);
                """, (post_id, response.status_code, json.dumps(response_json)))
            self.stats.error_count += 1
        else:
            self.db.execute_insert(
                """
                    insert into posts (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (post_id, json.dumps(response_json)))
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1
import json
import time
from datetime import datetime

from src.common.config_loader import ConfigLoader
from src.common.data_base_wrapper import DataBaseWrapper
from src.common.ochoba_api_wrapper import OchobaApiWrapper

# Polls a single entry forever and appends every successful (HTTP 200)
# response to the post_history table, committing after each insert.

# Id of the entry being tracked.
post_id = 220958
# Pause between two consecutive requests, in minutes.
request_interval_minutes = 30

config = ConfigLoader.load()
api = OchobaApiWrapper(config["api"])
db = DataBaseWrapper(config["db"])

print(f"{datetime.now():%H:%M:%S}: Started")

while True:
    response = api.execute(f"entry/{post_id}")
    print(f"{datetime.now():%H:%M:%S}: Got {response.status_code}")

    # Any non-200 response is only logged; the loop simply waits for the next round.
    if response.status_code == 200:
        db.execute_insert(
            """
                insert into post_history (post_id, request_time, json)
                    values (%s, %s, %s);
            """, (post_id, datetime.now(), json.dumps(response.json())))
        db.commit()

    time.sleep(60 * request_interval_minutes)
class GetPosts:
    """Watch categories and users for new posts, sample their stats and print recommendations.

    New posts are stored in ``posts``. Their views, bookmarks, comments and
    rating are sampled into per-metric counter tables as the post ages, and a
    score is computed after every sample. Posts scoring above 50 are printed
    as recommendations.

    NOTE(review): SQL parameters are passed exactly as before (``%d``/``%s``
    placeholders, scalar values for single parameters). How the database
    wrapper substitutes them is not visible here.
    """

    # The four per-metric counter tables share one schema.
    _COUNTER_TABLES = ("posts_views", "posts_bookmarks", "posts_comments",
                       "posts_rating")

    # statsCalculated value -> minimum post age in seconds before the next sample.
    _MIN_AGE_FOR_STATE = {0: 3600, 1: 7200, 2: 10800}

    def __init__(self):
        """Load the configuration, build the wrappers and prepare the schema."""
        config = ConfigLoader.load()
        self.categories = config["parse_targets"]["categories"]
        self.users = config["parse_targets"]["users"]
        self.api = OchobaApiWrapper(config["api"])
        self.db = GetDataBaseWrapper(config["db"])
        self.middle_rating = {}
        self.__init_database()

    def scan(self):
        """Run one pass: refresh categories, collect new posts, update stats."""
        print("Started at " + datetime.now().strftime("%H:%M:%S"))
        print("update categories")
        self.__update_categories()
        print("check subsites")
        self.__collect_new_posts()

        posts = self.db.execute_select(
            "select * from posts where statsCalculated < 5 and published=0 order by publication_time asc",
            [])
        if posts is None:
            return

        print("update posts")
        for post in posts:
            self.__refresh_post(post)
        self.db.commit()

    def __collect_new_posts(self):
        """Store posts newer than each watched category's ``last_post_time``."""
        for category in self.categories:
            time.sleep(self.api.min_delay)
            if category["is_user"] > 0:
                timeline = self.api.execute("user/%s/entries" %
                                            category["userid"])
            else:
                timeline = self.api.execute("subsite/%s/timeline/new" %
                                            category["userid"])
            if self.__is_error(timeline):
                continue
            entries = timeline.json()["result"]
            if len(entries) == 0:
                continue

            for post in entries:
                # Stop at the first entry older than the last one seen.
                if post["date"] < category["last_post_time"]:
                    break
                if post["isEditorial"] > 0:
                    continue
                self.add_post_to_database(post)

            first_post_time = entries[0]["date"]
            if first_post_time > category["last_post_time"]:
                category["last_post_time"] = first_post_time
                self.db.execute_update(
                    "update categories set last_post_time=%d where id='%s'",
                    (first_post_time, category["id"]))
        self.db.commit()

    def __refresh_post(self, post):
        """Sample the current stats of one ``posts`` row and advance its state.

        ``post`` is a full ``posts`` row: index 0 is ``post_id``, index 3 is
        ``publication_time`` and index 6 is ``statsCalculated``, following
        the table definition in ``__init_database``.
        """
        post_id = post[0]
        state = post[6]
        age = time.time() - post[3]

        min_age = self._MIN_AGE_FOR_STATE.get(state)
        if min_age is not None and age < min_age:
            return

        time.sleep(self.api.min_delay)
        response = self.api.execute("entry/%d" % post_id)
        if self.__is_error(response):
            # Only drop posts that no longer exist. A 429 rate limit is
            # transient and must not delete a tracked post.
            if response.status_code == 404:
                self.db.execute("delete from posts where post_id=%d" % post_id)
            return

        post_stats = response.json()["result"]
        samples = (
            ("posts_views", post_stats["hitsCount"]),
            ("posts_bookmarks", post_stats["favoritesCount"]),
            ("posts_comments", post_stats["commentsCount"]),
            ("posts_rating", post_stats["likes"]["summ"]),
        )

        if state == 0:
            # First sample: make sure every counter table has a row for the post.
            for table, _ in samples:
                existing = self.db.execute_select_one(
                    "select * from " + table + " where post_id=%d", (post_id))
                if existing is None:
                    self.db.execute_insert(
                        "insert into " + table +
                        " (post_id, count1hr, count2hr, count3hr) values (%d, 0, 0, 0)",
                        (post_id))

        if age > 43000:
            next_state, column = 5, "now"
        elif age > 12000:
            next_state, column = 4, "now"
        elif age > 10800:
            next_state, column = 3, "count3hr"
        elif age > 7200:
            next_state, column = 2, "count2hr"
        elif age > 3600:
            next_state, column = 1, "count1hr"
        else:
            # No threshold reached (e.g. age is exactly 3600). Previously this
            # left the state unset or reused the previous post's values.
            return

        for table, value in samples:
            self.db.execute_update(
                "update " + table + " set %s=%d, now=%d where post_id=%d",
                (column, value, value, post_id))
        self.db.execute_update(
            "update posts set statsCalculated=%d where post_id='%d'",
            (next_state, post_id))

        self.publish(post, post_stats)
        if next_state >= 4:
            self.db.execute_update(
                "update posts set published=1 where post_id='%d'", (post_id))

    def __init_database(self):
        """Create missing tables and seed ``categories`` from the configuration."""
        self.db.execute("""
            create table if not exists categories (
                id integer primary key autoincrement not null,
                userid integer not null,
                name varchar,
                is_user integer not null,
                subscribers integer,
                last_post_time integer not null,
                watch integer not null
            );
        """)
        self.db.execute("""
            create table if not exists posts (
                post_id integer primary key not null,
                cat_id integer not null,
                name varchar not null,
                publication_time integer not null,
                words integer not null,
                media integer not null,
                statsCalculated integer not null,
                published integer not null,
                score integer
            );
        """)
        for table in self._COUNTER_TABLES:
            self.db.execute("""
                create table if not exists """ + table + """ (
                    post_id integer not null,
                    count1hr integer,
                    count2hr integer,
                    count3hr integer,
                    now integer
                );
            """)

        # New categories start two days in the past.
        last_post_time = time.time() - 2 * 60 * 60 * 24
        self.__seed_categories(self.categories, 0, last_post_time)
        self.__seed_categories(self.users, 1, last_post_time)
        self.db.commit()

    def __seed_categories(self, user_ids, is_user, last_post_time):
        """Insert a watched ``categories`` row for every id not stored yet."""
        insert_sql = (
            "insert into categories (userid, is_user, last_post_time, watch) "
            "values (%d, " + str(is_user) + ", %d, 1);")
        for user_id in user_ids:
            existing = self.db.execute_select_one(
                "select userid from categories where userid='%s'", (user_id))
            if existing is None:
                self.db.execute_insert(insert_sql, (user_id, last_post_time))

    def __update_categories(self):
        """Refresh name and subscriber count of every category, then reload the watched ones."""
        self.categories = self.db.execute_select(
            "select id, userid, is_user from categories;", [])
        for line in self.categories:
            time.sleep(self.api.min_delay)
            info = self.__get_category(line[1], line[2])
            if info is None:
                continue
            self.db.execute_update(
                "update categories set subscribers='%s', name='%s' where id='%s'",
                (info["subscribers_count"], info["name"], line[0]))

        watched = self.db.execute_select(
            "select * from categories where watch=1;", [])
        self.categories = [{
            "id": row[0],
            "userid": row[1],
            "name": row[2],
            "is_user": row[3],
            "subscribers": row[4],
            "last_post_time": row[5]
        } for row in watched]
        self.db.commit()

    def __get_category(self, userid, isuser):
        """Return the API ``result`` for a user or subsite, or ``None`` on 429/404."""
        endpoint = 'user/' if isuser > 0 else 'subsite/'
        response = self.api.execute(endpoint + str(userid))
        if self.__is_error(response):
            return None
        return response.json()["result"]

    def __is_error(self, response):
        """Report whether the response is a 429 or 404, logging the stack and status.

        Any other status code is treated as success.
        """
        reasons = {429: "Too Many Requests", 404: "Not Found"}
        reason = reasons.get(response.status_code)
        if reason is None:
            return False
        traceback.print_stack()
        print(datetime.now().strftime("%H:%M:%S") + ": %d %s " %
              (response.status_code, reason))
        return True

    def add_post_to_database(self, post):
        """Insert a post with its word and media counts unless it is already stored.

        Returns:
            The dict of inserted values, or ``None`` when the post already
            exists in ``posts``.
        """
        if self.db.execute_select_one("select * from posts where post_id=%d",
                                      (post["id"])) is not None:
            return

        data = {}
        data["post_id"] = post["id"]
        data["cat_id"] = post["subsite"]["id"]
        # Quotes are stripped because the name is embedded in a quoted SQL literal below.
        data["name"] = re.sub(r"[\"\']", "", post["title"])
        data["media"] = 0
        data["words"] = 0
        data["publication_time"] = post["date"]

        # The title counts towards the word total. Work on a copy so the
        # caller's post is not modified.
        blocks = list(post["blocks"])
        blocks.append({"type": "text", "data": {"text": data["name"]}})

        for block in blocks:
            blocktype = block["type"]
            text = ""
            if blocktype in ("text", "header", "quote", "incut"):
                text = block['data']['text']
            elif blocktype == "media":
                for item in block['data']['items']:
                    text += item['title'] + " "
                    data["media"] += 1
            elif blocktype == "list":
                for item in block['data']['items']:
                    text += item + " "
            elif blocktype == "tweet":
                text = block['data']['tweet']['data']['tweet_data'][
                    'full_text']
                data["media"] += 1
            elif blocktype in ("video", "audio", "link"):
                data["media"] += 1

            # Drop link targets, em dashes and hashtags before counting words.
            text = re.sub(r']\([^\(\)]+\)', '', text)
            text = re.sub(r'—', '', text)
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'\#\w+', '', text)
            data["words"] += len(text.split())

        self.db.execute_insert(
            """
                insert into posts (post_id, cat_id, name, publication_time, words, media, statsCalculated, published)
                    values (%d, %d, "%s", %d, %d, %d, 0, 0);
            """, (data["post_id"], data["cat_id"], str(
                data["name"]), data["publication_time"], data["words"],
                  data["media"]))
        return data

    def publish(self, post, post_stats):
        """Compute and store the post's score and print it when it exceeds 50.

        Returns without scoring when the post, any of its counters, or the
        category information cannot be obtained.
        """
        post_data = self.db.execute_select_one(
            """
                select
                    posts.post_id,
                    categories.name,
                    posts.name,
                    posts.publication_time,
                    posts.words,
                    posts.media,
                    posts_views.now,
                    posts_bookmarks.now,
                    posts_rating.now,
                    posts_comments.now,
                    categories.subscribers
                from posts
                left join categories on posts.cat_id=categories.userid
                left join posts_views on posts.post_id=posts_views.post_id
                left join posts_bookmarks on posts.post_id=posts_bookmarks.post_id
                left join posts_rating on posts.post_id=posts_rating.post_id
                left join posts_comments on posts.post_id=posts_comments.post_id
                where posts.post_id=%d
            """, (post[0]))
        if post_data is None:
            return

        post_id = post_data[0]
        category = post_data[1]
        title = post_data[2]
        publication_time = post_data[3]
        words = post_data[4]
        media = post_data[5]
        views = post_data[6]
        bookmarks = post_data[7]
        rating = post_data[8]
        comments = post_data[9]
        category_subs = post_data[10]

        if any(value is None for value in (views, bookmarks, rating, comments)):
            return

        if category_subs is None:
            # The post's category is not stored yet: fetch it and remember it unwatched.
            is_user = post_stats["subsite"]["type"] == 1
            info = self.__get_category(post_stats["subsite"]["id"], is_user)
            time.sleep(self.api.min_delay)
            if info is None:
                # The lookup failed (429/404); previously this raised TypeError.
                return
            category = info['name']
            category_subs = info["subscribers_count"]
            self.db.execute_insert(
                """
                    insert into categories (userid, name, is_user, subscribers, last_post_time, watch)
                        values (%s, '%s', %d, %d, 0, 0);
                """, (info["id"], info['name'], is_user,
                      info["subscribers_count"]))

        # Cap large audiences. The floor of 1 keeps the expected values
        # non-zero, so a category with 0 subscribers no longer divides by zero.
        category_subs = max(min(category_subs, 50000), 1)
        expected_views = pow(category_subs * 2.3, 0.71)
        expected_likes = pow(category_subs * 3.5, 0.38)
        expected_bookmarks = pow(category_subs * 3.5, 0.38)
        expected_comments = pow(category_subs * 3.5, 0.38)

        score = (words / 500.0 + media / 10.0 + views / expected_views +
                 bookmarks / expected_bookmarks + rating / expected_likes +
                 comments / expected_comments)
        score = int(score * 10)
        self.db.execute_update("update posts set score=%d where post_id='%d'",
                               (score, post[0]))

        if score > 50:
            print(
                "Recommended: [https://dtf.ru/%d][%s][Score: %s] \"%s\" From %s / %d Words / %d Mediafiles / %d Views / %d Bookmarks / %d Rating / %d Comments"
                % (post_id,
                   time.strftime('%H:%M %m-%d-%YMSK',
                                 time.localtime(publication_time + 10800)),
                   str(score).zfill(4), title, category, words, media, views,
                   bookmarks, rating, comments))
            self.db.execute_update(
                "update posts set published=1 where post_id='%d'", (post[0]))

    def __get_post(self, post_id):
        """Store the raw entry response as either a post or an error.

        NOTE(review): nothing in this class calls this method. It updates
        ``self.stats``, which ``__init__`` never creates, and writes to
        ``post_errors`` and ``posts (id, json)``, which ``__init_database``
        does not define. It looks like a leftover from another scraper;
        confirm before use.
        """
        response = self.api.execute("entry/" + str(post_id))
        response_json = response.json()
        print(str(response.status_code) + ": " + str(response_json))

        if "error" in response_json:
            self.db.execute_insert(
                """
                    insert into post_errors (post_id, status_code, response)
                        values (%s, %s, %s);
                """, (post_id, response.status_code, json.dumps(response_json)))
            self.stats.error_count += 1
        else:
            self.db.execute_insert(
                """
                    insert into posts (id, json)
                        values (%s, %s)
                    on conflict (id)
                        do update set json = excluded.json;
                """, (post_id, json.dumps(response_json)))
            self.stats.post_count += 1

        self.stats.request_count += 1
        self.stats.requests_since_last_429 += 1

    def print_post_info(self, id):
        """Delete the stored post, re-fetch it, re-insert it and print the stored values.

        The parameter keeps its original name ``id`` for callers that pass it
        by keyword.
        """
        self.db.execute("delete from posts where post_id=%d" % id)
        post = self.api.execute("entry/%d" % id)
        if self.__is_error(post):
            print("error")
            # Previously execution continued and failed on the error response body.
            return
        print(self.add_post_to_database(post.json()['result']))