def load_cfg():
    """Reload collector settings from COLLECTOR_CONFIG into module globals."""
    global config_collector, last_id, main_url_part, view_url_part
    global delay_sec, length, chrome_headers, url_tail, send_ids, result_list

    config_collector = config.get_json(COLLECTOR_CONFIG)
    last_id = config_collector["last_id"]
    main_url_part = config_collector["main_url_part"]
    view_url_part = config_collector["view_url_part"]
    delay_sec = config_collector["delay_sec"]
    length = config_collector["length"]
    chrome_headers = {
        "User-Agent": config_collector["user_agent"],
        "accept": config_collector["accept"]
    }
    url_tail = config_collector["url_tail"]
    send_ids = config_collector["send_ids"]
    # Resume from the on-disk result set when present; start empty otherwise.
    result_list = config.get_json(FILE_DB) if os.path.isfile(FILE_DB) else list()
def convert_history_json(file_in, timezone_local):
    """Convert an exported Telegram history JSON file to the internal
    per-channel message format and save it under CHANNELS_HISTORY_DIR.

    Args:
        file_in: path to the Telegram export JSON file.
        timezone_local: timezone used to interpret the export's timestamps.
    """
    # BUG FIX: the original bound this value to the name `json`, shadowing
    # the stdlib `json` module for the rest of the function.
    history = config.get_json(file_in)
    json_array = history.get("messages")
    channel_id = history.get("id")
    if json_array is None or channel_id is None:
        print("Cannot parse Telegram history json")
        return
    messages_list = list()
    for message in json_array:
        msg_props = dict()
        msg_props["id"] = message["id"]
        # Normalize the export's local timestamp to a UTC ISO-8601 string.
        msg_props["date"] = helper.str_to_utc_datetime(
            message["date"], timezone_local,
            config.ISO_LOCAL_DATE_FORMAT).isoformat()
        message_item = message.get("text")
        if message_item is not None:
            # str() because exported "text" may be a list of entities,
            # not a plain string.
            msg_props["text"] = str(message_item)
        reply_to_message_id = message.get("reply_to_message_id")
        if reply_to_message_id is not None:
            msg_props["reply_to_msg_id"] = reply_to_message_id
        messages_list.append(msg_props)
    out_path = os.path.join(
        config.CHANNELS_HISTORY_DIR, f"{channel_id}.json")
    config.set_json(out_path, messages_list)
    print(f"Saved as {out_path}")
def import_json(input_file):
    """Import channel records from a JSON file into the Channel table.

    Uses a parameterized INSERT: the original interpolated values directly
    into the SQL string, which is injection-prone and required manual
    single-quote escaping of the name.

    Args:
        input_file: path to a JSON array of channel objects.
    """
    sql_connection = sqlite3.connect(config.DB_STATS_PATH)
    try:
        cur = sql_connection.cursor()
        json_array = config.get_json(input_file)
        for link_item in json_array:
            id_ = link_item["id"]
            # Some historical exports used a trailing-colon key by mistake;
            # fall back to it, then to the literal string 'NULL' (the
            # original stored the text 'NULL', not SQL NULL — preserved).
            add_date_utc = link_item.get("add_date_utc")
            if add_date_utc is None:
                add_date_utc = link_item.get("add_date_utc:")
            if add_date_utc is None:
                add_date_utc = "NULL"
            change_date_utc = link_item.get("change_date_utc")
            if change_date_utc is None:
                change_date_utc = link_item.get("change_date_utc:")
            if change_date_utc is None:
                change_date_utc = "NULL"
            cur.execute(
                "INSERT INTO Channel VALUES (?,?,?,?,?)",
                (id_, link_item["name"], link_item["access_url"],
                 add_date_utc, change_date_utc))
        sql_connection.commit()
    finally:
        # Close the connection even if an insert fails.
        sql_connection.close()
def insertion_delete_performance(self, file_name, safe_mode):
    """Measure pymongo insertion timing for up to 1,000,000 rows.

    Reads rows from *file_name* and inserts them into conn.repost_detail,
    recording the cumulative elapsed time every 10,000 inserts.

    Args:
        file_name: path of the JSON file with the source rows.
        safe_mode: passed through as pymongo's `safe` write-concern flag.

    Returns:
        list of cumulative elapsed seconds, or None on file I/O error.
    """
    try:
        json_array = get_json(file_name)
        collection = conn.repost_detail
        # BUG FIX: the original accumulated into `tm_srl` but returned the
        # undefined name `time_serial` (NameError on every successful run).
        time_serial = []
        indx = 0
        start_time = time.time()
        for row in json_array[:1000000]:
            indx += 1
            collection.insert(
                {
                    '_id': MAX_ID + indx,
                    'city': "",
                    'created_at': datetime.now(),
                    'favourites_count': 10,
                    'statuses_count': 10,
                    'friends_count': row.get("friends_count", ""),
                    'gender': "m",
                    'location': "beijing",
                    'sm_user_id': 1,
                    'profile_image_url': "",
                    'province': "need to delete",
                    'retweet_status_created_at': datetime.now(),
                    'user_id': 1,
                    'retweet_status_source': "",
                    'screen_name': "",
                    'post_id': 1,
                    'verified': False,
                    'sm_flash_factor': 1,
                    'sm_user_followers_count': 1,
                    'retweet_status_id': 1,
                    'sm_user_screen_name': "",
                    'source': "",
                    'followers_count': row.get("followers_count", ""),
                    'text': row.get("text", ""),
                    'user_created_at': datetime.now(),
                    'retweet_status_text': row.get("retweet_status_text", "")
                },
                safe=safe_mode
            )
            # Checkpoint the cumulative timing every 10k inserts.
            if indx % 10000 == 0:
                time_serial.append(time.time() - start_time)
                # Py2 `print` statements converted to Py3 calls.
                print(time.time() - start_time)
        return time_serial
    except IOError:
        print('file io error')
        return
def insertion_delete_performance(self, file_name, safe_mode):
    """Measure ORM (Repost document) insertion timing for up to 1,000,000 rows.

    Reads rows from *file_name*, saves each as a Repost document, and records
    the cumulative elapsed time every 10,000 inserts.

    Args:
        file_name: path of the JSON file with the source rows.
        safe_mode: passed through to Repost.save's `safe` flag.

    Returns:
        list of cumulative elapsed seconds, or None on file I/O error.
    """
    try:
        json_array = get_json(file_name)
        indx = 0
        time_serial = []
        start_time = time.time()  # calculate the start time
        for row in json_array[:1000000]:
            indx += 1
            repost = Repost(
                _id=MAX_ID + indx,
                city="",
                created_at=date.today(),
                favourites_count=10,
                statuses_count=10,
                friends_count=row.get("friends_count", ""),
                gender="m",
                location="beijing",
                sm_user_id=1,
                profile_image_url="",
                province="need to delete",
                retweet_status_created_at=date.today(),
                user_id=1,
                retweet_status_source="",
                screen_name="",
                retweet_status_text=row.get("retweet_status_text", ""),
                post_id=10,
                verified=False,
                sm_flash_factor=1,
                sm_user_followers_count=1,
                retweet_status_id=1,
                sm_user_screen_name="",
                source="",
                followers_count=row.get("followers_count", ""),
                text=row.get("text", ""),
                user_created_at=date.today()
            )
            # force_insert so an existing _id raises instead of updating.
            repost.save(safe=safe_mode, force_insert=True)
            # Checkpoint the cumulative timing every 10k inserts.
            if indx % 10000 == 0:
                time_serial.append(time.time() - start_time)
                # BUG FIX: Py2 `print` statements are syntax errors in the
                # Python-3 file this lives in (f-strings are used elsewhere).
                print(time.time() - start_time)
        return time_serial
    except IOError:
        print("open file failed")
        return
def update_all_ids(initial_id):
    """Re-insert every row from the source JSON with fresh sequential _ids.

    Each row gets `initial_id` incremented by one; fields not present in the
    source rows are filled with fixed placeholder values.

    Args:
        initial_id: id value preceding the first inserted document.
    """
    try:
        json_array = get_json()
        for row in json_array:
            initial_id += 1
            collection.insert({
                '_id': initial_id,
                'city': "",
                'created_at': datetime.now(),
                'favourites_count': 10,
                'statuses_count': 10,
                'friends_count': row["friends_count"],
                'gender': "m",
                'location': "beijing",
                'sm_user_id': 1,
                'profile_image_url': "",
                'province': "20",
                'retweet_status_created_at': datetime.now(),
                'user_id': 1,
                'retweet_status_source': "",
                'screen_name': "",
                'post_id': 1,
                'verified': False,
                'sm_flash_factor': 1,
                'sm_user_followers_count': 1,
                'retweet_status_id': 1,
                'sm_user_screen_name': "",
                'source': "",
                'followers_count': row["followers_count"],
                'text': row["text"],
                'user_created_at': datetime.now(),
                'retweet_status_text': row.get("retweet_status_text", "")})
    # NOTE(review): IOError only covers the file read inside get_json();
    # insert failures are not caught here — confirm that is intended.
    except IOError:
        # BUG FIX: Py2 `print` statement converted to a Py3 call.
        print('db error')
def analyze_channel(channel_id):
    """Analyze one channel's saved message history for every known symbol.

    Loads the channel's history JSON; on failure, flags the channel for
    re-download (HistoryLoaded = 0). Otherwise, for each symbol, clamps the
    analysis window to the overlap of the DB time range and the channel's own
    message span, then dispatches signal_parser.analyze_channel_symbol to the
    worker pool once a slot is free. Returns early if shutdown is signalled.

    Args:
        channel_id: numeric id of the channel to analyze.
    """
    logging.info('analyze_channel, channel id: %s', channel_id)
    out_path = os.path.join(config.CHANNELS_HISTORY_DIR, f"{channel_id}.json")
    messages = None
    try:
        messages = config.get_json(out_path)
    except Exception as ex:
        logging.error('analyze_channel: %s, error: %s',
                      ex, traceback.format_exc())
        # History file unreadable: flag the channel so it gets re-fetched.
        with classes.SQLite(config.DB_STATS_PATH,
                            'analyze_channel_error:', lock) as cur:
            # Parameterized query; the original interpolated channel_id
            # directly into the SQL string.
            cur.execute(
                "UPDATE Channel SET HistoryLoaded = 0 WHERE Id=?",
                (channel_id,))
        return
    if messages is None or len(messages) < 1:
        logging.info('analyze_channel: no data from %s', out_path)
        return
    # Fixed misspelled local name (`ordered_messges`) from the original.
    ordered_messages = sorted(messages, key=lambda x: x["id"], reverse=False)
    min_channel_date = helper.str_to_utc_datetime(ordered_messages[0]["date"])
    max_channel_date = helper.str_to_utc_datetime(
        ordered_messages[-1]["date"])
    for symbol in signal_parser.symbols_regex_map:
        min_date = db_poll.db_time_ranges[symbol][0]
        max_date = db_poll.db_time_ranges[symbol][1]
        # Clamp the window to the span the channel actually covers.
        if min_channel_date > min_date:
            min_date = min_channel_date
        if max_channel_date < max_date:
            max_date = max_channel_date
        # Truncate both ends down to whole minutes.
        min_date_rounded_minutes = min_date - datetime.timedelta(
            seconds=min_date.second, microseconds=min_date.microsecond)
        max_date_rounded_minutes = max_date - datetime.timedelta(
            seconds=max_date.second, microseconds=max_date.microsecond)
        # Wait for a free worker slot unless shutdown was requested.
        while not WAIT_EVENT_OUTER.is_set():
            if is_theads_busy():
                WAIT_EVENT_INNER.wait(STATS_ANALYZE_LOOP_GAP_SEC)
            else:
                logging.info(
                    'analyze_channel: id: %s, symbol: %s, start: %s, end: %s',
                    channel_id, symbol, min_date_rounded_minutes,
                    max_date_rounded_minutes)
                process_channel_typle = (
                    ordered_messages, symbol, min_date_rounded_minutes,
                    max_date_rounded_minutes, channel_id)
                atomic_increment()
                pool.apply_async(signal_parser.analyze_channel_symbol,
                                 process_channel_typle, callback=write_db)
                break
        if WAIT_EVENT_OUTER.is_set():
            return
from telethon.tl.functions.messages import ImportChatInviteRequest, CheckChatInviteRequest, GetDialogsRequest, EditMessageRequest from telethon.tl.functions.channels import GetMessagesRequest, JoinChannelRequest, LeaveChannelRequest import config import classes import datetime import os import pytz import requests import json import youtube_dl utc = pytz.UTC COLLECTOR_CONFIG = "collector_config.json" load_dotenv() config_collector = config.get_json(COLLECTOR_CONFIG) SESSION = 'secure_session_history_collector.session' ON_ERROR_SLEEP_SEC = 60 STEP = 1 FILE_DB = "collector_db.json" ISO_DATE_FORMAT = r"%Y-%m-%dT%H:%M:%S.%fZ" def load_cfg(): global config_collector config_collector = config.get_json(COLLECTOR_CONFIG) global last_id last_id = config_collector["last_id"] global main_url_part main_url_part = config_collector["main_url_part"] global view_url_part
def load(username):
    """Look up *username* in users.json and wrap its record in a User.

    Raises:
        ValueError: when the username has no entry in users.json.
    """
    all_users = config.get_json('users.json')
    if username in all_users:
        return User(all_users[username])
    raise ValueError('Can\'t find user "{}"'.format(username))