Example #1
def load_cfg():
    global config_collector, last_id, main_url_part, view_url_part, \
        delay_sec, length, chrome_headers, url_tail, send_ids, result_list

    config_collector = config.get_json(COLLECTOR_CONFIG)
    last_id = config_collector["last_id"]
    main_url_part = config_collector["main_url_part"]
    view_url_part = config_collector["view_url_part"]
    delay_sec = config_collector["delay_sec"]
    length = config_collector["length"]
    chrome_headers = {
        "User-Agent": config_collector["user_agent"],
        "accept": config_collector["accept"]
    }
    url_tail = config_collector["url_tail"]
    send_ids = config_collector["send_ids"]

    if os.path.isfile(FILE_DB):
        result_list = config.get_json(FILE_DB)
    else:
        result_list = list()
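
Every example on this page leans on a small config module; a minimal sketch of what get_json and set_json presumably wrap (an assumption: the real module may add error handling):

import json

# Assumed implementation of the config helpers used throughout this page.
def get_json(path):
    """Read a JSON file and return the parsed object."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)

def set_json(path, data):
    """Serialize data to a JSON file."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)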
Example #2
def convert_history_json(file_in, timezone_local):
    history = config.get_json(file_in)  # renamed: "json" would shadow the json module
    json_array = history.get("messages")
    channel_id = history.get("id")
    if json_array is None or channel_id is None:
        print("Cannot parse Telegram history json")
        return

    messages_list = list()
    for message in json_array:
        msg_props = dict()
        msg_props["id"] = message["id"]
        msg_props["date"] = helper.str_to_utc_datetime(
            message["date"], timezone_local, config.ISO_LOCAL_DATE_FORMAT).isoformat()

        message_item = message.get("text")
        if message_item is not None:
            msg_props["text"] = str(message_item)

        reply_to_message_id = message.get("reply_to_message_id")
        if reply_to_message_id is not None:
            msg_props["reply_to_msg_id"] = reply_to_message_id

        messages_list.append(msg_props)

    out_path = os.path.join(
        config.CHANNELS_HISTORY_DIR, f"{channel_id}.json")
    config.set_json(out_path, messages_list)
    print(f"Saved as {out_path}")
Example #3
def import_json(input_file):
    sql_connection = sqlite3.connect(config.DB_STATS_PATH)
    cur = sql_connection.cursor()

    json_array = config.get_json(input_file)

    for link_item in json_array:
        id_ = link_item["id"]

        # some source files carry a stray trailing colon in the key name,
        # so fall back to that variant before giving up
        add_date_utc = link_item.get("add_date_utc")
        if add_date_utc is None:
            add_date_utc = link_item.get("add_date_utc:")

        change_date_utc = link_item.get("change_date_utc")
        if change_date_utc is None:
            change_date_utc = link_item.get("change_date_utc:")

        access_url = link_item["access_url"]
        name = link_item["name"]
        # parameterized query: no manual quote escaping, and missing
        # dates become real SQL NULLs instead of the string 'NULL'
        cur.execute(
            "INSERT INTO Channel VALUES (?,?,?,?,?)",
            (id_, name, access_url, add_date_utc, change_date_utc))

    sql_connection.commit()
    sql_connection.close()
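
The positional INSERT implies a five-column Channel table; a hypothetical schema matching the value order (the column names are guesses, not from the source):

# Hypothetical schema for the Channel table; assumes an open sqlite3 cursor `cur`.
cur.execute(
    "CREATE TABLE IF NOT EXISTS Channel ("
    "Id INTEGER PRIMARY KEY, Name TEXT, AccessUrl TEXT, "
    "AddDateUtc TEXT, ChangeDateUtc TEXT)")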
Example #4
def insertion_delete_performance(self, file_name, safe_mode):
    '''
    use pymongo to calculate the insertion time
    '''
    try:
        json_array = get_json(file_name)
        collection = conn.repost_detail
        time_serial = []
        indx = 0
        start_time = time.time()
        for row in json_array[:1000000]:
            indx += 1
            collection.insert(
                {
                    '_id': MAX_ID + indx,
                    'city': "",
                    'created_at': datetime.now(),
                    'favourites_count': 10,
                    'statuses_count': 10,
                    'friends_count': row.get("friends_count", ""),
                    'gender': "m",
                    'location': "beijing",
                    'sm_user_id': 1,
                    'profile_image_url': "",
                    'province': "need to delete",
                    'retweet_status_created_at': datetime.now(),
                    'user_id': 1,
                    'retweet_status_source': "",
                    'screen_name': "",
                    'post_id': 1,
                    'verified': False,
                    'sm_flash_factor': 1,
                    'sm_user_followers_count': 1,
                    'retweet_status_id': 1,
                    'sm_user_screen_name': "",
                    'source': "",
                    'followers_count': row.get("followers_count", ""),
                    'text': row.get("text", ""),
                    'user_created_at': datetime.now(),
                    'retweet_status_text': row.get("retweet_status_text", "")
                },
                safe=safe_mode
            )
            if indx % 10000 == 0:
                time_serial.append(time.time() - start_time)
                print(time.time() - start_time)
        return time_serial
    except IOError:
        print('file io error')
        return
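
collection.insert(..., safe=...) is the legacy pymongo 2.x API; on pymongo 3 and later the equivalent is insert_one with a write concern set on the collection (a sketch with placeholder names, not from the source):

from pymongo import MongoClient, WriteConcern

client = MongoClient()  # placeholder connection
# w=1 waits for acknowledgement (old safe=True); w=0 does not (safe=False)
collection = client.test_db.get_collection(
    "repost_detail", write_concern=WriteConcern(w=1))
collection.insert_one({"_id": 1, "text": "example"})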
Example #5
def insertion_delete_performance(self, file_name, safe_mode):
    try:
        json_array = get_json(file_name)
        indx = 0
        time_serial = []
        start_time = time.time()  # record the start time
        for row in json_array[:1000000]:
            indx += 1
            repost = Repost(
                _id=MAX_ID + indx,
                city="",
                created_at=date.today(),
                favourites_count=10,
                statuses_count=10,
                friends_count=row.get("friends_count", ""),
                gender="m",
                location="beijing",
                sm_user_id=1,
                profile_image_url="",
                province="need to delete",
                retweet_status_created_at=date.today(),
                user_id=1,
                retweet_status_source="",
                screen_name="",
                retweet_status_text=row.get("retweet_status_text", ""),
                post_id=10,
                verified=False,
                sm_flash_factor=1,
                sm_user_followers_count=1,
                retweet_status_id=1,
                sm_user_screen_name="",
                source="",
                followers_count=row.get("followers_count", ""),
                text=row.get("text", ""),
                user_created_at=date.today()
            )
            repost.save(safe=safe_mode, force_insert=True)
            if indx % 10000 == 0:
                time_serial.append(time.time() - start_time)
                print(time.time() - start_time)
        return time_serial
    except IOError:
        print("open file failed")
        return
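
Both benchmark variants return cumulative elapsed seconds sampled every 10,000 inserts; per-batch throughput falls out of consecutive samples (a small illustrative helper, not from the source):

def batch_throughput(time_serial, batch_size=10000):
    """Turn cumulative timings into inserts per second for each batch."""
    rates = []
    prev = 0.0
    for elapsed in time_serial:
        rates.append(batch_size / (elapsed - prev))
        prev = elapsed
    return rates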
Example #6
def update_all_ids(initial_id):
    try:
        json_array = get_json()
        for row in json_array:
            initial_id += 1
            collection.insert({
                '_id': initial_id,
                'city': "", 'created_at': datetime.now(),
                'favourites_count': 10, 'statuses_count': 10,
                'friends_count': row["friends_count"], 'gender': "m",
                'location': "beijing", 'sm_user_id': 1,
                'profile_image_url': "", 'province': "20",
                'retweet_status_created_at': datetime.now(), 'user_id': 1,
                'retweet_status_source': "", 'screen_name': "",
                'post_id': 1, 'verified': False,
                'sm_flash_factor': 1, 'sm_user_followers_count': 1,
                'retweet_status_id': 1, 'sm_user_screen_name': "",
                'source': "", 'followers_count': row["followers_count"],
                'text': row["text"], 'user_created_at': datetime.now(),
                'retweet_status_text': row.get("retweet_status_text", "")})
    except IOError:
        print('db error')
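
update_all_ids depends on a module-level collection and get_json; a hypothetical call that continues numbering after the current maximum _id:

# Hypothetical usage; assumes `collection` is a pymongo collection.
last = collection.find_one(sort=[("_id", -1)])
update_all_ids(last["_id"] if last else 0)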
Example #7
def analyze_channel(channel_id):
    logging.info('analyze_channel, channel id: %s', channel_id)
    out_path = os.path.join(config.CHANNELS_HISTORY_DIR, f"{channel_id}.json")
    messages = None
    try:
        messages = config.get_json(out_path)
    except Exception as ex:
        logging.error('analyze_channel: %s, error: %s', ex,
                      traceback.format_exc())
        with classes.SQLite(config.DB_STATS_PATH, 'analyze_channel_error:',
                            lock) as cur:
            update_string = f"UPDATE Channel SET HistoryLoaded = 0 WHERE Id={channel_id}"
            cur.execute(update_string)
        return

    if messages is None or len(messages) < 1:
        logging.info('analyze_channel: no data from %s', out_path)
        return

    ordered_messages = sorted(messages, key=lambda x: x["id"])

    min_channel_date = helper.str_to_utc_datetime(ordered_messages[0]["date"])
    max_channel_date = helper.str_to_utc_datetime(ordered_messages[-1]["date"])

    for symbol in signal_parser.symbols_regex_map:
        min_date = db_poll.db_time_ranges[symbol][0]
        max_date = db_poll.db_time_ranges[symbol][1]

        if min_channel_date > min_date:
            min_date = min_channel_date

        if max_channel_date < max_date:
            max_date = max_channel_date

        min_date_rounded_minutes = min_date - datetime.timedelta(
            seconds=min_date.second, microseconds=min_date.microsecond)

        max_date_rounded_minutes = max_date - datetime.timedelta(
            seconds=max_date.second, microseconds=max_date.microsecond)

        while not WAIT_EVENT_OUTER.is_set():

            if is_theads_busy():
                WAIT_EVENT_INNER.wait(STATS_ANALYZE_LOOP_GAP_SEC)
            else:
                logging.info(
                    'analyze_channel: id: %s, symbol: %s, start: %s, end: %s',
                    channel_id, symbol, min_date_rounded_minutes,
                    max_date_rounded_minutes)
                process_channel_tuple = (ordered_messages, symbol,
                                         min_date_rounded_minutes,
                                         max_date_rounded_minutes, channel_id)
                atomic_increment()
                pool.apply_async(signal_parser.analyze_channel_symbol,
                                 process_channel_tuple,
                                 callback=write_db)
                break

        if WAIT_EVENT_OUTER.is_set():
            return
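
atomic_increment, is_theads_busy, and write_db are defined elsewhere in this module; a plausible sketch of the counter pair, assuming a shared threading.Lock guards an in-flight task count:

import threading

tasks_lock = threading.Lock()   # assumed shared lock
tasks_in_flight = 0

def atomic_increment():
    """Bump the in-flight task counter under the lock."""
    global tasks_in_flight
    with tasks_lock:
        tasks_in_flight += 1

def is_theads_busy(limit=4):
    """True while `limit` or more tasks are in flight (the limit is a guess)."""
    with tasks_lock:
        return tasks_in_flight >= limit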
Example #8
from telethon.tl.functions.messages import ImportChatInviteRequest, CheckChatInviteRequest, GetDialogsRequest, EditMessageRequest
from telethon.tl.functions.channels import GetMessagesRequest, JoinChannelRequest, LeaveChannelRequest
import config
import classes
import datetime
import os
import pytz
import requests
import json
import youtube_dl
from dotenv import load_dotenv  # needed for the load_dotenv() call below

utc = pytz.UTC

COLLECTOR_CONFIG = "collector_config.json"
load_dotenv()
config_collector = config.get_json(COLLECTOR_CONFIG)
SESSION = 'secure_session_history_collector.session'
ON_ERROR_SLEEP_SEC = 60
STEP = 1
FILE_DB = "collector_db.json"
ISO_DATE_FORMAT = r"%Y-%m-%dT%H:%M:%S.%fZ"


def load_cfg():
    global config_collector
    config_collector = config.get_json(COLLECTOR_CONFIG)
    global last_id
    last_id = config_collector["last_id"]
    global main_url_part
    main_url_part = config_collector["main_url_part"]
    global view_url_part
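
The keys read by load_cfg (see Example #1) imply a collector_config.json shaped roughly like this; the values are placeholders, not from the source:

import json

# Only the key names are grounded in load_cfg; every value is a placeholder.
sample_config = {
    "last_id": 0,
    "main_url_part": "https://example.com/",
    "view_url_part": "view/",
    "delay_sec": 5,
    "length": 10,
    "user_agent": "Mozilla/5.0",
    "accept": "text/html",
    "url_tail": "",
    "send_ids": [],
}
print(json.dumps(sample_config, indent=2))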
Example #9
def load(username):
    conf = config.get_json('users.json')
    if username not in conf:
        raise ValueError(f'Can\'t find user "{username}"')

    return User(conf[username])
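
A hypothetical call; the User class and the contents of users.json are not shown on this page:

# "alice" is a placeholder username.
try:
    user = load("alice")
except ValueError as err:
    print(err)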