import datetime
import os
import regex
from concurrent.futures import as_completed

# read_file, time_stamp, echo, changeCookie and the *_dir constants are
# repo-level helpers assumed to be defined/imported elsewhere.


def clean_csv(av_id: int):
    ''' clean one raw history csv and write the `_new` version '''
    csv_path = os.path.join(history_dir, '{}.csv'.format(av_id))
    output_path = os.path.join(history_data_dir, '{}_new.csv'.format(av_id))
    csv = read_file(csv_path)
    last_time, last_view = csv[0].split(',')[:2]
    result = [csv[0]]
    last_time = time_stamp(last_time)
    last_view = int(last_view)
    empty_line = ','.join([' '] * (len(csv[0].split(',')) + 1))
    for line in csv[1:]:
        now_time, now_view = line.split(',')[:2]
        now_time = time_stamp(now_time)
        now_view = int(now_view)
        time_gap = now_time - last_time
        # drop rows where the view count goes backwards or jumps implausibly
        if now_view < last_view or now_view - last_view > 5000:
            continue
        # pad long gaps with empty rows, one per missing two-minute sample
        if abs(time_gap) > 150:
            for _ in range(int((time_gap - 30) // 120)):
                result.append(empty_line)
        if abs(time_gap) > 90:
            result.append(line)
            last_view, last_time = now_view, now_time
    with open(output_path, 'w') as f:
        f.write('\n'.join(result))
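# Hedged sketch, not part of the original module: clean_csv relies on the
# repo-level time_stamp() helper. One plausible implementation, assuming the
# csv stores '%Y-%m-%d %H:%M:%S' strings (the format is an assumption here):
def time_stamp_sketch(time_str: str) -> int:
    import time
    return int(time.mktime(time.strptime(time_str, '%Y-%m-%d %H:%M:%S')))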
def getTitleMap(self):
    ''' build slug<->title maps from the exported slug/title files '''
    slug = read_file('{}slug'.format(data_dir))
    title = read_file('{}title'.format(data_dir))
    self.title_map = {
        tempslug.split('"')[1]: title[num].split('"')[1]
        for num, tempslug in enumerate(slug)
    }
    title2slug = {
        title: slug for slug, title in self.title_map.items()
    }
    noemoji_title = {
        self.filter_emoji(title).replace('\u200d', ''): slug
        for slug, title in self.title_map.items()
    }
    self.title2slug = {**noemoji_title, **title2slug}
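# Hedged sketch: self.filter_emoji is used above but defined elsewhere in the
# repo. A minimal stand-in (the exact code-point ranges are an assumption)
# strips common emoji so emoji-bearing titles can still be matched:
import re

EMOJI_RE_SKETCH = re.compile('[\U0001F300-\U0001FAFF\u2600-\u27BF\uFE0F]')

def filter_emoji_sketch(text: str) -> str:
    return EMOJI_RE_SKETCH.sub('', text)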
def getZhihuView(self):
    ''' fetch zhihu article read counts and map them back to post slugs '''
    cookie = ''.join(read_file('{}cookie'.format(data_dir)))
    changeCookie(cookie)
    url = ''.join([
        self.ZHIHU_URL,
        'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
        datetime.datetime.now().strftime('%Y-%m-%d'),
        '&page_no='
    ])

    def parse_page(page_json: dict):
        ''' record id & read count for every recognizable article on a page '''
        for article in page_json['data']:
            zhihu_title = article['title']
            zhihu_id = int(article['url_token'])
            zhihu_count = int(article['read_count'])
            if zhihu_title in self.title2slug:
                temp_slug = self.title2slug[zhihu_title]
            elif zhihu_id in self.zhihu_id_map:
                temp_slug = self.zhihu_id_map[zhihu_id]
            else:
                echo('0|debug', zhihu_title)
                continue
            self.zhihu_id[temp_slug] = zhihu_id
            self.zhihu_views[temp_slug] = zhihu_count

    json = self.get_request('{}{}'.format(url, 1), 1, lambda i: not i)
    if not json:
        return
    if 'data' not in json:
        if 'code' in json:
            echo('0|warning', json)
            return
        echo(3, 'zhihu', json)
        return
    parse_page(json)
    # note: floor division skips any trailing partial page (count % 10 items)
    for page_no in range(2, json['count'] // 10 + 1):
        echo(1, 'zhihu', page_no)
        page = self.get_request('{}{}'.format(url, page_no), 1, lambda i: not i)
        echo(2, 'zhihu', page)
        if not page or 'data' not in page:
            continue
        parse_page(page)
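# Pagination note: the endpoint reports a total `count` with 10 articles per
# page, and the loop above walks pages 2..count // 10. A standalone sketch of
# the page math that also covers a trailing partial page (page size assumed):
import math

def zhihu_pages_sketch(total_count: int, page_size: int = 10) -> range:
    return range(2, math.ceil(total_count / page_size) + 1)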
def create_table(self, sql_path: str):
    """ create a table from the DDL stored at sql_path """
    if not os.path.exists(sql_path):
        echo(0, "Create Table {} error, file not found".format(sql_path))
        return False
    create_table_sql = "\n".join(read_file(sql_path))
    try:
        cursor = self.db.cursor()
        cursor.execute(create_table_sql)
        echo(2, "Create Table from {} Success!!!".format(sql_path))
        return True
    except Exception as e:
        echo(0, "Create Table from {} error".format(sql_path), e)
        return False
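# A self-contained sketch of the same pattern against sqlite3, for
# illustration only (the method above runs on whatever self.db is bound to;
# sqlite3 and the single-statement DDL file are assumptions here):
import sqlite3

def create_table_sketch(db_path: str, sql_path: str) -> bool:
    with open(sql_path) as f:
        ddl = f.read()
    conn = sqlite3.connect(db_path)
    try:
        conn.cursor().execute(ddl)
        conn.commit()
        return True
    except sqlite3.Error:
        return False
    finally:
        conn.close()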
def loadLocalView(self):
    ''' load page views exported from google analytics '''
    lines = read_file('{}google'.format(data_dir))[7:]  # skip the leading report header rows
    for line in lines:
        arr = line.split(',')
        slug = self.matchSlug(arr[0])
        if slug is None or slug not in self.title_map:
            continue
        print('{} {} {}'.format(slug, arr[1], arr[0]))
        if slug in self.local_views:
            self.local_views[slug] += int(arr[1])
        else:
            self.local_views[slug] = int(arr[1])
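# The if/else accumulation above is the classic counting pattern; an
# equivalent compact form, shown standalone for illustration:
from collections import defaultdict

def count_views_sketch(rows) -> dict:
    views = defaultdict(int)
    for title, count in rows:
        views[title] += int(count)
    return dict(views)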
def load_history_file(self, av_id: int, av_info: dict):
    ''' load one aid's cleaned history into the global history map '''
    data_path = '{}{}_new.csv'.format(history_data_dir, av_id)
    history_list = read_file(data_path)[:2880]  # at most 2880 samples (4 days of 2-min ticks)
    if not history_list:
        return
    created, title = av_info['created'], av_info['title']
    history_list = [ii.split(',') for ii in history_list]
    # bucket each row onto an even index of two-minute slots since creation
    time_map = {
        round((time_stamp(ii[0]) - created) / 120) * 2: ii
        for ii in history_list if ii[0] != ''
    }
    last_data = [0] * 8
    for ii in self.history_map.keys():
        if ii in time_map:
            self.history_map[ii][av_id] = time_map[ii]
            # carry shorter rows forward, padded with the previous sample's tail
            last_data = time_map[ii] + last_data[len(time_map[ii]):]
        else:
            self.history_map[ii][av_id] = last_data
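# The bucketing above maps each sample onto an even two-minute slot index:
# round((t - created) / 120) * 2. A tiny self-checking demo with made-up
# timestamps (values are illustrative only):
def bucket_demo():
    created = 1_600_000_000
    sample = created + 7 * 60                 # 7 minutes after creation
    bucket = round((sample - created) / 120) * 2
    assert bucket == 8                        # 3.5 slots rounds to 4 -> index 8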
def load_history_file(self, bv_id: str, bv_info: dict):
    """ load one bvid's cleaned history into the global history map """
    data_path = "{}{}_new.csv".format(history_data_dir, bv_id)
    history_list = read_file(data_path)[:3660]  # at most 3660 samples (~5 days of 2-min ticks)
    if not history_list:
        return
    created, title = bv_info["created"], bv_info["title"]
    history_list = [ii.split(",") for ii in history_list]
    # bucket each row onto an even index of two-minute slots since creation
    time_map = {
        round((time_stamp(ii[0]) - created) / 120) * 2: ii
        for ii in history_list if ii[0] != ""
    }
    last_data = [0] * 8
    for ii in self.history_map.keys():
        if ii in time_map:
            self.history_map[ii][bv_id] = time_map[ii]
            # carry shorter rows forward, padded with the previous sample's tail
            last_data = time_map[ii] + last_data[len(time_map[ii]):]
        else:
            self.history_map[ii][bv_id] = last_data
def load_article_local(self, file_path: str):
    ''' extract tpwd codes from a local article and decode them, retrying up to 5 rounds '''
    if file_path not in self.tpwds:
        tt = '||||'.join(read_file(file_path))
        tpwds = regex.findall(self.TPWD_REG, tt)
        self.tpwds[file_path] = tpwds
    else:
        tpwds = self.tpwds[file_path]
    if file_path not in self.tpwd_map:
        self.tpwd_map[file_path] = {}
    retry = 0
    while len(self.tpwd_map[file_path]) < len(tpwds) and retry < 5:
        thread_list = [ii for ii in tpwds if ii not in self.tpwd_map[file_path]]
        echo(1, file_path, 'tpwds len:', len(tpwds), 'need load', len(thread_list))
        thread_list = [
            self.tpwd_exec.submit(self.decoder_tpwd_once, file_path, ii, 1)
            for ii in thread_list
        ]
        list(as_completed(thread_list))
        retry += 1
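# The retry loop above fans decoding out through a thread pool and blocks on
# as_completed. The same pattern in isolation, with a hypothetical worker
# callable standing in for decoder_tpwd_once:
from concurrent.futures import ThreadPoolExecutor, as_completed

def decode_all_sketch(items: list, worker, max_rounds: int = 5) -> dict:
    results = {}
    with ThreadPoolExecutor(max_workers=8) as executor:
        rounds = 0
        while len(results) < len(items) and rounds < max_rounds:
            pending = [ii for ii in items if ii not in results]
            futures = {executor.submit(worker, ii): ii for ii in pending}
            for future in as_completed(futures):
                value = future.result()
                if value is not None:
                    results[futures[future]] = value
            rounds += 1
    return results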
def gatherproxy(self, types: int):
    """ load proxies downloaded from gatherproxy
        first download the proxy ip txt from:
        http://www.gatherproxy.com/zh/proxylist/country/?c=China
        types: 0 -> http only, 1 -> https only, 2 -> both, other -> raw lines
    """
    if not os.path.exists("{}gatherproxy".format(data_dir)):
        echo("0|warning", "Gather file not exist!!!")
        return
    file_d = read_file("{}gatherproxy".format(data_dir))
    waitjudge_http = ["http://" + ii for ii in file_d]
    waitjudge_https = ["https://" + ii for ii in file_d]
    if not types:
        self.waitjudge += waitjudge_http
    elif types == 1:
        self.waitjudge += waitjudge_https
    elif types == 2:
        self.waitjudge += waitjudge_http + waitjudge_https
    else:
        self.waitjudge += file_d
    echo("2|warning", "load gather over!")
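# The branch table above in isolation, as a plain function (names here are
# illustrative assumptions, not the repo's API):
def build_judge_list(proxies: list, types: int) -> list:
    http_list = ["http://" + ii for ii in proxies]
    https_list = ["https://" + ii for ii in proxies]
    if not types:
        return http_list
    if types == 1:
        return https_list
    if types == 2:
        return http_list + https_list
    return list(proxies)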