def run() -> None:
    """Crawl btc123 flash news for every configured keyword and publish new entries.

    For each task returned by ``getTasks.getTasks().btc123()`` the first page
    of the btc123 flash-news search API is requested through a proxy address
    obtained from ``conn.randomOneIp('proxy:new_ip_list')``.  Every entry of
    the response is mapped to the item layout built below.  Entries whose
    ``post_title`` is not yet found in ``db.btc123`` are printed, handed to
    ``getTasks.post_data`` and inserted into ``db.btc123``.

    A request is attempted up to five times with a fresh proxy each time.
    When every attempt fails the keyword is skipped.  The previous code fell
    through instead and either raised ``NameError`` (first keyword) or
    re-processed the previous keyword's response under the wrong keyword.

    Raises:
        Whatever ``json.loads`` or the key lookups raise when the response
        body does not have the expected shape; these are not handled here.
    """
    keys = getTasks.getTasks().btc123()
    db = getTasks.getTasks().getMongo()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        "Connection": "close",
    }
    conn = connRedis.OPRedis()
    max_attempts = 5

    for task in keys:
        keyword = str(task['key'])
        url = 'https://apioperate.btc123.com/api/content/selectPageFlashNews?pageNumber=1&title={}&sourceId=1'.format(
            keyword)

        # Reset per keyword so a failed fetch can never reuse an older response.
        response = None
        for _ in range(max_attempts):
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies={'https': conn.randomOneIp('proxy:new_ip_list')},
                    timeout=3)
                break
            except Exception:
                # Best-effort retry on any failure (request or proxy lookup),
                # but let KeyboardInterrupt / SystemExit propagate.
                print(url + "请求失败")
        if response is None:
            continue

        data = json.loads(response.text)
        # Identical for every entry of this keyword, so build it once.
        search_url = 'https://www.btc123.com/search?type=flash&keyword={}'.format(
            keyword)

        for entry in data['data']['list']:
            # A fresh dict per entry: nothing leaks from one entry to the next.
            item = {
                'post_title': entry['title'],
                'created_at': entry['createTime'],
                'read_count': 0,
                'original_url': search_url,
                'page_url': search_url,
                'source_host': entry['source'],
                'screen_name': '匿名',
                'text': entry['content'],
                'time': int(time.time()),
                'floor': int(entry['id']),
                'column': task['key'],
                'platform': '区块链快讯',
                'column1': task['column1'],
                'originalPlatformId': task['originalPlatformId'],
                'keywordId': task['keywordId'],
                'reptileType': task['reptileType'],
                'contentType': task['contentType'],
            }

            # De-duplicate on title: only unseen titles are published and stored.
            if db.btc123.find_one({'post_title': item['post_title']}) is not None:
                continue

            print(item)
            getTasks.post_data(item)
            # NOTE(review): `insert` is the legacy PyMongo call; confirm the
            # installed PyMongo version still provides it before upgrading.
            db.btc123.insert(deepcopy(item))

    print('end')
def run():
    """Crawl pages 1-4 of the hecaijing express feed and publish new entries.

    Each page of ``/express/loadmore`` is requested once (no retry) through a
    proxy address obtained from ``conn.randomOneIp('proxy:new_ip_list')``.
    The entries under ``data[0]['buttom']`` of the response are mapped to the
    item layout below, using the task metadata returned by
    ``getTasks.getTasks().heCaijing()``.  Entries whose ``post_title`` is not
    yet found in ``db.hecaijing`` are printed, handed to
    ``getTasks.post_data`` and inserted into ``db.hecaijing``.

    Any request, JSON or lookup error propagates to the caller.
    """
    task_meta = getTasks.getTasks().heCaijing()
    db = getTasks.getTasks().getMongo()
    request_headers = {
        'Cookie': 'PHPSESSID=dq7c7te4bmvco8ddmj4kt171p7; _ga=GA1.2.1003817106.1566180235; _gid=GA1.2.2147195660.1566180235; Hm_lvt_b94ff1ee8863337601c8a7baf17d031c=1566180235; Hm_lpvt_b94ff1ee8863337601c8a7baf17d031c=1566209238; _gat_gtag_UA_122528065_1=1',
        'Host': 'www.hecaijing.com',
        'Referer': 'https://www.hecaijing.com/kuaixun/',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    conn = connRedis.OPRedis()
    # One dict is reused for every entry; each pass overwrites all of its keys.
    item = {}
    feed_page = 'https://www.hecaijing.com/kuaixun/'

    for page_no in range(1, 5):
        page_url = 'https://www.hecaijing.com/express/loadmore?coin=&pn={}'.format(
            page_no)
        proxy = conn.randomOneIp('proxy:new_ip_list')
        response = requests.get(
            page_url, headers=request_headers, proxies={'https': proxy}, timeout=3)
        payload = json.loads(response.text)

        for entry in payload['data'][0]['buttom']:
            item.update({
                'post_title': entry['title'],
                'created_at': entry['update_time'],
                'read_count': 0,
                'original_url': feed_page,
                'page_url': feed_page,
                'source_host': "",
                'screen_name': entry['publish_adminuser'],
                'text': entry['main'],
                'time': int(time.time()),
                'floor': int(entry['id']),
                'column': '火币',
                'platform': task_meta['platform'],
                'column1': task_meta['column1'],
                'originalPlatformId': task_meta['originalPlatformId'],
                'keywordId': 12235,
                'reptileType': task_meta['reptileType'],
                'contentType': task_meta['contentType'],
            })

            # Titles that are already stored are neither re-posted nor re-inserted.
            stored = db.hecaijing.find_one({'post_title': item['post_title']})
            if stored is not None:
                continue

            print(item)
            getTasks.post_data(item)
            # A copy is inserted so the reused dict is not handed to the driver.
            db.hecaijing.insert(deepcopy(item))

    print('end')
def run():
    """Fetch the BitKan brief-news feed once and publish new entries.

    A single request (no retry) is sent through a proxy address obtained from
    ``conn.randomOneIp('proxy:new_ip_list')``.  Every element of the
    response's ``briefs`` list is mapped to the item layout below; only
    ``reptileType`` and ``contentType`` come from the task metadata returned
    by ``getTasks.getTasks().bitKan()``.  Entries whose ``post_title`` is not
    yet found in ``db.bitkan`` are printed, handed to ``getTasks.post_data``
    and inserted into ``db.bitkan``.

    Any request, JSON or lookup error propagates to the caller.
    """
    task_meta = getTasks.getTasks().bitKan()
    db = getTasks.getTasks().getMongo()
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    conn = connRedis.OPRedis()
    # One dict is reused for every entry; each pass overwrites all of its keys.
    item = {}

    feed_url = 'https://bitkan.com/api/news/weibo/705014?locale=zh'
    proxy = conn.randomOneIp('proxy:new_ip_list')
    response = requests.get(
        feed_url, headers=request_headers, proxies={'https': proxy}, timeout=3)
    payload = json.loads(response.text)

    news_page = 'https://bitkan.com/zh/news'
    for brief in payload['briefs']:
        item.update({
            'post_title': brief['title'],
            'created_at': brief['updated_at'],
            'read_count': 0,
            'original_url': news_page,
            'page_url': news_page,
            'source_host': "",
            'screen_name': brief['name'],
            'text': brief['content']['text'],
            'time': int(time.time()),
            'floor': int(brief['id']),
            'column': '火币',
            'platform': '比特币快讯',
            'column1': '比特币快讯',
            'originalPlatformId': 188,
            'keywordId': 12235,
            'reptileType': task_meta['reptileType'],
            'contentType': task_meta['contentType'],
        })

        # Titles that are already stored are neither re-posted nor re-inserted.
        stored = db.bitkan.find_one({'post_title': item['post_title']})
        if stored is not None:
            continue

        print(item)
        getTasks.post_data(item)
        # A copy is inserted so the reused dict is not handed to the driver.
        db.bitkan.insert(deepcopy(item))

    print('end')
def run() -> None:
    """Crawl huoxing24 search results for every configured keyword and publish new entries.

    For each task returned by ``getTasks.getTasks().huoXing()`` the first page
    of the huoxing24 multisearch API is requested through a proxy address
    obtained from ``conn.randomOneIp('proxy:new_ip_list')``.  When the
    response's ``code`` equals 1, every element of ``obj.inforList`` is mapped
    to the item layout below.  Entries whose ``post_title`` is not yet found
    in ``db.huoxing`` are printed, handed to ``getTasks.post_data`` and
    inserted into ``db.huoxing``.

    Fixes over the previous version:

    * A keyword whose request fails on all five attempts is skipped.  Before,
      the code fell through and either raised ``NameError`` (first keyword) or
      re-processed the previous keyword's response under the wrong keyword.
    * An entry whose content has no ``【...】`` title is skipped instead of
      aborting the whole crawl with ``IndexError``.
    * The title regex is compiled once rather than once per entry.

    Raises:
        Whatever ``json.loads`` or the key lookups raise when the response
        body does not have the expected shape; these are not handled here.
    """
    keys = getTasks.getTasks().huoXing()
    db = getTasks.getTasks().getMongo()
    headers = {
        "Connection": "close",
        'accept': 'application/json, text/plain, */*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'UM_distinctid=16c99cbf4b9d1-08963401d03938-7373e61-100200-16c99cbf4ba0; _ga=GA1.2.955456149.1565948376; Hm_lvt_d70f8822d1ff168453d5ea7b3e359297=1567396038,1567646981,1569203292; CNZZDATA1272858809=353535772-1566174481-https%253A%252F%252Fflash.huoxing24.com%252F%7C1569198357; _gid=GA1.2.620194914.1569203293; _gat_gtag_UA_121795392_1=1; USD=6.833898; rightAdImgCloseTime=2019-09-23; Hm_lpvt_d70f8822d1ff168453d5ea7b3e359297=1569203315; SERVERID=29dcb2c2e0682adea06ad95c2d4fe0cc|1569203446|1569203415',
        'referer': 'https://www.huoxing24.com/search/%E7%81%AB%E5%B8%81',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sign-param': 'eyJwbGF0Zm9ybSI6InBjIiwibm9uY2UiOiJQR1NObk8iLCJ0aW1lc3RhbXAiOjE1NjYyMDUxNjUyMDYsInNpZyI6IjhhODg5MDdiMmFmYjhiNGM4ODVjMTc4MmY2NjNkZjUxIn0=',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    }
    conn = connRedis.OPRedis()
    max_attempts = 5
    # Non-greedy match of the text between the first pair of 【 】 brackets.
    title_pattern = re.compile(r'[【](.*?)[】]', re.S)

    for task in keys:
        keyword = str(task['key'])
        # NOTE: the keyword is placed into the URL unquoted; an earlier
        # revision considered GBK-quoting it but left that disabled.
        url = 'https://www.huoxing24.com/info/news/multisearch?page=1&pageSize=18&type=2&q={}&deviceSource=web'.format(
            keyword)

        # Reset per keyword so a failed fetch can never reuse an older response.
        response = None
        for _ in range(max_attempts):
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies={'https': conn.randomOneIp('proxy:new_ip_list')},
                    timeout=3)
                # Kept inside the try, as before: a failing debug print only
                # triggers another attempt and never aborts the crawl.
                print(response.text)
                break
            except Exception:
                # Best-effort retry on any failure (request or proxy lookup),
                # but let KeyboardInterrupt / SystemExit propagate.
                print(url + "请求失败")
        if response is None:
            continue

        data = json.loads(response.text)
        if data['code'] != 1:
            continue

        # Identical for every entry of this keyword, so build it once.
        search_url = 'https://www.huoxing24.com/search/{}'.format(keyword)

        for entry in data['obj']['inforList']:
            titles = title_pattern.findall(entry['content'])
            if not titles:
                # No bracketed title to de-duplicate on; skip this entry.
                continue

            # The id is sliced as a string: its first 12 characters are used
            # as YYYYMMDDHHMM for the creation time.
            entry_id = entry['id']
            created_at = '{}-{}-{} {}:{}'.format(
                entry_id[:4], entry_id[4:6], entry_id[6:8],
                entry_id[8:10], entry_id[10:12])

            # A fresh dict per entry: nothing leaks from one entry to the next.
            item = {
                'post_title': titles[0],
                'created_at': created_at,
                'read_count': 0,
                'original_url': search_url,
                'page_url': search_url,
                'source_host': "",
                'screen_name': entry['author'],
                'text': entry['content'],
                'time': int(time.time()),
                'floor': int(entry_id[8:]),
                'column': task['key'],
                'platform': '火星财经',
                'column1': task['column1'],
                'originalPlatformId': task['originalPlatformId'],
                'keywordId': task['keywordId'],
                'reptileType': task['reptileType'],
                'contentType': task['contentType'],
            }

            # De-duplicate on title: only unseen titles are published and stored.
            if db.huoxing.find_one({'post_title': item['post_title']}) is not None:
                continue

            print(item)
            getTasks.post_data(item)
            # NOTE(review): `insert` is the legacy PyMongo call; confirm the
            # installed PyMongo version still provides it before upgrading.
            db.huoxing.insert(deepcopy(item))

    print('end')
def run() -> None:
    """Crawl huoxun quick-search results for every configured keyword and publish new entries.

    For each task returned by ``getTasks.getTasks().huoxun()`` the first page
    of the huoxun ``search_quick`` API is requested through a proxy address
    obtained from ``conn.randomOneIp('proxy:new_ip_list')``.  Every element of
    the response's ``data`` list is mapped to the item layout below.  Entries
    whose ``post_title`` is not yet found in ``db.huoxun`` are printed, handed
    to ``getTasks.post_data`` and inserted into ``db.huoxun``.

    A request is attempted up to five times with a fresh proxy each time.
    When every attempt fails the keyword is skipped.  The previous code fell
    through instead and either raised ``NameError`` (first keyword) or
    re-processed the previous keyword's response under the wrong keyword.

    Raises:
        Whatever ``json.loads`` or the key lookups raise when the response
        body does not have the expected shape; these are not handled here.
    """
    db = getTasks.getTasks().getMongo()
    keys = getTasks.getTasks().huoxun()
    headers = {
        "Connection": "close",
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cookie': 'PHPSESSID=0clbstiheafjj5gr2ob50p0rj6; Hm_lvt_f396f0424d21da4c5df398bf0ca78f23=1566180318; Hm_lvt_b7769c8d87ab17b2001f99ab6b37c33d=1566180318; Hm_lpvt_f396f0424d21da4c5df398bf0ca78f23=1566206928; Hm_lpvt_b7769c8d87ab17b2001f99ab6b37c33d=1566206928',
        'referer': 'https://huoxun.com/search.html',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
    }
    conn = connRedis.OPRedis()
    max_attempts = 5
    search_page = 'https://huoxun.com/search.html'

    for task in keys:
        url = 'https://huoxun.com/cms/api/search_quick.html?search_field=title&keyword={}&page=1'.format(
            str(task['key']))

        # Reset per keyword so a failed fetch can never reuse an older response.
        response = None
        for _ in range(max_attempts):
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies={'https': conn.randomOneIp('proxy:new_ip_list')},
                    timeout=3)
                break
            except Exception:
                # Best-effort retry on any failure (request or proxy lookup),
                # but let KeyboardInterrupt / SystemExit propagate.
                print(url + "请求失败")
        if response is None:
            continue

        data = json.loads(response.text)

        for entry in data['data']:
            # `update_time` is passed to time.localtime, i.e. it is treated
            # as epoch seconds and rendered in the machine's local time zone.
            created_at = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(entry['update_time']))

            # A fresh dict per entry: nothing leaks from one entry to the next.
            item = {
                'post_title': entry['title'],
                'created_at': created_at,
                'read_count': 0,
                'original_url': search_page,
                'page_url': search_page,
                'source_host': "",
                'screen_name': '匿名',
                'text': entry['des'],
                'time': int(time.time()),
                'floor': int(entry['id']),
                'column': task['key'],
                'platform': '火讯财经',
                'column1': task['column1'],
                'originalPlatformId': task['originalPlatformId'],
                'keywordId': task['keywordId'],
                'reptileType': task['reptileType'],
                'contentType': task['contentType'],
            }

            # De-duplicate on title: only unseen titles are published and stored.
            if db.huoxun.find_one({'post_title': item['post_title']}) is not None:
                continue

            print(item)
            getTasks.post_data(item)
            # NOTE(review): `insert` is the legacy PyMongo call; confirm the
            # installed PyMongo version still provides it before upgrading.
            db.huoxun.insert(deepcopy(item))

    print('success')