def insertdb(data):
    """Bulk-write a list of pymongo write operations into `collection`.

    Prints a success message with the current timestamp on success, or a
    duplicate message on failure (a duplicate `_id` raises BulkWriteError).
    """
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('添加完成' + downloadTime)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed; write failures still just log.
        print('重复添加' + downloadTime)
def insertdb(urls, title, aa, pub_time, downloadTime, aid, onlyIds):
    """Insert a single 雪球网 (Xueqiu) article document into `collection`.

    Parameters mirror the scraped article fields; the caller-supplied
    `downloadTime` is stored in the document, while a fresh timestamp is
    used only for the log message.
    """
    site = "雪球网"
    siteId = 1048420
    push_state = 0
    # Single-element list built directly instead of append-to-empty-list.
    data = [
        InsertOne({
            "url": urls,
            "title": title,
            "aid": aid,
            "content": aa,
            "site": site,
            "pub_time": pub_time,
            "push_state": push_state,
            "site_id": siteId,
            "download_Time": downloadTime,
            "only_id": onlyIds,
        })
    ]
    # NOTE(review): the stored download_Time is the caller's value; the log
    # timestamp below is taken now — confirm this asymmetry is intended.
    logTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('添加完成' + logTime)
    except Exception:
        # Narrowed from bare `except:`; duplicates raise BulkWriteError here.
        print('重复添加' + logTime)
def insertdb(data):
    """Bulk-write pymongo operations in `data`; report success or duplicate."""
    try:
        collection.bulk_write(data)
        print('添加完成')
    except Exception:
        # Removed unused `as err` binding and redundant trailing `pass`.
        print("添加重复")
def insertdb(datass):
    """Bulk-write pymongo operations in `datass`, logging with a timestamp."""
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(datass)
        print('添加完成' + downloadTime)
    except Exception:
        # Removed unused `as err` binding and redundant `pass`.
        # NOTE(review): the message '重' looks truncated (siblings print
        # '添加重复' / '重复添加') — left unchanged to preserve behavior.
        print("重")
def insertdb(data):
    """Bulk-write pymongo operations in `data`; log success or duplicate.

    Bug fixed: the original also called
    ``collection.update_one(data, {'$set': data}, upsert=True)`` after the
    bulk write. ``data`` is a list of InsertOne operations, not a filter
    mapping, so update_one always raised — meaning '重复添加' was printed
    even when the insert had succeeded. The broken call is removed.
    """
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('添加完成' + downloadTime)
    except Exception:
        # Narrowed from a bare `except:`.
        print('重复添加' + downloadTime)
def article():
    """Scrape article listings from api.hunan-show.com and insert each
    article into the Mongo `collection` (duplicates are logged, not raised).
    """
    # POST payloads: each selects one topic listing; '{}' is presumably the
    # default/home page — TODO confirm against the API.
    idd = [
        '{"page":1,"rows":10,"id":"59"}',
        '{}',
        '{"page":1,"rows":10,"id":"51"}',
        '{"page":1,"rows":10,"id":"58"}',
        '{"page":1,"rows":10,"id":"56"}'
    ]
    for datas in idd:
        try:
            datass = datas
            response = ss.post(
                'https://api.hunan-show.com/system/topicBase/getPageSetHome',
                headers=headers,
                data=datass)
            content = response.content.decode('utf-8')
            # NOTE: `id` shadows the builtin; it is the list of article uuids
            # extracted from the listing JSON.
            id = re.compile('"uuid":"(.*?)",').findall(str(content))
            for ids in id:
                try:
                    url = "https://api.hunan-show.com/system/topicBase/getDocDetailByUuid?uuid=" + ids
                    res = ss.get(url)
                    article = res.content.decode('utf-8')
                    print()
                    # Pull title / release time / body out of the detail JSON.
                    title = re.compile('"title":"(.*?)",').findall(
                        str(article))
                    pubtime = re.compile('"releaseTime":"(.*?)",').findall(
                        str(article))
                    content = re.compile('"content":"(.*?)",').findall(
                        str(article))
                    site = "中国(湖南)国际矿物宝石博览会"
                    siteId = 1049645
                    pushState = 0
                    downloadTime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data = []
                    data.append(
                        InsertOne({
                            "url": url,
                            "title": title[0],
                            "pub_time": pubtime[0],
                            "content": content[0],
                            "download_time": downloadTime,
                            "site": site,
                            "site_id": siteId,
                            "aid": ids,
                            'push_state': pushState,
                        }))
                    try:
                        collection.bulk_write(data)
                        print('添加完成')
                        print(downloadTime)
                    except Exception as err:
                        # Most likely a duplicate-key BulkWriteError.
                        print("添加重复")
                except Exception as err:
                    # Per-article failure (network/parse): back off, continue.
                    time.sleep(10)
                    pass
        except Exception as err:
            # Listing-level failure: back off, continue with next payload.
            time.sleep(10)
            pass
def insertdb(data):
    """Bulk-write pymongo operations in `data`; on failure print a full
    traceback plus a duplicate message with the timestamp."""
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('添加完成' + downloadTime)
    except Exception:
        # The original imported `traceback` but never used it; actually
        # print the traceback so non-duplicate failures are diagnosable.
        import traceback
        traceback.print_exc()
        print('添加重复' + downloadTime)
def safe_bulk_delete(collection: pymongo.collection.Collection, ids, id_key='_id'):
    """Delete all documents whose `id_key` value is in `ids`.

    Sometimes, when deleting a bunch of documents by identifier, the
    delete command document itself exceeds Mongo's 16MB limit.  This
    function catches that case and falls back to a bulk write of
    individual DeleteOne operations.
    """
    ids = list(set(ids))  # No need to repeat ourselves
    try:
        collection.delete_many({id_key: q.in_(*ids)})
    except pymongo.errors.DocumentTooLarge:
        # Use a bulk operation instead.
        # Note: this could be sped up further by batching the deletes,
        # but for now it's not worth it.
        bulk_ops = [pymongo.DeleteOne({id_key: entry_id}) for entry_id in ids]
        collection.bulk_write(bulk_ops)
def article():
    """Scrape the listing page of www.0735cs.com ('郴州城事'), fetch each
    article, and insert it into the Mongo `collection` (duplicates logged).
    """
    response = requests.get(
        'https://www.0735cs.com/article/list_20_1_0_0_1_1.html',
        headers=headers,
        cookies=cookies)
    content = response.content.decode('utf-8')
    # NOTE: `id` shadows the builtin; it is the list of article ids pulled
    # from 'article_<id>.html' links on the listing page.
    id = re.compile('article_(.*?).html').findall(str(content))
    for ids in id:
        try:
            url = 'https://www.0735cs.com/article/article_' + ids + '.html'
            res = requests.get(url)
            article = res.content.decode('utf-8')
            # Extract title / publish time / body from the article HTML.
            title = re.compile('<h3 .*?>(.*?)</h3>').findall(str(article))
            pubtime = re.compile('<font id="createtime">(.*?)</font>').findall(
                str(article))
            content = re.compile(
                '<div class="txt" id="resizeIMG">([\s\S]*?.)<div class="contentPadding" .*?>'
            ).findall(str(article))
            site = "郴州城事"
            siteId = 1049649
            pushState = 0
            downloadTime = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            data = []
            data.append(
                InsertOne({
                    "url": url,
                    "title": title[0],
                    "pub_time": pubtime[0],
                    "content": content[0],
                    "download_time": downloadTime,
                    "site": site,
                    "site_id": siteId,
                    "aid": ids,
                    'push_state': pushState,
                }))
            try:
                collection.bulk_write(data)
                print('添加完成')
                print(downloadTime)
            except Exception as err:
                # Most likely a duplicate-key BulkWriteError.
                print("添加重复")
        except Exception as err:
            # Per-article failure (network/parse/missing match) is swallowed
            # and the loop continues with the next id.
            print()
siteId = 1050145 pushState = 0 downloadTime = datetime.datetime.now().strftime( '%Y-%m-%d %H:%M:%S') data = [] data.append( InsertOne({ "url": urll, "title": title, "aid": urll, "content": strs, "site": site, "pub_time": pubTime, "push_state": pushState, "site_id": siteId, "download_Time": downloadTime })) try: collection.bulk_write(data) print('添加完成') print('下载时间' + downloadTime) print('发布时间' + pubTime) except Exception as err: print("添加重复") print('下载时间' + downloadTime) print('发布时间' + pubTime) except Exception as err: print() time.sleep(300)
def article():
    """Log in to Instagram with selenium, visit each keyword's tag page,
    scrape the posts found there, and insert them into `collection`.

    On any top-level failure the driver is quit and two restart shell
    scripts are invoked before swallowing the exception.
    """
    try:
        # NOTE: `list` shadows the builtin; keyword list from the
        # guanjianci module.
        list = guanjianci.key_list
        for ids in list:
            url = "https://www.instagram.com"
            urlss = "https://www.instagram.com/explore/tags/" + ids + "/"
            # chrome_options = webdriver.ChromeOptions()
            # chrome_options.add_argument('--no-sandbox')  # fixes "DevToolsActivePort file doesn't exist" error
            # chrome_options.add_argument('window-size=1920x3000')  # set browser resolution
            # chrome_options.add_argument('--disable-gpu')  # Google docs suggest this to work around a bug
            # chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars for some special pages
            # chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images, faster
            # chrome_options.add_argument('--headless')  # headless; required on Linux without a display
            # driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=chrome_options)
            option = webdriver.ChromeOptions()
            option.binary_location = 'C:\\Users\\86139\\AppData\\Local\\googles\\Chrome-bin\\chrome.exe'
            driver = webdriver.Chrome(
                'C:\\Users\\86139\\AppData\\Local\\googles\\Chrome-bin\\chromedriver.exe',
                options=option)
            driver.get(url)
            time.sleep(10)
            print('111')
            # Log in with hard-coded credentials, then open the tag page.
            username_xpath = '//input[@name="username"]'
            login_xpath = '//button[@class="sqdOP L3NKy y3zKF "]'
            password_xpath = '//input[@name="password"]'
            driver.find_element_by_xpath(username_xpath).send_keys(
                '+8615313137407')
            driver.find_element_by_xpath(password_xpath).send_keys('wqs159888')
            driver.find_element_by_xpath(login_xpath).click()
            time.sleep(10)
            driver.get(urlss)
            print('222')
            content = driver.page_source
            # response = requests.get('https://www.instagram.com/explore/grid/', headers=headers, params=params)
            # response = requests.get(url, headers=headers)
            # if response.status_code == 429:
            #     downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            #     print(downloadTime)
            #     time.sleep(86400)
            # content = response.content.decode('unicode-escape')
            # content = response.content
            # Post shortcodes from the tag page; `id` shadows the builtin.
            id = re.compile('"code":"(.*?)"').findall(str(content))
            idshuzu = set(id)
            from random import choice
            for articleid in idshuzu:
                # arr = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
                # arrs = choice(arr)
                # time.sleep(int(arrs))
                try:
                    imgs = ''
                    videourl = ''
                    articleurl = "https://www.instagram.com/p/" + articleid + "/"
                    driver.get(articleurl)
                    content = driver.page_source
                    # Caption text; strip lone surrogate escapes (\udXXX)
                    # before unicode-unescaping.
                    ac = re.compile('"text":"(.*?)"}').findall(str(content))
                    ac = ac[0]
                    ab = re.compile('(\\\\ud...)').findall(str(ac))
                    for te in ab:
                        ac = ac.replace(te, '')
                    if ac:
                        ac = ac.encode('utf-8',
                                       'replace').decode('unicode-escape')
                    else:
                        pass
                    # Unix timestamp -> local 'YYYY-mm-dd HH:MM:SS'.
                    pubTime = re.compile(
                        '"taken_at_timestamp":(.*?),"').findall(str(content))
                    timeArray = time.localtime(int(pubTime[0]))
                    pubTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
                    replyCount = re.compile(
                        '"edge_media_to_parent_comment":{"count":(.*?),"'
                    ).findall(str(content))
                    likeCount = re.compile(
                        '"edge_media_preview_like":{"count":(.*?),"').findall(
                            str(content))
                    # Build an HTML <img> tag per display_url.
                    imgCount = re.compile('"display_url":"(.*?)"').findall(
                        str(content))
                    for im in imgCount:
                        im = im.encode('utf-8',
                                       'replace').decode('unicode-escape')
                        imgs += "<br><img src=\'" + im + "\'></img>"
                    # Optional video tag (first video_url only).
                    videoCount = re.compile('"video_url":"(.*?)"').findall(
                        str(content))
                    if videoCount:
                        videourl = "<br><video src='" + videoCount[
                            0] + "' controls=" "></video>"
                        videourl = videourl.encode(
                            'utf-8', 'replace').decode('unicode-escape')
                    # Fall back to the caption when the <title> is the
                    # generic '\nInstagram\n' placeholder.
                    title = re.compile('<title>([\s\S]*?)</title>').findall(
                        str(content))
                    title = title[0]
                    if title == []:
                        title = ac
                    title = title
                    if title == '\nInstagram\n':
                        title = ac
                    articleContent = ac + '<br>' + imgs + '<br>' + videourl
                    site = "instagram"
                    siteId = 1049117
                    pushState = 0
                    downloadTime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data = []
                    data.append(
                        InsertOne({
                            "url": articleurl,
                            "title": title,
                            "pub_time": pubTime,
                            "content": articleContent,
                            "download_time": downloadTime,
                            "site": site,
                            "site_id": siteId,
                            "aid": articleid,
                            "only_id": articleid,
                            'push_state': pushState,
                            'like_num': int(likeCount[0]),
                            'cmt_num': int(replyCount[0]),
                        }))
                    # insertdb(data)
                    try:
                        collection.bulk_write(data)
                        print('添加完成')
                        print('下载时间' + downloadTime)
                        print('发布时间' + pubTime)
                    except Exception as err:
                        # Most likely a duplicate-key BulkWriteError.
                        print("添加重复")
                        print('下载时间' + downloadTime)
                        print('发布时间' + pubTime)
                except Exception as err:
                    import traceback
                    traceback.print_exc()
                    pass
            # Restart the browser between keywords via external scripts.
            driver.quit()
            os.system('/root/chromes.sh')
            os.system('/root/chromess.sh')
            # video: re video_url
            # img: re display_url — for videos this is the cover image
            print()
    except Exception as err:
        import traceback
        driver.quit()
        os.system('/root/chromes.sh')
        os.system('/root/chromess.sh')
        traceback.print_exc()
        pass
def article(headers2):
    """Search Facebook posts for each keyword in guanjianci.key_list via
    HTTP, scrape each permalink, and insert the posts into `collection`.

    `headers2` carries the (authenticated) request headers. If no
    permalinks are found the function sleeps 2h and stops the keyword loop
    (presumably a rate-limit signal — TODO confirm).
    """
    a = 0
    # NOTE: `list` shadows the builtin; `a` is incremented once, unused.
    list = guanjianci.key_list
    a = a + 1
    for lis in list:
        try:
            from random import choice
            par = dict(params)
            par['q'] = lis
            # Fixed 50s pause between searches (choice over one element).
            arr = [50]
            arrs = choice(arr)
            time.sleep(int(arrs))
            response = ss.get('https://www.facebook.com/search/posts',
                              params=par,
                              headers=headers2)
            print(response.status_code)
            # if response.status_code != 200:
            #     headers2 = headers1
            #     pass
            content = response.content.decode('utf-8')
            # NOTE: `id` shadows the builtin and is never used afterwards.
            id = re.compile('"id":"vm-(.*?):').findall(str(content))
            url = re.compile('"permalink":"(.*?)"').findall(str(content))
            if url == []:
                print('进入休眠')
                time.sleep(7200)
                break
            for ur in url:
                try:
                    urls = str(ur).replace('\\', '')
                    # urls = 'https://www.facebook.com/groups/2337886349768125/posts/2883216531901768'
                    arrs = choice(arr)
                    time.sleep(int(arrs))
                    res = ss.get(urls, headers=headers2)
                    article = res.content.decode('utf-8')
                    # Parallel lists extracted from the post page JSON.
                    articles = re.compile('"wwwURL":"(.*?)"').findall(
                        str(article))
                    times = re.compile('"creation_time":(.*?),').findall(
                        str(article))
                    likeCount = re.compile(
                        '"reaction_count":{"count":(.*?),"').findall(
                            str(article))
                    title = re.compile('"message":{"text":"(.*?)"},"').findall(
                        str(article))
                    for urs, ti, like, til in zip(articles, times, likeCount,
                                                  title):
                        try:
                            ac = ''
                            # Strip lone surrogate escapes (\udXXX) and a
                            # trailing backslash before unicode-unescaping.
                            ab = re.compile('(\\\\ud...)').findall(str(til))
                            for te in ab:
                                til = til.replace(te, '')
                            if til[-1] == '\\':
                                til = til[:-1]
                            tils = til.encode(
                                'utf-8', 'replace').decode('unicode-escape')
                            urss = str(urs).replace('\\', '')
                            # Unix timestamp -> local 'YYYY-mm-dd HH:MM:SS'.
                            timeArray = time.localtime(int(ti))
                            pubTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                    timeArray)
                            arcontent = tils
                            site = "Facebook"
                            siteId = 1049117
                            pushState = 0
                            downloadTime = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            data = []
                            data.append(
                                InsertOne({
                                    "url": urss,
                                    "title": tils,
                                    "pub_time": pubTime,
                                    "content": arcontent,
                                    "download_time": downloadTime,
                                    "site": site,
                                    "site_id": siteId,
                                    "aid": urss,
                                    'push_state': pushState,
                                    'like_num': int(like),
                                }))
                            try:
                                collection.bulk_write(data)
                                print('添加完成')
                                print('下载时间' + downloadTime)
                                print('发布时间' + pubTime)
                            except Exception as err:
                                # Most likely a duplicate-key BulkWriteError.
                                print("添加重复")
                                print('下载时间' + downloadTime)
                                print('发布时间' + pubTime)
                        except Exception as err:
                            import traceback
                            traceback.print_exc()
                            pass
                except Exception as err:
                    import traceback
                    traceback.print_exc()
                    pass
        except Exception as err:
            import traceback
            traceback.print_exc()
            pass
def insertdb(data):
    """Bulk-write pymongo operations in `data`; log success or duplicate."""
    try:
        collection.bulk_write(data)
        print('添加完成')
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are no longer swallowed.
        print('重复添加')
def article(browser):
    """Drive a logged-in Facebook session with selenium: search each
    keyword, filter to recent posts, scroll, scrape group-post links, and
    insert each post into `collection`.

    On failure at any level the browser is quit and `login()` is called to
    re-establish the session.
    """
    import random
    a = 0
    # NOTE: `list` shadows the builtin; `a` is incremented then reused as a
    # loop index below.
    list = guanjianci.key_list
    a = a + 1
    for lis in list:
        print("333")
        # Type the keyword into the search box (class-chain xpath is
        # brittle against Facebook UI changes) and submit with ENTER.
        browser.find_element_by_xpath(
            "//input[@class='oajrlxb2 rq0escxv f1sip0of hidtqoto e70eycc3 lzcic4wl hzawbc8m ijkhr0an aaoq3grb sgqwj88q b3i9ofy5 oo9gr5id b1f16np4 hdh3q7d8 dwo3fsh8 qu0x051f esr5mh6w e9989ue4 r7d6kgcz br7hx15l h2jyy9rg n3ddgdk9 owxd89k7 ihxqhq3m jq4qci2q k4urcfbm iu8raji3 qypqp5cg l60d2q6s hv4rvrfc hwnh5xvq ez2duhqw rmlgq0sb dzqu5etb aj8hi1zk r4fl40cc kd8v7px7 m07ooulj mzan44vs']"
        ).send_keys(lis)
        browser.find_element_by_xpath("//input[@value='" + lis +
                                      "']").send_keys(Keys.ENTER)
        time.sleep(3)
        # Click the '帖子' (Posts) tab, then the '近期帖子' (recent posts)
        # filter, via JS clicks.
        next_btn = browser.find_element_by_xpath("//*[text()='帖子']")
        browser.execute_script("arguments[0].click();", next_btn)
        time.sleep(10)
        next_btntwo = browser.find_element_by_xpath(
            "//input[@aria-label='近期帖子']")
        browser.execute_script("arguments[0].click();", next_btntwo)
        time.sleep(10)
        print("444")
        # Scroll ten times to load more results (reuses `a` as index).
        for a in range(10):
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(3)
        # NOTE(review): these early returns fire on the first iteration for
        # short keyword lists, skipping the scrape entirely — confirm intent.
        length = len(list)
        if length < 1:
            return ''
        if length == 1:
            return str(list[0])
        try:
            from random import choice
            # par = dict(params)
            # par['q'] = lis
            # arr = [30,10,20,40,50]
            # arrs = choice(arr)
            # time.sleep(int(arrs))
            # browser.get('https://www.facebook.com/search/posts?filters=eyJyZWNlbnRfcG9zdHM6MCI6IntcIm5hbWVcIjpcInJlY2VudF9wb3N0c1wiLFwiYXJnc1wiOlwiXCJ9In0=&q='+str(list[randomNumber]))
            content = browser.page_source
            # (group_id, post_id) pairs from result links.
            url = re.compile(
                '<a class="oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 a8c37x1j p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl gmql0nx0 p8dawk7l" href="https://www.facebook.com/groups/(.*?)/posts/(.*?)/" role="link" tabindex="0">'
            ).findall(str(content))
            if url == []:
                # No results usually means the session died — re-login.
                login()
            for ur in url:
                print("555")
                try:
                    # urls = str(ur).replace('\\', '')
                    # arrs = choice(arr)
                    # time.sleep(int(arrs))
                    # e.g. https://www.facebook.com/groups/334036650079422/posts/1955843984565339/
                    urls = "https://www.facebook.com/groups/" + ur[
                        0] + "/posts/" + ur[1] + "/"
                    next_btno = browser.find_element_by_xpath("//a[@href='" +
                                                              urls + "']")
                    browser.execute_script("arguments[0].click();", next_btno)
                    time.sleep(10)
                    print("123")
                    article = browser.page_source
                    times = re.compile('"item_logging_id":"(.*?):').findall(
                        str(article))
                    title = re.compile('<title>(.*?)</title>').findall(
                        str(article))
                    con = re.compile(
                        '<div dir="auto".*?>(.*?)<div.*?data-visualcompletion="ignore-dynamic">'
                    ).findall(str(article))
                    print(title)
                    try:
                        # ab = re.compile('(\\\\ud...)').findall(str(til))
                        # for te in ab:
                        #     til = til.replace(te, '')
                        # if til[-1] == '\\':
                        #     til = til[:-1]
                        # tils = til.encode('utf-8', 'replace').decode('unicode-escape')
                        # urss = str(urs).replace('\\', '')
                        # timeArray = time.localtime(int(times[0]))
                        # item_logging_id prefix used as a date; padded to a
                        # full timestamp.
                        pubTime = times[0] + " 00:00:00"
                        # NOTE(review): `con` is the full findall list, so
                        # `content` is stored as a list — confirm intended.
                        arcontent = con
                        site = "Facebook"
                        siteId = 1049117
                        pushState = 0
                        downloadTime = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        data = []
                        data.append(
                            InsertOne({
                                "url": urls,
                                "title": title[0],
                                "pub_time": pubTime,
                                "content": arcontent,
                                "download_time": downloadTime,
                                "site": site,
                                "site_id": siteId,
                                "aid": urls,
                                'push_state': pushState,
                            }))
                        print("333")
                        try:
                            collection.bulk_write(data)
                            print('添加完成')
                            print('下载时间' + downloadTime)
                            print('发布时间' + pubTime)
                        except Exception as err:
                            # Most likely a duplicate-key BulkWriteError.
                            print("添加重复")
                            print('下载时间' + downloadTime)
                            print('发布时间' + pubTime)
                    except Exception as err:
                        browser.quit()
                        login()
                except Exception as err:
                    browser.quit()
                    login()
        except Exception as err:
            browser.quit()
            login()