def getLink(url):
    proxyList = proxy.getProxy()
    for proxyItem in proxyList:
        print(f'use proxy {proxyItem["ip"]}:{proxyItem["port"]}')
        # '本書介紹' = "about this book" (section label passed to connect)
        response = connect('本書介紹', url, proxyItem)
        if response is None:
            print('connection is invalid')
            continue
        print('connection is valid')
        d = pyquery.PyQuery(response.text)
        # '內容簡介' = "book description"
        posts = d('div.type02_m057:contains("內容簡介")')
        print(posts)
        break
def getList(url):
    proxyList = proxy.getProxy()
    rows = []
    for proxyItem in proxyList:
        print(f'use proxy {proxyItem["ip"]}:{proxyItem["port"]}')
        # '即時榜' = "real-time bestseller list" (section label passed to connect)
        response = connect('即時榜', url, proxyItem)
        if response is None:
            print('connection is invalid')
            continue
        print('connection is valid')
        # open a binary file
        with open('book.b.html', 'wb') as f:
            # write the binary data
            f.write(response.content)
        d = pyquery.PyQuery(response.text)
        posts = d('ul.clearfix li.item')
        for idx, post in enumerate(posts.items()):
            if idx == 0:
                order = post('strong.no').text()
                title = post('h4').text()
                author = post('ul.msg').text()[3:]
                price = post('li.price_a').text()[4:]
                link = post('h4 a').attr('href')
                print('rank  : ', order)
                print('title : ', title)
                # print('author : ', author)
                # print(price)
                print('link  : ', link)
                print('----------------------')
                # getLink(link)
                rows.append({'order': order, 'title': title, 'link': link})
        text = json.dumps(rows, sort_keys=True, indent=4)
        with codecs.open('books.json', 'w', 'utf-8') as f:
            f.write(text)
        break
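# A minimal usage sketch for getList; the URL below is a placeholder for the
# bestseller page the selectors above target, not a value taken from the
# original project.
if __name__ == '__main__':
    getList('https://www.books.com.tw/web/sys_saletopb/books/')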
def setup():
    logger.info("Entered Setup")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    logger.debug(f"Created headers: {headers}")
    PROXY = getProxy()
    logger.debug(f"Created proxy: {PROXY}")
    proxies = {
        "httpProxy": PROXY,
        "ftpProxy": PROXY,
        "sslProxy": PROXY,
        "proxyType": "MANUAL",
    }
    logger.info(f"Setup is done: {[headers, proxies]}")
    return [headers, proxies]
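# A minimal sketch of how setup()'s return value might be consumed with
# requests; the fetch() helper and target URL are illustrative assumptions, and
# it is assumed getProxy() returns a plain "host:port" string.
import requests

def fetch(url):
    headers, proxy_caps = setup()
    requests_proxies = {
        'http': 'http://' + proxy_caps['httpProxy'],
        'https': 'http://' + proxy_caps['sslProxy'],
    }
    return requests.get(url, headers=headers, proxies=requests_proxies, timeout=10)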
from bs4 import BeautifulSoup as bs
from urllib import request as req

import proxy

PROXY_URL = proxy.getProxy()

main_array = []       # ALL ITEM ARRAYS ARE IN HERE AS A TUPLE
cathegory_array = []  # ITEM CATEGORIES ARE IN HERE
stats_array = []      # ITEM STATUS (SEEDERS AND LEECHERS) WILL BE HERE
item_link_array = []  # ITEM LINKS ARE HERE
item_name_array = []  # ITEM NAMES ARE HERE
item_date_array = []  # ITEM UPLOADED DATES ARE HERE
item_size_array = []  # ITEM SIZES ARE HERE


def search_item(search_request):
    # str.replace returns a new string, so the result has to be assigned
    search_term = search_request.replace(' ', '_')
    site_link = ''
    while True:
        for i in PROXY_URL:
            link = '{}/s/?q={}&category=0&page=0&orderby=99'.format(i, search_term)
            site_link = i
            try:
                source = req.urlopen(link).read()
                soup = bs(source, 'html.parser')
                main_item = soup.find_all('tr')
                result_count = soup.find('h2')
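# Hypothetical invocation of the (truncated) search_item function above; the
# query string is illustrative only.
if __name__ == '__main__':
    search_item('ubuntu 22.04 iso')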
def SaveScrape(baseurl, PageSaveFolder, ScrapeFile, Scrapewait, useProxy, **kwargs):
    _time = time.time()
    XMLsaveFile = "XML_scrape_" + datetime.datetime.now().strftime('%Y-%m-%d')
    xmlFile = PageSaveFolder + XMLsaveFile
    with open(xmlFile + '.xml', "w") as saveXML:
        print("blank xml created")

    # time.sleep(random.randint(1,10))
    # ua = UserAgent()
    # headers = {'User-Agent': str(ua.random)}
    headers = {'User-Agent': proxy.getHeader(random.randint(0, 249))}

    if useProxy != '':
        print("using previous proxy:", useProxy)
        r_proxy = useProxy
    else:
        r_proxy, prox_status = proxy.getProxy(ps_user="******", ps_pass="******",
                                              ps_host="172.22.114.65", ps_port="5432",
                                              ps_db="scrape_db", update=True)
        if prox_status == False:
            print('error getting proxy, quitting')
            sys.exit()

    # download the gzipped sitemap, retrying (and rotating the proxy) on failure
    _pass = False
    _loopcount = 0
    while _pass == False:
        try:
            response = requests.get(baseurl + ScrapeFile, headers=headers, timeout=Scrapewait,
                                    proxies={'http': 'http://' + r_proxy,
                                             'https': 'https://' + r_proxy})
            _pass = True
        except:  # requests.exceptions.Timeout:
            _waittime = random.randint(1, 9)
            print("count:", _loopcount, "-timeout, wait secs before retry:", _waittime)
            time.sleep(_waittime)
            _loopcount += 1
            if _loopcount >= 20:
                print("getting new proxy after 20 tries, link:", baseurl + ScrapeFile)
                r_proxy, prox_status = proxy.getProxy(ps_user="******", ps_pass="******",
                                                      ps_host="172.22.114.65", ps_port="5432",
                                                      ps_db="scrape_db", update=True)
                _loopcount = 0

    # save to gz
    gz_save_name = ScrapeFile[:-7] + '_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.gz'
    with open(PageSaveFolder + gz_save_name, 'wb') as gz_out:
        gz_out.write(response.content)
    time.sleep(5)

    # feast upon that rich gooey xml
    _xml_save = ScrapeFile[:-7] + '_' + datetime.datetime.now().strftime('%Y-%m-%d') + '.xml'
    _pass = False
    _loopcount = 0
    while _pass == False:
        try:
            with gzip.open(PageSaveFolder + gz_save_name, 'rb') as f_in:
                time.sleep(5)
                with open(PageSaveFolder + _xml_save, 'wb') as f_out:
                    time.sleep(5)
                    shutil.copyfileobj(f_in, f_out)
            tree = etree.parse(PageSaveFolder + _xml_save)
            with open(PageSaveFolder + _xml_save, "wb") as saveXML:
                saveXML.write(etree.tostring(tree, pretty_print=True))
            _pass = True
        except:
            _waittime = random.randint(1, 9)
            print("count:", _loopcount, "-error extracting file, wait secs before retry:", _waittime)
            time.sleep(_waittime)
            _loopcount += 1
            if _loopcount >= 20:
                print("20 tries, aborting")
                sys.exit()

    body = tree.xpath('//ns:url', namespaces={'ns': "http://www.sitemaps.org/schemas/sitemap/0.9"})
    _count = 1

    # now we parse and read, using lists instead of a DataFrame since it's a bunch faster
    list_lastmod = []
    list_url = []
    list_state = []
    list_proptype = []
    list_suburb = []
    list_propid = []
    for element in body:
        # if _count % 10000 == 0:
        #     print("interval:", str(_count - 1), " -total runtime:", time.time() - _time)
        list_lastmod.append(element[1].text)
        list_url.append(element[0].text)
        _splitval = ''
        if '-nsw-' in element[0].text:
            _splitval = '-nsw-'
        # elif '+nsw+' in element[0].text: _splitval = '+nsw+'
        elif '-qld-' in element[0].text:
            _splitval = '-qld-'
        # elif '+qld+' in element[0].text: _splitval = '+qld+'
        elif '-tas-' in element[0].text:
            _splitval = '-tas-'
        # elif '+tas+' in element[0].text: _splitval = '+tas+'
        elif '-act-' in element[0].text:
            _splitval = '-act-'
        # elif '+act+' in element[0].text: _splitval = '+act+'
        elif '-sa-' in element[0].text:
            _splitval = '-sa-'
        # elif '+sa+' in element[0].text: _splitval = '+sa+'
        elif '-nt-' in element[0].text:
            _splitval = '-nt-'
        # elif '+nt+' in element[0].text: _splitval = '+nt+'
        elif '-wa-' in element[0].text:
            _splitval = '-wa-'
        # elif '+wa+' in element[0].text: _splitval = '+wa+'
        elif '-vic-' in element[0].text:
            _splitval = '-vic-'
        # elif '+vic+' in element[0].text: _splitval = '+vic+'
        if _splitval != '':
            list_state.append(_splitval.replace('-', '').replace('+', ''))
            list_proptype.append(
                (element[0].text).split(_splitval)[0]
                .replace('https://www.realestate.com.au/property-', '')
                .replace('+', ' ')
            )
            list_suburb.append(
                (element[0].text).split(_splitval)[1]
                .replace('https://www.realestate.com.au/property-', '')
                .replace((element[0].text).split('-')[-1], '')
                .replace('-', ' ')
                .replace('+', ' ')
                .strip()
            )
        else:
            list_state.append('')
            list_proptype.append('')
            list_suburb.append('')
        list_propid.append((element[0].text).split('-')[-1])
        # _count += 1

    XML_gz_Dataset = pd.DataFrame(
        np.column_stack([list_lastmod, list_url, list_proptype, list_state, list_suburb, list_propid]),
        columns=['lastmod', 'url', 'proptype', 'state', 'suburb', 'prop_id'])
    csv_path = PageSaveFolder + '/parsed_csv/' + _xml_save[:-3] + '_results' + '.csv'
    XML_gz_Dataset.to_csv(csv_path)
    print("file saved to: " + csv_path)
    XML_gz_Dataset['lastmod'] = pd.to_datetime(XML_gz_Dataset['lastmod'])
    print("total xml time:", time.time() - _time)
    XML_gz_Dataset['parent_gz'] = XMLsaveFile
    XML_gz_Dataset['scrape_dt'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    XML_gz_Dataset['external_ip'] = r_proxy

    # now we add to db table
    # parent file link
    connection = psycopg2.connect(user="******", password="******", host="172.22.114.65",
                                  port="5432", database="scrape_db")
    cursor = connection.cursor()
    # with connection.cursor() as cursor:
    cursor.execute("""
        SELECT max(s_fileid) FROM sc_land.sc_source_file
        WHERE s_filename = %(s_filename)s AND date(lastmod) = %(lastmod)s;
        """,
        {'s_filename': XML_gz_Dataset['parent_gz'].drop_duplicates()[0],
         'lastmod': XML_gz_Dataset['lastmod'].dt.date.drop_duplicates()[0]})
    result = cursor.fetchone()
    print("parent file id for", ScrapeFile, "is:", result[0])
    XML_gz_Dataset['s_fileid'] = result[0]
    # remove redundant link
    XML_gz_Dataset = XML_gz_Dataset.drop(columns=['parent_gz'])

    # time to insert
    engine = create_engine('postgresql://*****:*****@172.22.114.65:5432/scrape_db')
    XML_gz_Dataset.to_sql(
        name='sc_property_links',
        schema='sc_land',
        con=engine,
        method=db_import.psql_insert_copy,
        if_exists='append',
        index=False
    )
    os.remove(PageSaveFolder + _xml_save)
    print("total runtime", time.time() - _time)
    print('----------------------------------------------------------------')
    return r_proxy
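# A minimal sketch of how SaveScrape might be invoked; the base URL, save
# folder and sitemap file name below are illustrative assumptions, not values
# taken from the original project (ScrapeFile is expected to end in ".xml.gz"
# because of the [:-7] slicing above).
if __name__ == '__main__':
    last_proxy = SaveScrape(
        baseurl='https://www.realestate.com.au/',
        PageSaveFolder='/data/scrape/',
        ScrapeFile='sitemap-buy-1.xml.gz',
        Scrapewait=30,
        useProxy='',  # empty string forces a fresh proxy lookup
    )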
def spider_with_class(cls, position, ip=0, date="ofo_temp"):
    mypos = []
    for item in position:
        if item['class'] == cls:
            mypos.append(item)
            mypos.append({"lng": item['lng'] + 0.005, "lat": item['lat']})
            mypos.append({
                "lng": item['lng'] + 0.005,
                "lat": item['lat'] + 0.005
            })
            mypos.append({"lng": item['lng'], "lat": item['lat'] + 0.005})
    # print("thread %d: %d raw points in total" % (cls, len(mypos)))
    # global history
    # record = []
    ip = proxy.getProxy()
    i = len(mypos)
    for pos in mypos:
        i = i - 1
        count = 0
        local = []
        localString = []
        info = 0
        try:
            info = spider.spider_single(pos['lng'], pos['lat'], ip)
        except:
            # the current proxy may have died: test it, replace it if needed, and retry
            if not proxy.testProxy(ip):
                proxy.deleteProxy(ip)
                ip = proxy.getProxy()
            try:
                info = spider.spider_single(pos['lng'], pos['lat'], ip)
            except:
                proxy.updateProxy()
                ip = proxy.getProxy()
                try:
                    info = spider.spider_single(pos['lng'], pos['lat'], ip)
                except:
                    print("thread %d exiting due to network problems, time: %s" %
                          (cls, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                    raise Exception  # note: the return below is unreachable
                    return 0
        lock.acquire()
        for bike in info['body']['bicycles']:
            posStr = str(bike['longitude']) + str(bike['latitude'])
            if not (posStr in history):
                result.append(bike)
                local.append(bike)
                count += 1
                if not (posStr in localString):
                    localString.append(posStr)
        history.extend(localString)
        global total_point
        total_point -= 1
        if total_point % 4000 == 0:
            print("%s %d points left in this run" %
                  (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), total_point))
        lock.release()
        if not save_in_db(local, date):
            while not save_in_db(local, date):
                print("thread %d: database connection failed" % (cls))
                # save_in_db(local)
        # print("thread %d: %d passes left, fetched %d, saved %d" % (cls, i, info['body']['total'], count))
    # lock.acquire()
    # result.extend(record)
    # lock.release()
    return
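# spider_with_class relies on a module-level lock (plus the shared history,
# result and total_point names created in the main loop below) that is not
# shown in these snippets; a minimal sketch of a compatible definition, as an
# assumption:
import threading

lock = threading.Lock()  # serializes updates to history, result and total_point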
# f = open("thread_test100.txt", "w")
# f.write(json.dumps(result))
# f.close()
# while 1:
#     if time.strftime("%H:%M", time.localtime()) == "00:00":
#         break
while 1:
    date = time.strftime("%Y-%m-%d", time.localtime())
    start_time = time.time()
    print("Starting a new query, current time: %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    start_time_string = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    proxy.getProxy()
    thread = []
    history = []
    result = []
    total_point = len(position) * 4
    for i in range(100):
        thread.append(
            threading.Thread(target=spider_with_class,
                             args=(i, position, 0, date)))
    for i in range(100):
        thread[i].setDaemon(True)
        thread[i].start()
    for i in range(100):
        thread[i].join()
if not status_manager.Manager.getStatus():
    self.__proxy = proxy.getProxy()
    ydl_opts = {
        'outtmpl': f'{fullpath}/{filename}/{filename}',
        'format': 'best',
        'proxy': self.__proxy['proxy']
    }
    logging.error("DOWNLOADING:USING_PROXY")
else:
    ydl_opts = {
        'outtmpl': f'{fullpath}/{filename}/{filename}',
        'format': 'best'
    }
    logging.error("DOWNLOADING:WITHOUT_PROXY")
with suppress_stdout():
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
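# suppress_stdout() is used above but not defined in this snippet; a minimal
# sketch of a compatible context manager, assuming it only needs to silence
# stdout for the duration of the download:
import contextlib
import os
import sys

@contextlib.contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout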