def test_setup():
    """Test setup of TorCrawler.

    There are some major issues with testing multiple Controllers, i.e.
    testing multiple instances of TorCrawler. For some reason, when a stem
    Controller is instantiated and connects via SOCKS to my control port,
    that port remains in use even after the Controller instance is killed or
    otherwise dies. This means that within the py.test-3 thread I can't spin
    up new TorCrawlers, which is a major pain.

    For now, the workaround is to store a working TorCrawler in a global
    variable; a multithreaded solution can come later (not sure if that
    would even work).
    """
    stopTor()
    time.sleep(1)

    # Instantiation should fail if Tor is not running.
    with pytest.raises(EnvironmentError):
        TorCrawler()

    # Boot Tor and build the crawler that the rest of the tests will share.
    startTor()
    time.sleep(1)
    c4 = TorCrawler(test_rotate=True, n_requests=3)

    global TOR_CRAWLER
    TOR_CRAWLER = c4
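# A minimal sketch (not from the original test file) of how a later test could
# reuse the global TOR_CRAWLER that test_setup() stores, given the
# one-Controller-per-process limitation described in the docstring above.
# The test name is hypothetical; .get() and .ip are the same TorCrawler
# attributes used elsewhere in these examples.
def test_reuses_global_crawler():
    assert TOR_CRAWLER is not None
    soup = TOR_CRAWLER.get("http://www.httpbin.org/ip")
    assert soup is not None
    assert TOR_CRAWLER.ip is not None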
def main():
    crawler = TorCrawler()
    url = 'https://api.foursquare.com/v2/venues/502aa937e4b0be57fd4cac73?m=swarm'
    data = crawler.get(url)
    print(data)
    print(crawler.ip)

    # Restart the tor service to force a new circuit, then check the IP again.
    os.system("/etc/init.d/tor restart")
    time.sleep(10)
    print(crawler.ip)
def worker(workerQueue, session, page_num):
    ip_text = ''
    crawler = TorCrawler()
    url = ("https://www.imdb.com/list/ls055462533/"
           "?ref_=tt_rls_2&sort=release_date,desc&st_dt=&mode=detail&page="
           + str(page_num))
    print('page : ', page_num)
    print('url : ', url)

    with concurrent.futures.ThreadPoolExecutor(5) as thread_ex:
        f_t_u = {thread_ex.submit(seq, url, crawler, session): url}
        for x in concurrent.futures.as_completed(f_t_u):
            code = x.result()

            # Print the current exit IP whenever it changes.
            ip = session.get('http://www.httpbin.org/ip')
            if ip.text != ip_text:
                ip_text = ip.text
                print(ip_text)

            try:
                if code == 200:
                    page = session.get(f_t_u[x])
                    page_soup = BeautifulSoup(page.text, 'html.parser')
                    get_title(multiprocessing.current_process().name,
                              page_soup, f_t_u[x])
            except Exception as e:
                print(e)
                continue
def worker(workerQueue, session, lst):
    crawler = TorCrawler()
    lst = ["http://www.imdb.com/title/" + i for i in lst]

    with concurrent.futures.ThreadPoolExecutor(5) as thread_ex:
        f_t_u = {
            thread_ex.submit(seq, url, crawler, session): url
            for url in lst
        }
        for x in concurrent.futures.as_completed(f_t_u):
            code = x.result()
            try:
                if code == 200:
                    page = session.get(f_t_u[x])
                    page_soup = BeautifulSoup(page.text, 'html.parser')
                    get_basic_details(multiprocessing.current_process().name,
                                      page_soup, f_t_u[x])
            except Exception as e:
                print(e)
                continue
def worker(workerQueue, session, lst):
    crawler = TorCrawler()
    lst = [
        "http://www.imdb.com/title/tt" + '%08d' % (i)
        + "/reviews?spoiler=hide&sort=reviewVolume&dir=desc&ratingFilter=0"
        for i in lst
    ]

    with concurrent.futures.ThreadPoolExecutor(5) as thread_ex:
        f_t_u = {
            thread_ex.submit(seq, url, crawler, session): url
            for url in lst
        }
        for x in concurrent.futures.as_completed(f_t_u):
            try:
                code = x.result()
                if code == 200:
                    # Log successfully fetched URLs to a per-process CSV.
                    with open(multiprocessing.current_process().name + '.csv',
                              'a') as file:
                        print(multiprocessing.current_process().name, ' => ',
                              f_t_u[x])
                        file.write(f_t_u[x] + "\n")
                elif code == 503:
                    # Log throttled (503) URLs separately so they can be retried.
                    with open(multiprocessing.current_process().name + '-503.csv',
                              'a') as file:
                        print(multiprocessing.current_process().name, ' => ', code)
                        file.write(f_t_u[x] + "\n")
                else:
                    print(multiprocessing.current_process().name, ' => ', code)
            except requests.HTTPError:
                continue
            except requests.ConnectTimeout:
                continue
            except requests.ConnectionError:
                continue
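# Hypothetical driver for the worker() above, sketching how the per-process CSV
# names (multiprocessing.current_process().name) come about. The ID ranges,
# chunking, and shared requests.Session are assumptions for illustration, not
# taken from the original project.
if __name__ == '__main__':
    import multiprocessing
    import requests

    session = requests.Session()
    id_chunks = [range(1, 1001), range(1001, 2001)]  # illustrative title IDs
    queue = multiprocessing.Queue()                  # matches worker's signature

    procs = [
        multiprocessing.Process(target=worker,
                                args=(queue, session, list(chunk)))
        for chunk in id_chunks
    ]
    for p in procs:
        p.start()
    for p in procs:
        p.join()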
from bs4 import BeautifulSoup
import numpy as np
import csv
import re

from TorCrawler import TorCrawler

crawler = TorCrawler(ctrl_pass='******')


def write_csv(data):
    with open('apartments_cian.csv', 'a', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow((data['rooms'], data['square'], data['living_space'],
                         data['kitchen_space'], data['curr_floor'],
                         data['max_floor'], data['home_type'],
                         data['build_year'], data['district'],
                         data['serv_lift'], data['pass_lift'], data['parking'],
                         data['loggia'], data['balcony'],
                         data['garbage_chute'], data['bathroom'],
                         data['repair_type'],
class WebDirectoryCrawler(object):

    def __init__(self, sitebase, savebase=None, delay=5, retrylimit=5,
                 ctrl_pass=None, n_requests=5, use_tor=False, headers={}):
        self.sitebase = sitebase
        self.savebase = savebase
        self.delay = delay
        self.headers = headers
        self.retrylimit = retrylimit
        self.n_requests = n_requests
        self.crawler = TorCrawler(ctrl_pass=ctrl_pass,
                                  n_requests=n_requests,
                                  use_tor=use_tor)

    def get_content(self, url, use_bs, retrylimit=5):
        """Fetch a URL through TorCrawler, retrying up to self.retrylimit times."""
        retry = 0
        while retry < self.retrylimit:
            try:
                self.crawler.use_bs = use_bs
                return self.crawler.get(url, headers=self.headers)
            except Exception as err:
                print(err)
                print("retry..: ", retry + 1)
                time.sleep(2)
                retry += 1
        return None

    def download_file(self, cwd, savepath, filename):
        if cwd != '/' and cwd[-1] != "/":
            cwd += '/'
        url = self.sitebase + cwd + filename
        req = self.get_content(url, use_bs=False)
        if not req:
            print("download fail: " + filename)
            return
        with open(savepath + filename, 'wb') as f:
            f.write(req.content)

    def recursive_listing(self, cwd):
        savepath = None
        if cwd != '/' and cwd[-1] != "/":
            cwd += '/'
        url = self.sitebase + cwd
        req = self.get_content(url, use_bs=True)
        if not req:
            print("crawler error")
            return

        cwd, listing = htmllistparse.parse(req)
        if cwd is None:
            print("It does not seem to be a 'Web Directory'.")
            return
        if cwd != '/' and cwd[-1] != "/":
            cwd += '/'

        if self.savebase:
            savepath = self.savebase + cwd
            try:
                print("Create directory: " + savepath)
                if not os.path.exists(savepath):
                    os.makedirs(savepath)
            except Exception as err:
                print("Cannot create directory..: " + str(err))
                sys.exit(0)

        for f in listing:
            if f.name[-1] != "/":
                print(cwd + f.name)
                if self.savebase:
                    self.download_file(cwd, savepath, f.name)
            else:
                print(cwd + f.name)
                self.recursive_listing(cwd + f.name)
            time.sleep(self.delay)
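# A usage sketch for the WebDirectoryCrawler class above. The directory URL
# and save path are placeholders, not real targets; everything else uses the
# constructor arguments the class already defines.
if __name__ == '__main__':
    wdc = WebDirectoryCrawler(
        sitebase="http://example.com/files",  # hypothetical open web directory
        savebase="./mirror",                  # local root to mirror files into
        delay=5,
        ctrl_pass=None,
        n_requests=5,
        use_tor=True,
    )
    # Recursively list the directory starting at '/', downloading each file
    # under savebase and sleeping `delay` seconds between requests.
    wdc.recursive_listing('/')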
params5 = dict(
    client_id='W3K2WFYXXNW5DZIIOSQHOAGS4WVGHEQILLZMK10KKQ0Q3H4A',
    client_secret='0O52YL2LI4ZZVTV1EJ1NRUIXTU35LYN24DUSU2PSVRQFJE3W',
    v='20180323',
    limit=1)

params6 = dict(
    client_id='T5H1DGJAVQPKQNPHKSEYXVXRFQGZRMLCLBC5X0KXQKN2XHFZ',
    client_secret='4DEZN2E4HPDZAKYJG2T3ATH3LQKEEPAIPVPQ1WEIDT4BAK11',
    v='20180323',
    limit=1)

vetParams = [params3, params2, params1, params4, params5, params6]

crawler = TorCrawler()
# client = MongoClient()
# db = client.curitiba

today = str(date.today())
date = {'data consulta': today}  # 'data consulta' is Portuguese for "query date"


def verificaLastID():
    # Return the last saved ID, or 1 if the file does not exist yet.
    try:
        with open('lastIDCuritiba.txt', 'r') as opened:
            return opened.readline()
    except Exception:
        return 1
import time

from TorCrawler import TorCrawler

c = TorCrawler()

# Continuously check the current exit IP and rotate to a new Tor circuit.
while True:
    c.check_ip()
    c.rotate()
import requests
from TorCrawler import TorCrawler
import time
import os

params = dict(
    client_id='T5H1DGJAVQPKQNPHKSEYXVXRFQGZRMLCLBC5X0KXQKN2XHFZ',
    client_secret='4DEZN2E4HPDZAKYJG2T3ATH3LQKEEPAIPVPQ1WEIDT4BAK11',
    v='20180323',
    limit=1
)

crawler = TorCrawler()
# data = crawler.get("https://api.foursquare.com/v2/venues/502aa937e4b0be57fd4cac73?m=swarm", headers=params)

while True:
    data = crawler.get("https://www.tudogostoso.com.br/receita/31593-pudim-de-leite-condensado.html")
    print(crawler.ip)
    os.system("/etc/init.d/tor restart")
    time.sleep(1)

# Make a GET request (returns a BeautifulSoup object unless use_bs=False).
# TorCrawler will, by default, rotate every n_requests.
# If you want to manually rotate your IP, you can do that any time:
# crawler.rotate()
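# Standalone sketch (separate from the loop above) of the two options the
# comments mention, using only attributes seen elsewhere in these examples:
# the use_bs flag toggled as in WebDirectoryCrawler, and rotate() as in the
# rotation loop. The assumption is that with use_bs disabled, get() returns a
# raw requests-style response rather than BeautifulSoup.
crawler.use_bs = False
raw = crawler.get("http://www.httpbin.org/ip")
print(raw.content)          # raw body instead of a parsed BeautifulSoup object
crawler.use_bs = True

crawler.rotate()            # force a new circuit right now
print(crawler.ip)           # the exit IP should generally change after rotating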