Example 1
def test_setup():
    """
    Test setup of TorCrawler.

    There are some major issues with testing multiple Controllers, i.e.
    testing multiple instances of TorCrawler.

    For some reason, when a stem Controller is instantiated and connects
    via SOCKS to my control port, that port remains in use even after the
    Controller instance is killed or otherwise dies. This means that in the
    thread of py.test-3, I can't spin up new TorCrawlers, which is a major pain.

    For now, the workaround will be to throw a successful TorCrawler up to
    a global variable, but I can work on a multithreaded solution later (not
    sure if that would even work).
    """
    stopTor()
    time.sleep(1)

    # Should fail if Tor is not running.
    with pytest.raises(EnvironmentError):
        TorCrawler()  # constructing should raise since Tor is down

    # Boot Tor
    startTor()
    time.sleep(1)
    c4 = TorCrawler(test_rotate=True, n_requests=3)

    global TOR_CRAWLER
    TOR_CRAWLER = c4
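A sketch of how a later test could consume the global set up above, assuming only the TOR_CRAWLER global and the rotate()/ip members seen in the other examples (test_rotation itself is a hypothetical name):

def test_rotation():
    # Reuse the crawler stashed by test_setup instead of spinning up a new one.
    c = TOR_CRAWLER
    old_ip = c.ip
    c.rotate()  # request a new Tor circuit
    # Exit nodes can occasionally repeat, so treat this as best-effort.
    assert c.ip != old_ip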
Example 2
import os
import time

from TorCrawler import TorCrawler


def main():
    crawler = TorCrawler()

    url = 'https://api.foursquare.com/v2/venues/502aa937e4b0be57fd4cac73?m=swarm'
    data = crawler.get(url)
    print(data)
    print(crawler.ip)

    #os.system("sudo su")
    #os.system("asdf321")
    os.system("/etc/init.d/tor restart")
    #os.system("su willdoliver")
    time.sleep(10)
    print(crawler.ip)
Example 3
def worker(workerQueue, session, page_num):
    ip_text = ''
    crawler = TorCrawler()
    url = "https://www.imdb.com/list/ls055462533/?ref_=tt_rls_2&sort=release_date,desc&st_dt=&mode=detail&page=" + str(
        page_num)
    print('page : ', page_num)
    print('url : ', url)
    with concurrent.futures.ThreadPoolExecutor(5) as thread_ex:
        # Map the submitted future back to its URL so results can be matched up.
        f_t_u = {thread_ex.submit(seq, url, crawler, session): url}
        for x in concurrent.futures.as_completed(f_t_u):
            code = x.result()

            ip = session.get('http://www.httpbin.org/ip')
            if ip.text != ip_text:
                ip_text = ip.text
                print(ip_text)

            try:
                if code == 200:
                    page = session.get(f_t_u[x])
                    page_soup = BeautifulSoup(page.text, 'html.parser')
                    get_title(multiprocessing.current_process().name,
                              page_soup, f_t_u[x])
            except Exception as e:
                print(e)
                continue
Example 4
def __init__(self,
             sitebase,
             savebase=None,
             delay=5,
             retrylimit=5,
             ctrl_pass=None,
             n_requests=5,
             use_tor=False,
             headers=None):  # use None instead of a shared mutable {} default
    self.sitebase = sitebase
    self.savebase = savebase
    self.delay = delay
    self.headers = headers if headers is not None else {}
    self.retrylimit = retrylimit
    self.n_requests = n_requests
    self.crawler = TorCrawler(ctrl_pass=ctrl_pass,
                              n_requests=n_requests,
                              use_tor=use_tor)
Example 5
def worker(workerQueue, session, lst):
    crawler = TorCrawler()
    lst = ["http://www.imdb.com/title/" + i for i in lst]
    with concurrent.futures.ThreadPoolExecutor(5) as thread_ex:
        f_t_u = {
            thread_ex.submit(seq, url, crawler, session): url
            for url in lst
        }
        for x in concurrent.futures.as_completed(f_t_u):
            code = x.result()
            try:
                if code == 200:
                    page = session.get(f_t_u[x])
                    page_soup = BeautifulSoup(page.text, 'html.parser')
                    get_basic_details(multiprocessing.current_process().name,
                                      page_soup, f_t_u[x])
            except Exception as e:
                print(e)
                continue
Example 6
def worker(workerQueue, session, lst):
    crawler = TorCrawler()
    lst = [
        "http://www.imdb.com/title/tt" + '%08d' % (i) +
        "/reviews?spoiler=hide&sort=reviewVolume&dir=desc&ratingFilter=0"
        for i in lst
    ]
    with concurrent.futures.ThreadPoolExecutor(5) as thread_ex:
        f_t_u = {
            thread_ex.submit(seq, url, crawler, session): url
            for url in lst
        }
        for x in concurrent.futures.as_completed(f_t_u):
            #time.sleep(2)
            try:
                code = x.result()
                if code == 200:
                    with open(multiprocessing.current_process().name + '.csv',
                              'a') as file:
                        print(multiprocessing.current_process().name, ' => ',
                              f_t_u[x])
                        file.write(f_t_u[x] + "\n")
                elif code == 503:
                    with open(
                            multiprocessing.current_process().name +
                            '-503.csv', 'a') as file:
                        print(multiprocessing.current_process().name, ' => ',
                              code)
                        file.write(f_t_u[x] + "\n")
                else:
                    print(multiprocessing.current_process().name, ' => ', code)
            except (requests.HTTPError, requests.ConnectTimeout,
                    requests.ConnectionError):
                continue
Example 7
from bs4 import BeautifulSoup
import numpy as np
import csv
import re
from TorCrawler import TorCrawler

crawler = TorCrawler(ctrl_pass='******')

def write_csv(data):
    with open('apartments_cian.csv', 'a', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow((data['rooms'],
                         data['square'],
                         data['living_space'],
                         data['kitchen_space'],
                         data['curr_floor'],
                         data['max_floor'],
                         data['home_type'],
                         data['build_year'],
                         data['district'],
                         data['serv_lift'],
                         data['pass_lift'],
                         data['parking'],
                         data['loggia'],
                         data['balcony'],
                         data['garbage_chute'],
                         data['bathroom'],
                         data['repair_type']))
Example 8
import os
import sys
import time

import htmllistparse
from TorCrawler import TorCrawler


class WebDirectoryCrawler(object):
    def __init__(self,
                 sitebase,
                 savebase=None,
                 delay=5,
                 retrylimit=5,
                 ctrl_pass=None,
                 n_requests=5,
                 use_tor=False,
                 headers=None):  # use None instead of a shared mutable {} default
        self.sitebase = sitebase
        self.savebase = savebase
        self.delay = delay
        self.headers = headers if headers is not None else {}
        self.retrylimit = retrylimit
        self.n_requests = n_requests
        self.crawler = TorCrawler(ctrl_pass=ctrl_pass,
                                  n_requests=n_requests,
                                  use_tor=use_tor)

    def get_content(self, url, use_bs):
        retry = 0
        while retry < self.retrylimit:
            try:
                # Toggle BeautifulSoup parsing on the crawler, then fetch once.
                self.crawler.use_bs = use_bs
                return self.crawler.get(url, headers=self.headers)
            except Exception as err:
                print(err)
                print("retry..: ", retry + 1)
                time.sleep(2)
                retry += 1
        return None

    def download_file(self, cwd, savepath, filename):
        if cwd != '/' and cwd[-1] != "/":
            cwd += '/'
        url = self.sitebase + cwd + filename

        req = self.get_content(url, use_bs=False)
        if not req:
            print("download fail: " + filename)
            return
        with open(savepath + filename, 'wb') as f:
            f.write(req.content)

    def recursive_listing(self, cwd):
        savepath = None
        if cwd != '/' and cwd[-1] != "/":
            cwd += '/'
        url = self.sitebase + cwd
        req = self.get_content(url, use_bs=True)
        if not req:
            print("crawler error")
            return
        cwd, listing = htmllistparse.parse(req)
        if cwd is None:
            print("It does not seem to be 'Web Directory'.")
            return
        if cwd != '/' and cwd[-1] != "/":
            cwd += '/'
        if self.savebase:
            savepath = self.savebase + cwd
            try:
                print("Create directory: " + savepath)
                if not os.path.exists(savepath):
                    os.makedirs(savepath)
            except Exception as err:
                print("Cannot create directory..: " + str(err))
                sys.exit(1)

        for f in listing:
            print(cwd + f.name)
            if f.name[-1] != "/":
                if self.savebase:
                    self.download_file(cwd, savepath, f.name)
            else:
                self.recursive_listing(cwd + f.name)
            time.sleep(self.delay)
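A minimal usage sketch for the class above; the site URL and save path are illustrative placeholders, not part of the original example:

# Hypothetical driver for WebDirectoryCrawler; URL and paths are placeholders.
if __name__ == '__main__':
    wdc = WebDirectoryCrawler(sitebase='http://example.com',
                              savebase='./mirror',  # mirror files locally
                              delay=5,
                              n_requests=5,         # rotate IP every 5 requests
                              use_tor=True)
    # Walk the directory listing from the root, downloading files as it goes.
    wdc.recursive_listing('/')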
Example 9
params5 = dict(
    client_id='W3K2WFYXXNW5DZIIOSQHOAGS4WVGHEQILLZMK10KKQ0Q3H4A',
    client_secret='0O52YL2LI4ZZVTV1EJ1NRUIXTU35LYN24DUSU2PSVRQFJE3W',
    v='20180323',
    limit=1)

params6 = dict(
    client_id='T5H1DGJAVQPKQNPHKSEYXVXRFQGZRMLCLBC5X0KXQKN2XHFZ',
    client_secret='4DEZN2E4HPDZAKYJG2T3ATH3LQKEEPAIPVPQ1WEIDT4BAK11',
    v='20180323',
    limit=1)

vetParams = [params3, params2, params1, params4, params5, params6]

crawler = TorCrawler()
#client = MongoClient()
#db = client.curitiba
today = str(date.today())
date = {'data consulta': today}  # NOTE: rebinds 'date', shadowing the datetime import above


def verificaLastID():
    try:
        # Return the last ID saved to disk; fall back to 1 on a missing file.
        with open('lastIDCuritiba.txt', 'r') as opened:
            return opened.readline()
    except IOError:
        return 1

Example 10
import time

from TorCrawler import TorCrawler

c = TorCrawler()

while True:
    c.check_ip()   # look up the current exit IP
    c.rotate()     # request a new Tor circuit
    time.sleep(1)  # pause so the control port isn't hammered
Example 11
from TorCrawler import TorCrawler
import time
import os

params = dict(
  client_id='T5H1DGJAVQPKQNPHKSEYXVXRFQGZRMLCLBC5X0KXQKN2XHFZ',
  client_secret='4DEZN2E4HPDZAKYJG2T3ATH3LQKEEPAIPVPQ1WEIDT4BAK11',
  v='20180323',
  limit=1
)



crawler = TorCrawler()

#data = crawler.get("https://api.foursquare.com/v2/venues/502aa937e4b0be57fd4cac73?m=swarm", headers=params)
while True:
    # Make a GET request (returns a BeautifulSoup object unless use_bs=False).
    data = crawler.get("https://www.tudogostoso.com.br/receita/31593-pudim-de-leite-condensado.html")
    print(crawler.ip)
    # Restart the local Tor service to force a new exit IP (requires root).
    os.system("/etc/init.d/tor restart")
    time.sleep(1)


# TorCrawler will, by default, rotate every n_requests.
# If you want to manually rotate your IP, you can do that any time.
# crawler.rotate()
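
The closing comments describe TorCrawler's two rotation styles. A minimal sketch contrasting them, assuming only the constructor arguments and attributes already used in the examples above (n_requests, ip, rotate):

from TorCrawler import TorCrawler

# Automatic: with n_requests=5, a new circuit is requested every 5th GET.
crawler = TorCrawler(n_requests=5)
for _ in range(10):
    crawler.get("http://www.httpbin.org/ip")

# Manual: request a fresh circuit whenever you choose.
old_ip = crawler.ip
crawler.rotate()
print(old_ip, "->", crawler.ip)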