Code example #1
def run():
    tr = TorRequest(
        password='******')
    response = tr.get(site, headers=headers[0], verify=True)
    response = tr.get(site1, headers=headers[0], verify=True)
    print('[' + str(i) + ']' + ' Blog View Added With IP: ' + tr.get(
        'http://ipecho.net/plain').text)
    tr.reset_identity()
Code example #2
def tor_request(url):
    """
    Make a get request with Tor Proxy
    
    Args:
    - url (string) : url from which we want to fetch data  
    Return:
    response : Response from get request
    
    """
    
    tr = TorRequest(password="******")
    print(f"Scrapped {url} with Ip Address",tr.get("http://ipecho.net/plain").text)
    response = tr.get(url)
    tr.reset_identity()

    
    return response
Code example #3
def tor():
    request = TorRequest()
    response = request.get("http://httpbin.org/ip")
    ip_address = json.loads(response.content)["origin"]
    response = requests.get(f"http://ip-api.com/json/{ip_address}")
    data = json.loads(response.content)
    city = data["city"]
    country = data["country"]

    return f"{country} - {city}"
Code example #4
def get_last_page_num(stock):
    tr = TorRequest(proxy_port=9050, ctrl_port=9051, password=None)
    headers = {'User-Agent': random.choice(browsers)}
    target_url = "https://finance.naver.com/item/sise_day.nhn?code=%s&page=1" % stock.code
    r = tr.get(target_url, headers=headers)
    # page_re = re.compile(r'page=(\d+)')
    # s = BeautifulSoup(r.text, 'lxml')
    # rr = s.find('td', {"class": "pgRR"})
    # rr_href = rr.a['href']
    # m = page_re.search(rr_href)
    # return int(m[1])
    return 1
Code example #5
File: Agent.py Project: Satcomx00-x00/DataS-pwn
def tor_identity():
    from torrequest import TorRequest
    global ip
    print("Loading new Tor identity ...")
    tr = TorRequest()
    response = requests.get('http://ipecho.net/plain')
    print("My Original IP Address:", response.text)

    tr.reset_identity()  #Reset Tor identity
    response = tr.get('http://ipecho.net/plain')
    ip = response.text
    print("New Ip Address", response.text)
Code example #6
class Ecosia:
    """
    Ecosia Search (through TOR nodes).
    Using tor is the standard mode, but it needs some extra config in order to work
    on your machine. See these links for the config and possible errors
    (a minimal torrc sketch also follows this example):
    > https://stackoverflow.com/questions/49470261/tor-failing-to-run-with-failed-to-bind-one-of-the-listener-ports
    """
    def __init__(self, isTor=True):
        self.searchURL = "https://www.ecosia.org/search?q="
        # init generators
        self.rhGen = RequestHeaderGenerator()
        self.stGen = SearchTermGenerator()
        self.isTor = isTor
        self.searches = 0
        if self.isTor:
            # this password needs to be set in your .env file
            # simply create a new file and paste the password you set
            # while configuring tor. Make sure that the .env file is
            # in the same dir as the .config file
            self.tr = TorRequest(password=TOR_PASS)

    def _buildUrl(self):
        """
        build search url with the given search terms and make sure
        that they are correctly encoded
        """
        return self.searchURL + "+".join(
            list(map(urllib.parse.quote, self.stGen.getSearchTerm())))

    def search(self):
        """
        requests ecosia search results page
        has 2 modes: anonymous requests via tor or normal ones
        to avoid blocking, a new request header is generated for every normal request
        """
        url = self._buildUrl()
        if self.isTor:
            self.tr.reset_identity()
            response = self.tr.get(url)
        else:
            print("Tor Option disabled")
            response = requests.get(
                url, headers=self.rhGen.getRandomRequestHeader())

        if int(response.status_code) == 200:
            self.searches += 1

        print(
            f"Performed request to url: {url}, \nGot status code: {response.status_code}"
        )
        return response.status_code
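The Ecosia example above, like randomize_ip and check_tor further below, assumes Tor's ControlPort is enabled and protected with a HashedControlPassword; that is the "extra config" its docstring links describe. The following is a minimal sketch of that one-time setup, not part of any project listed here: it assumes the `tor` binary is installed and on PATH, and the password string is only a placeholder.

# Sketch: generate the HashedControlPassword line for torrc.
# Assumption: the `tor` binary is available; `tor --hash-password` prints the
# hashed form that belongs in torrc, while TorRequest(password=...) later takes
# the plain-text password.
import subprocess

def hash_tor_password(plain_password):
    out = subprocess.run(["tor", "--hash-password", plain_password],
                         capture_output=True, text=True, check=True)
    # the hash is the last line of output (earlier lines may be notices/warnings)
    return out.stdout.strip().splitlines()[-1]

if __name__ == "__main__":
    hashed = hash_tor_password("my-plain-password")  # placeholder password
    print("# add these lines to /etc/tor/torrc, then restart the tor service:")
    print("ControlPort 9051")
    print("HashedControlPassword " + hashed)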
Code example #7
def tor_reset():
    global tor
    global header
    try:
        tor = TorRequest(password='******')
        tor.reset_identity()
    except:
        tor.close()
        tor_1 = TorRequest(password='******')
        tor = tor_1
        tor.reset_identity()
    response = tor.get('http://ipecho.net/plain')
    print("Ip Address has changed: ", response.text)
Code example #8
File: check_tor.py Project: twesleyb/openRealestate
def check_tor(password):
    ''' Check if we are connected via tor.
    '''
    # Requirements
    import sys
    from torrequest import TorRequest
    # Add HashedControlPass.
    tr = TorRequest(password=password)
    # Check that we are connected via tor.
    url = 'https://check.torproject.org'
    response = tr.get(url)
    txt = response.text
    status = txt[txt.find('<title>') +
                 7:txt.find('</title>')].split('\n')[2].lstrip()
    print(status, file=sys.stderr)
Code example #9
def get_financial(tor: TorRequest, company: str) -> pd.DataFrame:
    """
    Get accountability information on company
    """

    financial_df = pd.DataFrame()
    # writer = pd.ExcelWriter('XLS/{}.xlsx'.format(company))

    for elements_financier in accountant:
        r = tor.get(url +
                    "{}/{}?p={}".format(company, elements_financier, company))
        if r.status_code != 200:
            print(r.status_code, ":", r.reason)
            time.sleep(10)
            financial_df = get_financial(tor, company)
            return financial_df

        soup = BeautifulSoup(r.text, "lxml")
        tables = soup.find_all('table')

        df = pd.DataFrame()
        raw = []

        for table in tables:
            tr = table.find_all('tr')
            for row in tr:
                td = row.find_all('td')

                # Catch if this is a title
                if len(td) == 1:
                    data = str(td[0].find(text=True))
                    raw.append(data)
                    df = df.append([raw])
                    raw = []
                    continue

                # Add a line with a temporary raw
                for element in td:
                    data = str(element.find(text=True))
                    raw.append(data)
                df = df.append([raw])
                del raw[:]
        df.set_index([0], inplace=True)
        # df.to_excel(writer, elements_financier)
        financial_df = pd.concat([financial_df, df])
    # writer.save()
    return financial_df
Code example #10
def assign_new_ip(text=False):
    """
    Reset the identity using TorRequest

    Parameters
    ----------
    arg1 [OPTIONAL]| text: bool
        A boolean flag to return the IP address tuple (old, morphed)

    Returns
    -------
    boolean
        True/False

    """

    try:
        # pass the hashed password
        req = TorRequest(
            password=
            '******')

        # return the ip address
        normal_identity = requests.get('http://ipecho.net/plain')

        # reset the identity using Tor
        req.reset_identity()

        # make a request now
        morphed_identity = req.get('http://ipecho.net/plain')

        # return the status depending on the flag
        if morphed_identity.text != normal_identity.text:
            if text == True:
                # return the ip address pairs as a tuple
                return (normal_identity.text, morphed_identity.text)
            else:
                return True
        else:
            # return just the status
            return False
    except:
        return False
Code example #11
def randomize_ip(password, quiet=False):
    ''' Randomize IP address with tor.
    Reset tor to randomize your IP address. Takes your tor hashed control
    password as an argument. Requires that you have set HashedControlPassword 
    variable in the tor configuration file.
    '''
    # Requirements
    import sys
    from torrequest import TorRequest
    # Add HashedControlPass.
    tr = TorRequest(password=password)
    # Reset Tor.
    tr.reset_identity()
    # Check new ip.
    response = tr.get('http://ipecho.net/plain')
    ip = response.text
    if not quiet:
        print("IP address is set to: {}".format(ip), file=sys.stderr)
    return (ip)
Code example #12
def tor_session(password):
    '''
    tor_session
    '''
    # Requirements
    import sys
    from torrequest import TorRequest
    # Add HashedControlPass.
    tr = TorRequest(password=password)
    session = tr.session
    url = 'https://check.torproject.org'
    response = tr.get(url)
    txt = response.text
    status = txt[txt.find('<title>') +
                 7:txt.find('</title>')].split('\n')[2].lstrip()
    print(status, file=sys.stderr)
    if status == "Sorry. You are not using Tor.":
        print("Continue only at your own risk.", file=sys.stderr)
    #EIF
    return (session)
Code example #13
def get_connection(links_site, torR):

    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    for i in range(15):
        try:
            resp = torR.get(links_site, headers=headers, timeout=5)
            return resp
        except requests.exceptions.Timeout:
            print("Reconnect" + str(i+1))
            with Controller.from_port(port = 9051) as controller:
                controller.authenticate(password='******')
                print("Success!")
                controller.signal(Signal.NEWNYM)
                print("New Tor connection processed")
                torR = TorRequest(password='')
                torR.reset_identity()  # Reset Tor
                response = torR.get('http://ipecho.net/plain')
                print("New Ip Address", response.text)
            pass

    return resp
Code example #14
def pageScan(link):

    li = []
    ua = UserAgent()
    headers = {
        # use a random User-Agent from fake_useragent instead of embedding
        # the literal text 'ua.random' in a hard-coded string
        'User-agent': ua.random
    }
    tr = TorRequest(1234)
    tr.reset_identity()
    url = tr.get(link, headers=headers).text

    soup = BeautifulSoup(url, 'lxml')
    try:
        links = soup.find('div', id='atfResults').find(
            'ul', id='s-results-list-atf').find_all('li',
                                                    class_='s-result-item')
        for l in links:
            li.append(l['data-asin'])
    except (AttributeError, TypeError):
        pass
    return li
Code example #15
def get_summary(tor: TorRequest, company: str) -> pd.DataFrame:
    """
    Get additional information on company
    """
    summary_info = pd.DataFrame()
    raw = []
    r = tor.get(url + "{}?p={}".format(company, company))
    if r.status_code != 200:
        print(r.status_code, ":", r.reason)

    soup = BeautifulSoup(r.text, "lxml")
    tables = soup.find_all('table')

    for table in tables:
        spans = table.find_all('span')
        for element in spans:
            raw.append(element.get_text())
            if len(raw) == 2:
                summary_info = summary_info.append([raw])
                del raw[:]

    summary_info.set_index(summary_info.iloc[:, 0], inplace=True)
    return summary_info
Code example #16
def make_request(url,
                 headers,
                 error_type,
                 social_network,
                 verbose=False,
                 tor=False,
                 unique_tor=False):
    r = TorRequest() if (tor or unique_tor) else requests
    try:
        rsp = r.get(url, headers=headers)
        if unique_tor:
            r.reset_identity()
        if rsp.status_code:
            return rsp, error_type
    except requests.exceptions.HTTPError as errh:
        print_error(errh, "HTTP Error:", social_network, verbose)
    except requests.exceptions.ConnectionError as errc:
        print_error(errc, "Error Connecting:", social_network, verbose)
    except requests.exceptions.Timeout as errt:
        print_error(errt, "Timeout Error:", social_network, verbose)
    except requests.exceptions.RequestException as err:
        print_error(err, "Unknown error:", social_network, verbose)
    return None, ""
Code example #17
File: crawl_gre.py Project: yisj/gre
from torrequest import TorRequest
import random
from bs4 import BeautifulSoup
from Word import Word
from constants import BLANK, browsers

tr = TorRequest(proxy_port=9050, ctrl_port=9051, password=None)

headers = {'User-Agent': random.choice(browsers)}

words = list()
with open('gre.csv') as f:
    for l in f:
        words.append(Word(l.strip()))

for w in words:
    print(w.word)
    re_ko = tr.get(
        "https://endic.naver.com/search.nhn?sLn=en&searchOption=all&query=%s" %
        w.word.replace(' ', BLANK),
        headers=headers)
    s_ko = BeautifulSoup(re_ko.text, 'lxml')
    dl = s_ko.findAll('dl', {'class': 'list_e2'})
    print(dl)

    break
Code example #18
File: thesaurus_syn.py Project: yisj/gre
        for data in words:
            writer.writerow(data)


failed = list()
for i, r in enumerate(words):
    syns = r['syn'].split(',')
    syns = [s.strip() for s in syns]
    print(i / len(words) * 100, '%')
    # print(syns)
    # break
    before = len(syns)

    try:
        result = tr.get(
            'https://en.wiktionary.org/wiki/Thesaurus:%s' %
            r['word'].replace(' ', '_'),
            headers=headers)  #% r['word'].replace(' ', '_'), headers=headers)
        s = BeautifulSoup(result.text, 'lxml')
        syn = s.find('span', id='Synonyms')
        ls = syn.findParent().fetchNextSiblings()[0].findAll('li')
        for l in ls:
            syns.append(l.text)
        syns = list(set(syns))

        after = len(syns)

        print(after - before, 'words added.')

        r['syn'] = ', '.join(syns)
        save_words(words)
    except:
        failed.append(r['word'])
    # break
Code example #19
File: crawl_gre_en.py Project: yisj/gre
print('fieldnames:', fieldnames)

words = list()
with open('gre2020.csv') as f:
    reader = csv.DictReader(f)
    for row in reader:
        words.append(row)

new_words = list()

for w in words:
    print(w['word'])
    try:
        re = tr.get(
            "https://endic.naver.com/search.nhn?sLn=en&query=%s&searchOption=all&isOnlyViewEE=Y"
            % w['word'].replace(' ', BLANK),
            headers=headers)
        s = BeautifulSoup(re.text, 'lxml')

        content_div = s.find('div', {'id': 'content'})

        dl_e2 = content_div.find('dl', {'class': 'list_e2'})
        dd = dl_e2.find('dd')
        k09 = dd.find('span', {'class': 'fnt_k09'})
        if k09 != None:
            pof = k09.text
        else:
            pof = ''

        # k05 = k09.find_next_sibling()
        k05 = dd.find('span', {'class': 'fnt_k05'})
Code example #20
File: workon_news1.py Project: rjsu26/ScheduleMe
lnght = len(ListOfData)
x = 0
delays = [random() for _ in range(10)] * 5

with open("news_2.py", "w") as fo:
    fo.writelines("BetterData = [\n")
    while True:
        if x >= lnght:
            break
        if x % 8 == 0:
            tr = TorRequest(password='******')
            tr.reset_identity()

        link = ListOfData[x][0]

        response = tr.get(ANOTHER_URL + link)
        soup = bs(response.content, "lxml")
        body = soup.body
        error_count = 0

        if error_count >= 5:
            print("Restarting from %d" % (x - 5))
            x = x - 5

        else:
            try:
                description = body.find("div", class_="Ap5OSd").get_text()
                error_count = 0
                ListOfData[x].extend(description)
                fo.writelines("%s,\n" % ListOfData[x])
            except ConnectionRefusedError as CE:
Code example #21
File: proxyrot.py Project: needmorecowbell/ProxyRot
class ProxyRot():
    """docstring for ProxyRot."""
    tr = None
    url= None
    password = ""
    proxies = []
    chromedriver="/usr/local/bin/chromedriver"

    def __init__(self, password= None, url= None):
        self.password= password

        if(password is not None):
            self.tr = TorRequest(password=self.password)
        else:
            self.tr = TorRequest()

        self.url = url

        self.options = Options()
        self.options.add_argument('--headless')
        self.options.add_argument('--disable-gpu')  # Last I checked this was necessary

    def get_ip(self):
        try:
            response= requests.get('http://ipinfo.io/ip')
        except Exception as e:
            print(str(e))
            return None

        return response.text

    def reset_tor_identity(self):
        self.tr.reset_identity()
        self.tr = TorRequest(password=self.password)

    def scrape_proxies_country(self):
        countries=["Russia","China", "India", "Ukraine",
        "Indonesia","Brazil", "Canada","Pakistan", "United Kingdom","Iran",
        "Thailand"]
        countries=["Russia","China"]
        url = "http://www.gatherproxy.com/proxylist/country/?c="
        proxies={}
   
        driver = webdriver.Chrome(self.chromedriver, options=self.options)

        try:
            for country in countries:
                driver.get(url+country)
                time.sleep(2)
                #print(driver.page_source)
                page_content = BeautifulSoup(''.join(driver.page_source), "html.parser")
                    
                proxy_table = page_content.find("div", attrs={"class": "proxy-list"})
                proxies_pre = proxy_table.findAll("tr")
                proxies[country]=[]
                
                for proxy in proxies_pre:
                    proxies[country].append(proxy.attrs)

            return proxies
        except Exception as e:
            print(e)
            return None

    def get_tor_ip(self):
        try:
            response = self.tr.get('http://ipinfo.io/ip')
        except Exception as e:
            print(str(e))
            return None
        return response.text

    def get(self, url):
        try:
            response = self.tr.get(url)
        except Exception as e:
            print(str(e))
            return None
        return response
Code example #22
File: test.py Project: twesleyb/openRealestate
# Add HashedControlPass.
password = get_pass("torpass")
tr=TorRequest(password=password)

# Reset Tor.
tr.reset_identity()

# Check initial ip.
session = requests.session()
response = session.get('http://ipecho.net/plain')
ip = response.text
print("IP address is set to: {}".format(ip))

# Check new ip with tor.
response = tr.get('http://ipecho.net/plain')
ip = response.text
print("IP address is set to: {}".format(ip))

# Check if tor is active.
response = tr.get('https://check.torproject.org')
response.text # Sorry, you are not using Tor.

#--------------------------------------------------------------------
## Test 3.
#--------------------------------------------------------------------
# We can also use torify on the command line (run in a shell, not in Python):
#
#   torify wget -O - 'https://check.torproject.org'

# Congratulations. This browser is configured to use Tor.
Code example #23
File: tor.py Project: H4medRostami/TorchangeIp
from stem import Signal
from stem.control import Controller
import requests
from torrequest import TorRequest

tr = TorRequest(password='******')
tr.reset_identity()  #Reset Tor
response = tr.get('http://ipecho.net/plain')
print("New Ip Address", response.text)
#------------------------------------------------------------
response = requests.get('http://ipecho.net/plain')
print("My Original IP Address:", response.text)
#------------------------------------------------------------

with Controller.from_port(port=9051) as controller:
    controller.authenticate(password='******')
    print("Success!")
    controller.signal(Signal.NEWNYM)
    print("New Tor connection processed")
response = requests.get('http://ipecho.net/plain')
print("IP Address after success s:", response.text)
Code example #24
File: pron.py Project: yisj/gre
    with open('gre2020-temp.csv', 'w') as f:
        writer = csv.DictWriter(f, fieldnames)
        writer.writeheader()
        for data in words:
            writer.writerow(data)


failed = list()
for i, r in enumerate(words):
    print(i / len(words) * 100, '%')
    if 'pron' in r and r['pron'] != '':
        print(r['pron'], 'is already')
    if 'pron' in r and r['pron'] == '':
        try:
            result = tr.get('https://www.lexico.com/en/definition/%s' %
                            r['word'].replace(' ', BLANK),
                            headers=headers)

            time.sleep(1)
            s = BeautifulSoup(result.text, 'lxml')
            pron = s.find('span', {'class': 'phoneticspelling'})
            print(pron.text)

            r['pron'] = pron.text

            save_words(words)
        except:
            failed.append(r['word'])
            with open('pron_failed2.csv', 'w') as f:
                f.write('\n'.join(failed))
Code example #25
ID = int(input("who's the lucky one? : "))
i = int(input("how many times mister?: "))
cookies = {
    'HoldTheDoor': 'f113024b10de77d8031c15bdcf2f830d67773813',
}

headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36 OPR/66.0.3515.27',
    'Referer': 'http://158.69.76.135/level4.php',
}

data = {
    'id': ID,
    'holdthedoor': 'Submit',
    'key': 'f113024b10de77d8031c15bdcf2f830d67773813'
}
x = 0
while x < i:
    tr = TorRequest(password='******')
    tr.reset_identity()  #Reset Tor
    response = tr.post('http://158.69.76.135/level4.php',
                       headers=headers,
                       cookies=cookies,
                       data=data,
                       verify=False)
    ip = tr.get('http://ipecho.net/plain')
    print("***********  Identity {} ^_^ ********* \n              {} ".format(
        x + 1, ip.text))
    x += 1
Code example #26
File: order.py Project: robiXxu/taxi-troll
ip_check_url = 'http://ipecho.net/plain'

original_ip = requests.get(ip_check_url).text
print("My original Ip: %s" % original_ip)

tr = TorRequest(password=os.environ.get('TOR_PASS'))

target_url = 'http://newtaxicsv.taxiromaniaonline.ro/aplicatie/make_request'

addresses = json.load(open('addresses1.json'))
userAgents = json.load(open('userAgents.json'))

while True:
	try:
		# tr.reset_identity()
		tor_ip = tr.get(ip_check_url).text
		print("Using ip: %s" % tor_ip)

		if tor_ip == original_ip:
			raise ValueError("%s == %s" % (tor_ip, original_ip))

		headers = {
			'User-agent': random.choice(userAgents)
		}
		print("Headers: %s" % headers)


		address = str(random.choice(addresses))
		nr = random.choice(range(1,6))

		randomPrefix = random.choice(["str", "strada", "str.", "Str.", "St"])
Code example #27
File: getlinks.py Project: KostyaShey/immoscrapy
                            url.get('href'))
        else:
            with open(
                    'logs/getlinks_log_{timestamp}.txt'.format(
                        timestamp=timestamp), 'a') as f:
                f.write("Skipping " + url.get('href') + "\n")

# Scraping Links from wg-gesucht.de

ua = UserAgent()
header = {'User-Agent': str(ua.random)}

tr = TorRequest(password='******')
tr.reset_identity()  # Reset Tor
source = tr.get(
    'https://www.wg-gesucht.de/wohnungen-in-Hamburg.55.2.1.0.html?category=2&city_id=55&rent_type=2&noDeact=1&img=1',
    headers=header).text
soup = bs.BeautifulSoup(source, 'lxml')

# Scraping the number of pages
for option in soup.find_all("a", {"class": "a-pagination"}):
    pages = option.get_text()
pages = int(pages)

#scraping links from the pages

for i in range(0, pages + 1):

    with open('logs/getlinks_log_{timestamp}.txt'.format(timestamp=timestamp),
              'a') as f:
        f.write("Scraping Page {page} from wg-gesucht.de\n".format(page=i))
Code example #28
File: services.py Project: RoyceLeonD/5_star
def ParseReviews(asin):
    # This script has only been tested with Amazon.com and only works with amazon.com, because it relies on product-page details gathered from amazon.com
    amazon_url = 'http://www.amazon.com/dp/' + asin

    # Add some recent user agent to prevent amazon from blocking the request
    # Find some chrome user agent strings  here https://udger.com/resources/ua-list/browser-detail?browser=Chrome
    def randomizer():
        header_index = random.randint(1, 37)
        headers_user_agents = {}
        with open("user_agents.json", "r") as outfile:
            headers_user_agents = json.load(outfile)
            print(headers_user_agents)

        headers = {'User-Agent': headers_user_agents["headers"][header_index]}
        print(headers)
        #headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
        # Changed the following range(5) to range(20)
        proxy_number = random.randint(0, 14)
        print(proxy_number)
        proxies_list = [
            "105.185.176.102", "223.25.97.62", "110.44.122.198",
            "5.148.128.44", "177.21.103.63", "196.61.16.247", "109.224.57.14",
            "110.49.11.50", "181.10.129.85", "91.137.140.89", "103.9.134.241",
            "91.147.180.1", "213.57.125.158", "117.239.30.251"
        ]

        proxy = proxies_list[proxy_number]

        return headers, proxy

    for i in range(5):
        headers, proxy_lnk = randomizer()
        print(proxy_lnk)
        proxy = {'http': proxy_lnk}
        #response = get(amazon_url, headers = headers, verify=False, timeout=30, proxies=proxy)
        tr = TorRequest(password="******")
        tr.reset_identity()
        response = tr.get(amazon_url)
        if response.status_code == 404:
            return {"url": amazon_url, "error": "page not found"}
        if response.status_code != 200:
            continue

        # Removing the null bytes from the response.
        cleaned_response = response.text.replace('\x00', '')

        parser = html.fromstring(cleaned_response)
        print(parser)
        XPATH_AGGREGATE = '//span[@id="acrCustomerReviewText"]'
        XPATH_REVIEW_SECTION_1 = '//div[contains(@id,"reviews-summary")]'
        XPATH_REVIEW_SECTION_2 = '//div[@data-hook="review"]'
        XPATH_AGGREGATE_RATING = '//table[@id="histogramTable"]//tr'
        XPATH_PRODUCT_NAME = '//h1//span[@id="productTitle"]//text()'
        XPATH_PRODUCT_PRICE = '//span[@id="priceblock_ourprice"]/text()'

        raw_product_price = parser.xpath(XPATH_PRODUCT_PRICE)
        raw_product_name = parser.xpath(XPATH_PRODUCT_NAME)
        total_ratings = parser.xpath(XPATH_AGGREGATE_RATING)
        reviews = parser.xpath(XPATH_REVIEW_SECTION_1)

        product_price = ''.join(raw_product_price).replace(',', '')
        product_name = ''.join(raw_product_name).strip()

        if not reviews:
            reviews = parser.xpath(XPATH_REVIEW_SECTION_2)
        ratings_dict = {}
        reviews_list = []

        # Grabbing the rating section in the product page
        for ratings in total_ratings:
            extracted_rating = ratings.xpath('./td//a//text()')
            if extracted_rating:
                rating_key = extracted_rating[0]
                raw_raing_value = extracted_rating[1]
                rating_value = raw_raing_value
                if rating_key:
                    ratings_dict.update({rating_key: rating_value})

        # Parsing individual reviews
        for review in reviews:
            XPATH_RATING = './/i[@data-hook="review-star-rating"]//text()'
            XPATH_REVIEW_HEADER = './/a[@data-hook="review-title"]//text()'
            XPATH_REVIEW_POSTED_DATE = './/span[@data-hook="review-date"]//text()'
            XPATH_REVIEW_TEXT_1 = './/div[@data-hook="review-collapsed"]//text()'
            XPATH_REVIEW_TEXT_2 = './/div//span[@data-action="columnbalancing-showfullreview"]/@data-columnbalancing-showfullreview'
            XPATH_REVIEW_COMMENTS = './/span[@data-hook="review-comment"]//text()'
            XPATH_AUTHOR = './/span[contains(@class,"profile-name")]//text()'
            XPATH_REVIEW_TEXT_3 = './/div[contains(@id,"dpReviews")]/div/text()'

            raw_review_author = review.xpath(XPATH_AUTHOR)
            raw_review_rating = review.xpath(XPATH_RATING)
            raw_review_header = review.xpath(XPATH_REVIEW_HEADER)
            raw_review_posted_date = review.xpath(XPATH_REVIEW_POSTED_DATE)
            raw_review_text1 = review.xpath(XPATH_REVIEW_TEXT_1)
            raw_review_text2 = review.xpath(XPATH_REVIEW_TEXT_2)
            raw_review_text3 = review.xpath(XPATH_REVIEW_TEXT_3)

            # Cleaning data
            author = ' '.join(' '.join(raw_review_author).split())
            review_rating = ''.join(raw_review_rating).replace(
                'out of 5 stars', '')
            review_header = ' '.join(' '.join(raw_review_header).split())

            try:
                review_posted_date = dateparser.parse(
                    ''.join(raw_review_posted_date)).strftime('%d %b %Y')
            except:
                review_posted_date = None
            review_text = ' '.join(' '.join(raw_review_text1).split())

            # Grabbing hidden comments if present
            if raw_review_text2:
                json_loaded_review_data = loads(raw_review_text2[0])
                json_loaded_review_data_text = json_loaded_review_data['rest']
                cleaned_json_loaded_review_data_text = re.sub(
                    '<.*?>', '', json_loaded_review_data_text)
                full_review_text = review_text + cleaned_json_loaded_review_data_text
            else:
                full_review_text = review_text
            if not raw_review_text1:
                full_review_text = ' '.join(' '.join(raw_review_text3).split())

            raw_review_comments = review.xpath(XPATH_REVIEW_COMMENTS)
            review_comments = ''.join(raw_review_comments)
            review_comments = sub('[A-Za-z]', '', review_comments).strip()
            review_dict = {
                'review_comment_count': review_comments,
                'review_text': full_review_text,
                'review_posted_date': review_posted_date,
                'review_header': review_header,
                'review_rating': review_rating,
                'review_author': author
            }
            reviews_list.append(review_dict)

        data = {
            'ratings': ratings_dict,
            'reviews': reviews_list,
            'url': amazon_url,
            'name': product_name,
            'price': product_price
        }
        return data

    return {"error": "failed to process the page", "url": amazon_url}
Code example #29
                    timestamp=timestamp), 'a') as f:
            f.write("Done scraping " + link + "\n")

# scraping flat data on wg-gesucht.de

    if "wg-gesucht.de" in link:

        flat_info = []

        ua = UserAgent()
        header = {'User-Agent': str(ua.random)}

        # using tor to scrape anonymously
        tr = TorRequest(password='******')
        tr.reset_identity()  # Reset Tor
        source = tr.get(link, headers=header).text
        soup = bs.BeautifulSoup(source, 'lxml')

        #check if the flat is deactivated
        try:
            if "Diese Anzeige ist momentan deaktiviert" in soup.find_all(
                    "h4",
                {"class": "headline alert-primary-headline"})[2].get_text():
                DEACTIVATED_FLATS += 1
                with open(
                        'logs/immoscrapy_log_{timestamp}.txt'.format(
                            timestamp=timestamp), 'a') as f:
                    f.write("The flat is deactivated: " + link + "\n")
                continue
        except:
            pass
Code example #30
File: 3-download_sub.py Project: lfkopp/opensubtitle
def reset_ip():
    tr = TorRequest(password='******')
    tr.reset_identity()  #Reset Tor
    response = tr.get('http://ipecho.net/plain')
    print("New Ip Address", response.text)