Example #1
# Imports assumed for this snippet (Python 2: urllib2, BeautifulSoup, fake_useragent):
import logging
import urllib2
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent as ua


def parse(url):
	logging.debug("Trying to parse the following source: %s", url)
	request = urllib2.Request(url)
	#request.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)')
	request.add_header('User-Agent', ua().random)
	try:
		http = bs(urllib2.urlopen(request), "lxml")  # grab a soup object and name it http
	except Exception:
		logging.error("Error parsing %s", url)
		return
	return http  # return the soup object
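Note that each call to parse() builds a fresh UserAgent via ua(), which re-initializes fake_useragent's browser data. A minimal sketch of reusing a single instance instead, with random_user_agent() as a hypothetical helper name:

from fake_useragent import UserAgent as ua

# Instantiate once at module level and reuse; .random still yields a
# different user-agent string on every access.
USER_AGENT_SOURCE = ua()

def random_user_agent():
    return USER_AGENT_SOURCE.random
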
Example #2
 def process_request(self, request, spider):
     # Called for each request that goes through the downloader
     # middleware.
     user_agent = ua().random
     request.headers['User-Agent'] = user_agent
     # Must either:
     # - return None: continue processing this request
     # - or return a Response object
     # - or return a Request object
     # - or raise IgnoreRequest: process_exception() methods of
     #   installed downloader middleware will be called
     return None
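A downloader middleware like this only runs once it is activated in the project settings. A minimal sketch, assuming a hypothetical module path myproject.middlewares.RandomUserAgentMiddleware:

# settings.py -- register the custom downloader middleware
# (the dotted path and priority below are placeholders, not from the original project)
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.RandomUserAgentMiddleware': 543,
}
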
Example #3
    def geturl1(self):
        print('5')
        ip = self.ipQueue.get()
        while True:
            try:
                # block=False makes this a non-blocking get, so the timeout argument has no effect
                url = self.urlQueue.get(block=False, timeout=10)
                res0 = requests.get(url,
                                    headers={"User-Agent": str(ua().random)},
                                    proxies=ip)
                res0.encoding = 'utf-8'
                html = res0.text
                q = re.compile(r'https://gz.lianjia.com/chengjiao/.*?[.]html')  # detail-page links
                r = '//div[@class="total fl"]/span/text()'  # XPath: total number of results

                parseHtml0 = etree.HTML(html)
                datab = parseHtml0.xpath(r)
                pn = int(datab[0])
                if pn == 0:
                    continue
                pageend = pn // 30 + 2  # 30 results per page; +2 so range(1, pageend) reaches the last partial page
                for k in range(1, pageend):
                    if k == 1:
                        pass
                    else:
                        urlbase1 = url + 'pg%s/' % k
                        res0 = requests.get(
                            urlbase1,
                            headers={"User-Agent": str(ua().random)},
                            proxies=ip)
                        res0.encoding = 'utf-8'
                        html = res0.text
                    dataa = q.findall(html)
                    dataa = set(dataa)
                    dataa = list(dataa)
                    for km in dataa:
                        self.urlQueue1.put(km)

            except Exception:
                # urlQueue is empty (queue.Empty) or a request/parse failed: stop this worker
                break
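The method refers to ipQueue, urlQueue, and urlQueue1 without showing where they come from. A minimal sketch of the surrounding class and threading setup this example seems to assume (queue names taken from the snippet, everything else hypothetical):

import queue
import threading

class LianjiaCrawler:
    def __init__(self):
        # Proxy dicts in requests' 'proxies' format, e.g. {'https': 'https://1.2.3.4:8080'}
        self.ipQueue = queue.Queue()
        self.urlQueue = queue.Queue()    # listing-page URLs to scan
        self.urlQueue1 = queue.Queue()   # detail-page URLs collected by geturl1()

    def geturl1(self):
        ...  # body as shown above

# Hypothetical usage: one worker thread per proxy in the pool
# crawler = LianjiaCrawler()
# threading.Thread(target=crawler.geturl1).start()
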
Example #4
 def process_request(self, request, spider):
     user_agent = ua().random
     request.headers['User-Agent'] = user_agent
     print(request.headers)
     # random.choice(seq) picks one element of an iterable/sequence at random
     pass
Example #5
# Scrapy settings for img_design_spider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'img_design_spider'

SPIDER_MODULES = ['img_design_spider.spiders']
NEWSPIDER_MODULE = 'img_design_spider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
from fake_useragent import UserAgent as ua  # import assumed here, as in Example #7
USER_AGENT = ua().random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
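The delay comments above point at Scrapy's AutoThrottle extension. A minimal sketch of the related settings (values illustrative, not taken from the original project):

# Adjust the download delay dynamically based on server response times
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5           # initial delay in seconds
AUTOTHROTTLE_MAX_DELAY = 60            # upper bound when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average parallel requests per remote server
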
Example #6
from glob import glob
from hashlib import md5
from fake_useragent import UserAgent as ua
from splinter import Browser
from splinter.exceptions import *
from selenium.common.exceptions import *
from random import randint
from time import sleep
import requests
import time
import os
import logging

user_agent = ua()
CACHE_FLDR = 'cache'
DATA_FLDR = 'data'
GEOCODE_URL = ''.join([
    'https://maps.googleapis.com/maps/api/geocode/json',
    '?address={:s}&key={:s}'
])
logging.basicConfig(format='%(asctime)s %(message)s',
                    datefmt='%d/%m/%Y %I:%M:%S %p',
                    level=logging.INFO)


def enable_detailed_logging():
    # These two lines enable debugging at httplib level
    # (requests->urllib3->http.client)
    # You will see the REQUEST, including HEADERS and DATA,
    # and RESPONSE with HEADERS but without DATA.
    # The only thing missing will be the response.body which is not logged.
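The body of enable_detailed_logging() is cut off here. A minimal sketch of how the comments above are usually implemented (switching on http.client debug output and verbose urllib3 logging for requests); this is an assumption, not the original body:

import http.client as http_client

def enable_detailed_logging():
    # Print the raw request (headers and data) and response headers
    http_client.HTTPConnection.debuglevel = 1

    # Also turn on verbose logging for the urllib3 layer used by requests
    logging.getLogger().setLevel(logging.DEBUG)
    requests_log = logging.getLogger("urllib3")
    requests_log.setLevel(logging.DEBUG)
    requests_log.propagate = True
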
Example #7
DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
from fake_useragent import UserAgent as ua
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  'Accept-Language': 'en',
  'User-Agent': ua().random
}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jc_goods.middlewares.JcGoodsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jc_goods.middlewares.JcGoodsDownloaderMiddleware': 543,
#}

# Enable or disable extensions