def get_api_url(api_url, api_key, target, **filters):
    '''Build the API URL to query a list of proxies'''
    api_url = urljoin(api_url, API_ENDPOINT)
    api_url = urljoin(api_url + '/', target)
    api_url = add_or_replace_parameter(api_url, 'api_key', api_key)
    for f_key, f_val in filters.items():
        api_url = add_or_replace_parameter(api_url, f_key, f_val)
    return api_url
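# Hypothetical usage sketch for get_api_url: the base URL, key, target and
# filter below are invented; API_ENDPOINT is assumed to be a module-level path
# constant, urljoin comes from urllib.parse (urlparse under Python 2), and
# add_or_replace_parameter is assumed to be w3lib's helper of the same name.
from urllib.parse import urljoin
from w3lib.url import add_or_replace_parameter

API_ENDPOINT = 'proxies'
url = get_api_url('https://proxy.example.com/', 'my-key', 'list', country='br')
# url == 'https://proxy.example.com/proxies/list?api_key=my-key&country=br'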
def __init__(self, place=None, start=1, count=None, expiry=None, *args, **kwargs):
    super(ZapSpider, self).__init__(*args, **kwargs)
    self.start = int(start) if start else 1
    self.count = int(count) if count else None
    self.expiry = self.parse_timedelta(expiry)
    self.crawl_count = 0
    self.scrape_count = 0
    self.total_crawl = 0
    self.total_scrape = 0
    self.link_extractor = LinkExtractor(
        restrict_xpaths='//div[@id="list"]//a[contains(@class, "detalhes")]',
        deny='/lancamento/',
        unique=True,
        process_value=url_query_cleaner)
    # Note the precedence: this reads as (place or 'pe+recife') if place != 'all' else ''
    self.start_urls = [
        urljoin('https://www.zapimoveis.com.br/venda/imoveis/',
                place or 'pe+recife' if place != 'all' else ''),
    ]
    self.lua_script = """
def start_requests(self):
    with open("quotesdb.jl") as f:
        for line in f:
            quote = json.loads(line)
            yield scrapy.Request(
                urljoin('http://www.goodreads.com',
                        quote.get('author', {}).get('goodreads_link')))
def deal_img(img_urls_raw):
    img_urls_end = []
    # Accept either a single URL string or a list of URLs.
    if not isinstance(img_urls_raw, list):
        img_urls_raw = [img_urls_raw]
    for img_url_raw in img_urls_raw:
        if 'http' in img_url_raw or 'www' in img_url_raw:
            # Already an absolute URL; keep it unchanged.
            img_url = img_url_raw
        else:
            # Relative path; resolve it against the site root.
            img_url = urljoin('https://www.atc.org.au/', img_url_raw)
        img_urls_end.append(img_url)
    return img_urls_end
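# Hypothetical check of deal_img (the paths below are invented): relative
# paths are resolved against the atc.org.au root, while URLs that already
# contain 'http'/'www' pass through unchanged. deal_img itself relies on
# urljoin being imported (urllib.parse on Python 3, urlparse on Python 2).
from urllib.parse import urljoin

print(deal_img(['/images/banner.jpg', 'https://www.example.org/pic.png']))
# ['https://www.atc.org.au/images/banner.jpg', 'https://www.example.org/pic.png']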
def get_index(contentqueue):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    urls = ['http://bts.gov.cn/xw/gjgn/']  # national and international news
    url2 = ['http://bts.gov.cn/xw/zdxw/']  # division affairs
    url3 = ['http://www.bts.gov.cn/zcms/']  # department updates
    url4 = ['http://bts.gov.cn/xw/zsjg/']  # directly affiliated units
    url5 = ['http://bts.gov.cn/xw/gjgn/']
    url6 = ['http://bts.gov.cn/xw/qt/']  # other
    url7 = ['http://www.bts.gov.cn/gk/tzgg/']  # notices and announcements
    url9 = ['http://www.bts.gov.cn/gk/rsxx/']  # personnel information
    url10 = ['http://www.bts.gov.cn/gk/ywgz/']  # operational work
    url11 = ['http://www.bts.gov.cn/gk/wjzc/']  # documents and policies
    url12 = ['http://www.bts.gov.cn/gk/zcjd1/']  # policy interpretation
    url13 = ['http://www.bts.gov.cn/gk/tjxx/']  # statistics
    urls_all = urls + url2 + url3 + url4 + url5 + url6 + url7 + url9 + url10 + url11 + url12 + url13
    basic_url = 'http://bts.gov.cn/'  # base for relative links; not defined inside this function in the original
    for one_url in urls_all:
        response1 = get_response_and_text(url=one_url, headers=headers, charset='utf-8')
        response_in_function = response1['response_in_function']
        if not response_in_function:
            continue
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        try:
            for one_li in datasoup.select(
                    'body > div.mainBg > div.listMain.pageWidth.clearself > div.ListRightContent.clearself > div.docuContent.listul > ul > li'):
                # print one_li.text
                url_raw = one_li.select('a')[0].get('href')
                title = one_li.select('a')[0].text.strip()
                url_end = urljoin(basic_url, url_raw)
                id = url_end.split('/')[-1].split('.')[0]
                if 'bts.gov.cn' in url_end:
                    print url_end
                    print title
                    # print one_li.select('a')[1].text  # publish_time // 2017-04-04
                    index_dict = {
                        'title': title,
                        'url': url_end,
                        'id': id,
                        'spider_time': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                        'parent_id': id,
                    }
                    contentqueue.put(index_dict)
        except Exception as e:
            print e
            print one_url
def deal_img_urls(img_urls_raw):
    img_url_list = []
    for one_img_url in img_urls_raw:
        # Skip PayPal badges and tracking pixels.
        if 'paypal_cn' in one_img_url or 'pixel.gif' in one_img_url:
            continue
        # Resolve relative paths against the site root; absolute URLs are kept as-is.
        if 'http' not in one_img_url and 'www' not in one_img_url:
            one_img_url = urljoin('http://www.tibetanyouthcongress.org/', one_img_url)
        img_url_list.append(one_img_url)
    return img_url_list
def deal_img_urls(img_urls_raw):
    img_urls_dealed = []
    for one_url in img_urls_raw:
        # Skip links to PDF downloads.
        if 'download-pdf' in one_url:
            continue
        # Resolve relative paths against the site root; absolute URLs are kept as-is.
        if 'http' not in one_url:
            one_url = urljoin('http://www.savetibet.org/', one_url)
        img_urls_dealed.append(one_url)
    return img_urls_dealed
def get_content(data, comment_queue):
    Re_find_img = re.compile(r'src\=\"(.*?)\"')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://www.altxw.com/news/content/'
    url = data['url']
    response1 = get_response_and_text(url=url, headers=headers, charset='utf-8')
    response_in_function = response1['response_in_function']
    if not response_in_function:
        return
    try:
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        # title = datasoup.select('body > div.body > div > div.main.l > div > div > h1')[0].text()
        source = datasoup.select(
            'body > div.body > div > div.main.l > div > div > div > li:nth-of-type(2)'
        )[0].text.split(u':')[1]
        content = ''
        for i in datasoup.select(
                'body > div.body > div > div.main.l > div > div > ul > p'):
            content += i.text
        content_div = datasoup.select('div > div.main.l > div > div > ul')[0]
        img_urls = Re_find_img.findall(str(content_div))
        img_urls2 = []
        for one_img_url in img_urls:
            # strip('../') only removes leading/trailing '.' and '/' characters;
            # urljoin then resolves the remaining path against basic_url.
            one_img_url = urljoin(basic_url, one_img_url.strip('../'))
            img_urls2.append(one_img_url)
        print img_urls2
        data['source'] = source
        data['content'] = content
        data['img_urls'] = img_urls2
        comment_queue.put(data)
    except Exception as e:
        print e
def _execute(self):
    """ Parse with the first successful option """
    if not self.items:
        for option in self.template.items:
            output = self._extract(self.response, option)
            self.items.extend(output)
            if self.items:
                break
    if not self.links:
        for option in self.template.links:
            suffix = option.get('suffix')
            callback = option['callback']
            priority = option.get('priority', 0)
            for link in self._extract(self.response, option):
                link = dict(**link)
                link.setdefault('callback', callback)
                link.setdefault('priority', priority)
                suffix = link.pop('suffix', '') or suffix
                if suffix:
                    link['url'] = urljoin(link['url'], suffix)
                self.links.append(link)
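# A small illustration (hypothetical URLs, Python 3 urllib.parse assumed) of
# the suffix handling above: a per-link 'suffix' overrides the option-level
# one, and urljoin resolves it against the extracted link URL.
from urllib.parse import urljoin

option_suffix = 'comments.html'
link = {'url': 'http://example.com/category/item.html', 'suffix': 'reviews.html'}
suffix = link.pop('suffix', '') or option_suffix  # the per-link suffix wins
if suffix:
    link['url'] = urljoin(link['url'], suffix)
# link['url'] == 'http://example.com/category/reviews.html'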
def get_content(data, result_queue):
    url = data['url']
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://bts.gov.cn/'
    Re_sub_javascript2 = re.compile(r'<script[\S|\s]*?>[\s|\S]*?<\/script\>')
    Re_find_time = re.compile(r'(\d{4}\-\d{2}\-\d{2} \d{2}\:\d{2}\:\d{2})')
    Re_find_img = re.compile(r'src\=\"(.*?)\"')
    Re_find_cource = re.compile(ur'来源:(.*?) ')  # extracts the "Source:" field
    try:
        response1 = get_response_and_text(url=url, headers=headers, charset='utf-8')
        response_in_function = response1['response_in_function']
        if not response_in_function:
            return
        response_in_function_text = response1['response_in_function_text']
        # Strip <script> blocks before parsing.
        response_in_function_text_dealed = Re_sub_javascript2.sub(
            '', response_in_function_text)
        datasoup = BeautifulSoup(response_in_function_text_dealed, 'lxml')
        title = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h1'
        )[0].text
        content = ''
        for i in datasoup.select(
                'body > div > div.detailMain.pageWidth > div.pargraph div.detailPar p'
        ):
            content += i.text
        # print content
        source = Re_find_cource.findall(response_in_function_text_dealed)
        if source:
            source = source[0]
        else:
            source = ''
        content_str = datasoup.select(
            'body > div.mainBg > div.detailMain.pageWidth > div.pargraph > div.detailPar'
        )[0]
        content_str2 = str(content_str)
        img_urls = Re_find_img.findall(content_str2)
        img_urls2 = []
        for one_img_url in img_urls:
            # Resolve relative image paths against the site root.
            img_url_dealed = urljoin(basic_url, one_img_url)
            img_urls2.append(img_url_dealed)
        publish_div = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h6'
        )[0].text
        publish_time = Re_find_time.findall(publish_div)[0]
        data['content'] = content
        data['publish_time'] = publish_time
        data['img_urls'] = img_urls2
        data['source'] = source
        data['publish_user'] = ''
        result_queue.put(data)
    except Exception as e:
        print e
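# Hypothetical illustration of the image-URL handling used above: Re_find_img
# pulls src attributes out of raw HTML and urljoin resolves them against
# basic_url. The HTML string below is invented; the import is for Python 3
# (use `from urlparse import urljoin` under Python 2, as the original code is).
import re
from urllib.parse import urljoin

Re_find_img = re.compile(r'src\=\"(.*?)\"')
basic_url = 'http://bts.gov.cn/'
html = '<div><img src="/images/2017/photo1.jpg"><img src="http://cdn.example.com/a.png"></div>'
print([urljoin(basic_url, u) for u in Re_find_img.findall(html)])
# ['http://bts.gov.cn/images/2017/photo1.jpg', 'http://cdn.example.com/a.png']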
url13 = ['http://www.bts.gov.cn/gk/tjxx/']  # statistics
urls_all = urls + url2 + url3 + url4 + url5 + url6 + url7 + url9 + url10 + url11 + url12 + url13
basic_url = 'http://bts.gov.cn/'
for oneurl in url7:
    response = requests.get(url=oneurl, headers=headers)
    response.encoding = 'utf-8'
    datasoup = BeautifulSoup(response.text, 'lxml')
    # try:
    #     print datasoup.select('body > div.mainBg > div.detailMain.pageWidth > div.pargraph > h1')
    # except Exception as e:
    #     print oneurl
    try:
        for one_li in datasoup.select(
                'body > div.mainBg > div.listMain.pageWidth.clearself > div.ListRightContent.clearself > div.docuContent.listul > ul > li'
        ):
            # print one_li.text
            url_raw = one_li.select('a')[0].get('href')
            title = one_li.select('a')[0].text
            url_end = urljoin(basic_url, url_raw)
            if 'bts.gov.cn' in url_end:
                print url_end
                print title
                print one_li.select('a')[1].text
    except Exception as e:
        print e
        print oneurl
response_text = response1.text
basic_url = 'http://bts.gov.cn/'
content1 = Re_sub_javascript2.sub('', response1.text)
datasoup = BeautifulSoup(content1, 'lxml')
print datasoup.select('body > div > div.detailMain.pageWidth > div.pargraph > h1')[0].text
content = ''
for i in datasoup.select('body > div > div.detailMain.pageWidth > div.pargraph div.detailPar p'):
    content += i.text
source = Re_find_cource.findall(response_text)
print source[0]
content_str = datasoup.select('body > div.mainBg > div.detailMain.pageWidth > div.pargraph > div.detailPar')[0]
content_str2 = str(content_str)
img_urls = Re_find_img.findall(content_str2)
print img_urls
img_urls2 = []
for one_img_url in img_urls:
    img_url_dealed = urljoin(basic_url, one_img_url)
    img_urls2.append(img_url_dealed)
print img_urls2
print content
publish_div = datasoup.select('body > div > div.detailMain.pageWidth > div.pargraph > h6')[0].text
print Re_find_time.findall(publish_div)[0]
def deal_img_urls(img_urls):
    img_result = []
    for one_img in img_urls:
        # Use a separate name so the input list is not rebound inside the loop.
        img_url = urljoin('http://www.tibetswiss.ch/', one_img)
        img_result.append(img_url)
    return img_result