Example 1
 def get_api_url(api_url, api_key, target, **filters):
     '''Build the API URL to query a list of proxies'''
     api_url = urljoin(api_url, API_ENDPOINT)
     api_url = urljoin(api_url + '/', target)
     api_url = add_or_replace_parameter(api_url, 'api_key', api_key)
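     # attach any extra caller-supplied filters as query-string parameters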
     for f_key, f_val in filters.items():
         api_url = add_or_replace_parameter(api_url, f_key, f_val)
     return api_url
Example 2
    def __init__(self,
                 place=None,
                 start=1,
                 count=None,
                 expiry=None,
                 *args,
                 **kwargs):
        super(ZapSpider, self).__init__(*args, **kwargs)
        self.start = int(start) if start else 1
        self.count = int(count) if count else None
        self.expiry = self.parse_timedelta(expiry)

        self.crawl_count = 0
        self.scrape_count = 0
        self.total_crawl = 0
        self.total_scrape = 0

        # extract detail-page links from the listing, skipping /lancamento/ pages
        # and stripping query strings from the extracted URLs
        self.link_extractor = LinkExtractor(
            restrict_xpaths='//div[@id="list"]//a[contains(@class, "detalhes")]',
            deny='/lancamento/',
            unique=True,
            process_value=url_query_cleaner)

        self.start_urls = [
            urljoin('https://www.zapimoveis.com.br/venda/imoveis/',
                    (place or 'pe+recife') if place != 'all' else ''),
        ]

        self.lua_script = """
Example 3
 def start_requests(self):
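     # read previously scraped quotes (JSON Lines) and request each author's Goodreads page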
     with open("quotesdb.jl") as f:
         for line in f:
             quote = json.loads(line)
             yield scrapy.Request(
                 urljoin('http://www.goodreads.com',
                         quote.get('author', {}).get('goodreads_link')))
Example 4
        def deal_img(img_urls_raw):
            img_urls_end = []
            if not isinstance(img_urls_raw, list):
                img_urls_raw = [img_urls_raw]

            for img_url_raw in img_urls_raw:
                # keep URL-like values; urljoin returns absolute URLs unchanged
                if 'http' in img_url_raw or 'www' in img_url_raw:
                    img_url = urljoin('https://www.atc.org.au/', img_url_raw)
                    img_urls_end.append(img_url)
            return img_urls_end
Example 5
def get_index(contentqueue):

    headers={
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    urls = ['http://bts.gov.cn/xw/gjgn/']  # domestic news
    url2 = ['http://bts.gov.cn/xw/zdxw/']  # division affairs
    url3 = ['http://www.bts.gov.cn/zcms/']  # department updates
    url4 = ['http://bts.gov.cn/xw/zsjg/']  # directly affiliated units
    url5 = ['http://bts.gov.cn/xw/gjgn/']
    url6 = ['http://bts.gov.cn/xw/qt/']  # other
    url7 = ['http://www.bts.gov.cn/gk/tzgg/']  # notices and announcements
    url9 = ['http://www.bts.gov.cn/gk/rsxx/']  # personnel information
    url10 = ['http://www.bts.gov.cn/gk/ywgz/']  # business work
    url11 = ['http://www.bts.gov.cn/gk/wjzc/']  # documents and policies
    url12 = ['http://www.bts.gov.cn/gk/zcjd1/']  # policy interpretation
    url13 = ['http://www.bts.gov.cn/gk/tjxx/']  # statistics


    urls_all=urls+url2+url3+url4+url5+url6+url7+url9+url10+url11+url12+url13

    # crawl every section's listing page and push an index entry per article onto the queue
    for one_url in urls_all:
        response1 = get_response_and_text(url=one_url, headers=headers, charset='utf-8')
        response_in_function = response1['response_in_function']
        if not response_in_function:
            continue
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        try:
            for one_li in datasoup.select(
                    'body > div.mainBg > div.listMain.pageWidth.clearself > div.ListRightContent.clearself > div.docuContent.listul > ul > li'):
                # print one_li.text
                url_raw = one_li.select('a')[0].get('href')
                title = one_li.select('a')[0].text.strip()
                url_end = urljoin(basic_url, url_raw)
                id=url_end.split('/')[-1].split('.')[0]

                if 'bts.gov.cn' in url_end:
                    print url_end

                print title
                # print one_li.select('a')[1].text#publish_time//2017-04-04
                index_dict={
                    'title':title,
                    'url':url_end,
                    'id':id,
                    'spider_time':datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    'parent_id':id,
                }
                contentqueue.put(index_dict)
        except Exception as e:
            print e
            print one_url
Example 6
        def deal_img_urls(img_urls_raw):
            img_url_list = []
            for one_img_url in img_urls_raw:
                # skip PayPal badge images and tracking pixels
                if 'paypal_cn' in one_img_url or 'pixel.gif' in one_img_url:
                    continue
                # resolve relative URLs against the site root
                if 'http' not in one_img_url and 'www' not in one_img_url:
                    url_img = urljoin('http://www.tibetanyouthcongress.org/',
                                      one_img_url)
                    img_url_list.append(url_img)

            return img_url_list
Example 7
        def deal_img_urls(img_urls_raw):
            img_urls_dealed = []
            for one_url in img_urls_raw:
                if 'download-pdf' in one_url:
                    continue
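                # only relative URLs are resolved against the site root; absolute URLs are left out of the result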
                if 'http' not in one_url:
                    one_url_dealed = urljoin('http://www.savetibet.org/',
                                             one_url)
                    img_urls_dealed.append(one_url_dealed)

            return img_urls_dealed
Example 8
def get_content(data, comment_queue):

    Re_find_img = re.compile(r'src\=\"(.*?)\"')

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://www.altxw.com/news/content/'

    url = data['url']
    response1 = get_response_and_text(url=url,
                                      headers=headers,
                                      charset='utf-8')
    response_in_function = response1['response_in_function']
    if not response_in_function:
        return
    try:
        response_in_function_text = response1['response_in_function_text']
        datasoup = BeautifulSoup(response_in_function_text, 'lxml')
        # title=datasoup.select('body > div.body > div > div.main.l > div > div > h1')[0].text()
        source = datasoup.select(
            'body > div.body > div > div.main.l > div > div > div > li:nth-of-type(2)'
        )[0].text.split(u':')[1]
        content = ''
        for i in datasoup.select(
                'body > div.body > div > div.main.l > div > div > ul > p'):
            content += i.text
        content_div = datasoup.select('div > div.main.l > div > div > ul')[0]
        img_urls = Re_find_img.findall(str(content_div))
        img_urls2 = []
        for one_img_url in img_urls:
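            # note: str.strip('../') removes leading/trailing '.' and '/' characters before joining with the article base URL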
            one_img_url = urljoin(basic_url, one_img_url.strip('../'))
            img_urls2.append(one_img_url)
        print img_urls2
        data['source'] = source
        data['content'] = content
        data['img_urls'] = img_urls2

        comment_queue.put(data)
    except Exception as e:
        print e
Example 9
    def _execute(self):
        """
            Parse with the first successful option
        """
        if not self.items:
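            # try each item-extraction option and keep the first one that yields items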
            for option in self.template.items:
                output = self._extract(self.response, option)
                self.items.extend(output)
                if self.items:
                    break

        if not self.links:
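            # apply each link option's default callback/priority and join any
            # suffix (from the link itself or the option) onto the URL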
            for option in self.template.links:
                suffix = option.get('suffix')
                callback = option['callback']
                priority = option.get('priority', 0)
                for link in self._extract(self.response, option):
                    link = dict(**link)
                    link.setdefault('callback', callback)
                    link.setdefault('priority', priority)
                    suffix = link.pop('suffix', '') or suffix
                    if suffix:
                        link['url'] = urljoin(link['url'], suffix)
                    self.links.append(link)
Example 10
def get_content(data, result_queue):
    url = data['url']
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
        'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    basic_url = 'http://bts.gov.cn/'
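    # regexes to strip <script> blocks and to pull the publish timestamp, image src values, and the "来源:" (source) field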
    Re_sub_javascript2 = re.compile(r'<script[\S|\s]*?>[\s|\S]*?<\/script\>')
    Re_find_time = re.compile(r'(\d{4}\-\d{2}\-\d{2} \d{2}\:\d{2}\:\d{2})')
    Re_find_img = re.compile(r'src\=\"(.*?)\"')
    Re_find_source = re.compile(ur'来源:(.*?) ')

    try:
        response1 = get_response_and_text(url=url,
                                          headers=headers,
                                          charset='utf-8')
        response_in_function = response1['response_in_function']
        if not response_in_function:
            return
        response_in_function_text = response1['response_in_function_text']
        response_in_function_text_dealed = Re_sub_javascript2.sub(
            '', response_in_function_text)
        datasoup = BeautifulSoup(response_in_function_text_dealed, 'lxml')
        title = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h1'
        )[0].text
        content = ''
        for i in datasoup.select(
                'body > div > div.detailMain.pageWidth > div.pargraph  div.detailPar  p'
        ):
            content += i.text
        # print content
        source = Re_find_source.findall(response_in_function_text_dealed)
        if source:
            source = source[0]
        else:
            source = ''
        content_str = datasoup.select(
            'body > div.mainBg > div.detailMain.pageWidth > div.pargraph > div.detailPar'
        )[0]
        content_str2 = str(content_str)
        img_urls = Re_find_img.findall(content_str2)
        img_urls2 = []
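        # resolve each image src value against the site root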
        for one_img_url in img_urls:
            img_url_dealed = urljoin(basic_url, one_img_url)
            img_urls2.append(img_url_dealed)

        publish_div = datasoup.select(
            'body > div > div.detailMain.pageWidth > div.pargraph > h6'
        )[0].text
        publish_time = Re_find_time.findall(publish_div)[0]

        data['content'] = content
        data['publish_time'] = publish_time
        data['img_urls'] = img_urls2
        data['source'] = source
        data['publish_user'] = ''

        result_queue.put(data)
    except Exception as e:
        print e
Example 11
url13 = ['http://www.bts.gov.cn/gk/tjxx/']  # statistics

urls_all = urls + url2 + url3 + url4 + url5 + url6 + url7 + url9 + url10 + url11 + url12 + url13

basic_url = 'http://bts.gov.cn/'
for oneurl in url7:
    # fetch each announcements listing page; relative hrefs are resolved against basic_url below
    response = requests.get(url=oneurl, headers=headers)
    response.encoding = 'utf-8'
    datasoup = BeautifulSoup(response.text, 'lxml')
    # try:
    #     print datasoup.select('body > div.mainBg > div.detailMain.pageWidth > div.pargraph > h1')
    # except Exception as e:
    #     print oneurl
    try:
        for one_li in datasoup.select(
                'body > div.mainBg > div.listMain.pageWidth.clearself > div.ListRightContent.clearself > div.docuContent.listul > ul > li'
        ):
            # print one_li.text
            url_raw = one_li.select('a')[0].get('href')
            title = one_li.select('a')[0].text
            url_end = urljoin(basic_url, url_raw)

            if 'bts.gov.cn' in url_end:
                print url_end

            print title
            print one_li.select('a')[1].text
    except Exception as e:
        print e
        print oneurl
Example 12
response_text = response1.text
basic_url = 'http://bts.gov.cn/'

# strip <script> blocks before parsing so the selectors only see page markup
content1 = Re_sub_javascript2.sub('', response1.text)

datasoup = BeautifulSoup(content1, 'lxml')
print datasoup.select('body > div > div.detailMain.pageWidth > div.pargraph > h1')[0].text
content = ''
for i in datasoup.select('body > div > div.detailMain.pageWidth > div.pargraph  div.detailPar  p'):
    content += i.text

source = Re_find_cource.findall(response_text)
print source[0]

content_str = datasoup.select('body > div.mainBg > div.detailMain.pageWidth > div.pargraph > div.detailPar')[0]
content_str2 = str(content_str)
img_urls = Re_find_img.findall(content_str2)
print img_urls
img_urls2 = []
for one_img_url in img_urls:
    img_url_dealed = urljoin(basic_url, one_img_url)
    img_urls2.append(img_url_dealed)
print img_urls2
print content
publish_div = datasoup.select('body > div > div.detailMain.pageWidth > div.pargraph > h6')[0].text
print Re_find_time.findall(publish_div)[0]
Example 13
 def deal_img_urls(img_urls):
     img_result = []
     for one_img in img_urls:
         # use a new name so the img_urls argument is not overwritten inside the loop
         img_url_full = urljoin('http://www.tibetswiss.ch/', one_img)
         img_result.append(img_url_full)
     return img_result