Example #1
0
 def __init__(self, ):
     """Initialize crawler state: counters, lock, CSV output file, work
     queue, HTTP request config, and the URL manager.

     NOTE(review): Python 2 only — `file()` was removed in Python 3.
     """
     self.count = 0  # number of community URLs discovered so far
     self.wcount = 0  # number of CSV rows written so far
     self.mylock = Lock()
     self.csvfile = file('sz.csv', 'a')  #ks.csv
     # UTF-8 BOM so spreadsheet tools detect the encoding correctly
     self.csvfile.write(codecs.BOM_UTF8)
     self.item_queue = Queue()
     self.headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
     }
     # NOTE(review): proxy dict is built but there is no visible use of it here.
     self.proxies = {
         'http': 'https://121.61.0.33:9999',
         "https": 'https://121.61.0.33:9999'
     }
     self.mysign = True
     #https://suzhou.anjuke.com/community
     self.rawurl = 'https://suzhou.anjuke.com/community/'
     self.urlmanager = UrlManager()
Example #2
0
def crawl(init_url):
    """Crawl pages starting from *init_url*, repeatedly downloading the
    current page, writing its parsed content, and following the next URL
    returned by the parser until it is falsy."""
    manager = UrlManager()
    page_downloader = Downloader()
    page_parser = HtmlParser()
    writer = Outputer()

    current_url = init_url
    while current_url:
        page = page_downloader.download(current_url)
        content, current_url = page_parser.parse(page)
        writer.write(content)
    writer.close()
Example #3
0
class SpiderMan():
    """Single-process crawler that wires together a URL manager, downloader,
    parser and data output, crawling at most 100 links."""

    def __init__(self):
        self.manager = UrlManager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl breadth-first from *root_url* until the frontier is empty
        or 100 URLs have been processed, then render the collected data."""
        self.manager.add_new_url(root_url)
        while self.manager.has_new_url() and self.manager.old_url_size() < 100:
            try:
                new_url = self.manager.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manager.add_new_urls(new_urls)
                self.output.store_data(data)
                print("已经抓取%s个链接" % self.manager.old_url_size())
            except Exception as e:
                # Fix: report the actual error instead of discarding `e`,
                # which made failures impossible to diagnose.
                print('crawl failed:', e)
        self.output.output_html()
Example #4
0
class Spiderman(object):
    """Single-process crawler that wires together a URL manager, parser,
    downloader and data output, crawling at most 100 links."""

    def __init__(self):
        self.manage = UrlManager()
        self.parser = HtmlParser()
        self.downloader = Htmldownloader()
        self.output = DataOutput()

    def crawl(self, root_url):
        """Crawl breadth-first from *root_url* until the frontier is empty
        or 100 URLs have been processed, then render the collected data."""
        self.manage.add_new_url(root_url)
        print(len(self.manage.new_urls))
        while self.manage.has_new_url() and self.manage.old_url_size() < 100:
            try:
                new_url = self.manage.get_new_url()
                html = self.downloader.download(new_url)
                new_urls, data = self.parser.parser(new_url, html)
                self.manage.add_new_urls(new_urls)
                self.output.store_data(data=data)
                print('已经抓取%s个链接' % self.manage.old_url_size())
            except Exception as e:
                # Fix: was a bare `except:`, which also swallowed
                # SystemExit/KeyboardInterrupt and hid the failure cause.
                print('crawl Failed:', e)
        self.output.output_html()
Example #5
0
 def __init__(self):
     """Set up the URL manager that tracks the crawl frontier."""
     self.urls = UrlManager()  # URL manager
Example #6
0
 def url_manage_proc(self, url_q, conn_q, root_url, page_num):
     """URL-manager process for a distributed crawler.

     Seeds the manager with *root_url*, dispatches new URLs to worker nodes
     via *url_q*, and folds URLs discovered by workers (read from *conn_q*)
     back into the manager.  Once more than *page_num* URLs have been
     crawled it sends the 'end' sentinel to workers and saves both URL sets
     to disk before returning.
     """
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     print('url_mannager is working...')
     while True:
         while url_manager.has_new_url():
             # Fetch the next URL from the URL manager
             new_url = url_manager.get_new_url()
             # Send it to a worker node
             url_q.put(new_url)
             # Stop once more than page_num links are crawled, saving progress
             if (url_manager.old_urls_size() > page_num):
                 # Tell the crawler nodes to finish
                 url_q.put('end')
                 print('控制节点发起结束通知!')
                 # Shut the node down and persist its state at the same time
                 url_manager.save_process('new_urls.txt',
                                          url_manager.new_urls)
                 url_manager.save_process('old_urls.txt',
                                          url_manager.old_urls)
                 return
             # URLs received from result_solve_proc are merged in below
             print('url control working..., solve result')
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
         except Exception as e:
             time.sleep(1)  # brief back-off on queue errors
         print('has crawl page num : ', url_manager.old_urls_size())
         time.sleep(5)
Example #7
0
 def __init__(self):
     """Wire up the crawler's collaborators: URL manager, parser,
     downloader and data output."""
     self.manage = UrlManager()
     self.parser = HtmlParser()
     self.downloader = Htmldownloader()
     self.output = DataOutput()
Example #8
0
    def url_manager_proc(self, url_q, conn_q, root_url):
        """Manager-node process: feed new URLs to worker nodes via *url_q*
        and absorb URLs discovered by workers from *conn_q*.

        After more than 2000 URLs have been crawled it sends the 'end'
        sentinel to workers and persists both URL sets before returning.
        """
        url_manager = UrlManager()
        # url_manager.add_new_url(root_url)
        while True:
            while url_manager.has_new_url():

                # Fetch the next URL from the URL manager
                new_url = url_manager.get_new_url()
                print(new_url)
                # Send it to a worker node
                url_q.put(new_url)
                print('old_url=', url_manager.old_url_size())

                # Stop once 2000 links have been crawled, saving progress
                if url_manager.old_url_size() > 2000:
                    # Tell the crawler nodes to finish
                    url_q.put('end')
                    print('控制节点发起结束通知!')

                    # Shut the manager down, persisting both URL sets
                    url_manager.save_progress('new_urls.txt',
                                              url_manager.new_urls)
                    url_manager.save_progress('old_urls.txt',
                                              url_manager.old_urls)
                    return
            # Merge URLs produced by result_solve_proc into the manager
            try:
                if not conn_q.empty():
                    urls = conn_q.get()
                    url_manager.add_new_urls(urls)
            except Exception:
                # Fix: narrowed from BaseException, which also swallowed
                # KeyboardInterrupt/SystemExit and made the process unkillable.
                time.sleep(0.1)  # brief back-off
Example #9
0
class Anjuke(object):
    """Python 2 crawler for Anjuke community listings (Suzhou/Kunshan).

    Collects community detail-page URLs (get_villages*), scrapes each page
    into a flat row (get_detail), and appends rows to a CSV file.
    """

    def __init__(self, ):
        """Initialize counters, lock, CSV output, queue, HTTP config and
        the URL manager."""
        self.count = 0  # number of community URLs discovered
        self.wcount = 0  # number of CSV rows written
        self.mylock = Lock()
        # NOTE(review): `file()` is a Python 2-only builtin.
        self.csvfile = file('sz.csv', 'a')  #ks.csv
        # UTF-8 BOM so spreadsheet tools detect the encoding
        self.csvfile.write(codecs.BOM_UTF8)
        self.item_queue = Queue()
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36'
        }
        # NOTE(review): proxies are configured but never passed to
        # requests.get anywhere in this class.
        self.proxies = {
            'http': 'https://121.61.0.33:9999',
            "https": 'https://121.61.0.33:9999'
        }
        self.mysign = True
        #https://suzhou.anjuke.com/community
        self.rawurl = 'https://suzhou.anjuke.com/community/'
        self.urlmanager = UrlManager()

    def get_villages(self):
        """Walk the district links on the community index page, harvest
        community URLs from each, then persist the URL manager's state."""
        rep = requests.get(self.rawurl,
                           headers=self.headers,
                           verify=False,
                           timeout=2)
        soup = BeautifulSoup(rep.text, 'lxml')
        results = soup.find_all('span', attrs={'class': 'elems-l'})
        items = results[0].find_all('a')
        for item in items:
            # skip the "all communities" aggregate link
            if item.get("title") == "全部小区":
                print '剔除(全部)这一选项'
                continue
            vurl = item.get('href')
            print vurl
            self.get_villages2(vurl)
            #self.get_villages3(vurl)
            print '======================================'
        self.urlmanager.save_urls_process_status(self.urlmanager.new_urls,
                                                 r'new_urls.txt')
        self.urlmanager.save_urls_process_status(self.urlmanager.crawled_urls,
                                                 r'crawled_urls.txt')

    def get_villages2(self, vurl):
        """Expand a district page into its sub-area links and harvest each
        via get_villages3."""
        rep = requests.get(vurl, headers=self.headers, verify=False, timeout=2)
        soup = BeautifulSoup(rep.text, 'lxml')
        results = soup.find_all('div', attrs={'class': 'sub-items'})
        items = results[0].find_all('a')
        for item in items:
            # skip the "all communities" aggregate link
            if item.get("title") == "全部小区":
                print '剔除(全部)这一选项2'
                continue
            url2 = item.get('href')
            print url2
            self.get_villages3(url2)

    def get_villages3(self, url):
        """Page through a listing, feeding each community's 'link' URL into
        the URL manager, until there is no next-page anchor."""
        while 1:
            rep = requests.get(url,
                               headers=self.headers,
                               verify=False,
                               timeout=2)
            soup = BeautifulSoup(rep.text, 'lxml')
            results = soup.find_all('div', attrs={'_soj': 'xqlb'})
            for result in results:
                item_url = result.get('link')
                self.count += 1
                print 'No.', self.count, ':', item_url
                self.urlmanager.add_new_url(item_url)
            # 'aNxt' is the next-page anchor; absence means the last page
            next_item = soup.find('a', attrs={'class': 'aNxt'})
            if next_item == None:
                break
            else:
                url = next_item.get('href')
        # time.sleep(1)
    def get_detail(self, c_url):
        """Scrape one community detail page into a flat list of values.

        Returns the row list, or None when the request failed or Anjuke
        redirected to its captcha page (in both cases the URL is re-queued;
        on captcha the manager state is also saved).
        """
        rlist = []
        # item ={}
        try:
            rep = requests.get(c_url,
                               headers=self.headers,
                               verify=False,
                               timeout=4)
        except:
            print 'current:urls num2:', self.urlmanager.new_urls_size()
            self.urlmanager.readd_new_url(c_url)
            return
        # Anjuke redirects to a captcha page when it rate-limits the client
        if rep.url.startswith('https://www.anjuke.com/captcha-verify/'):
            self.urlmanager.readd_new_url(c_url)
            self.urlmanager.save_urls_process_status(self.urlmanager.new_urls,
                                                     r'new_urls.txt')
            self.urlmanager.save_urls_process_status(
                self.urlmanager.crawled_urls, r'crawled_urls.txt')
            return
        print rep.url
        soup = BeautifulSoup(rep.text, 'lxml')
        name = soup.find('h1')
        addr = name.find('span')
        print name.contents[0].strip(), addr.string
        rlist.append(name.contents[0].strip().decode("utf-8"))
        rlist.append(addr.string.decode("utf-8"))
        village = re.search("(.*?)-.*?", addr.string).group(1)
        # Geocode: try the address with the city substituted for the village,
        # then the community name, then city + village as a last resort.
        try:
            y, x = geocodeG(addr.string.replace(village, '苏州'))  # Kunshan
        except:
            try:
                y, x = geocodeG(name.contents[0].strip().decode("utf-8"))
            except:
                y, x = geocodeG(u'苏州' + village)  # Kunshan
        rlist.append(y)
        rlist.append(x)
        # Average price is embedded in the page source as JSON
        price = re.search('.*comm_midprice":"(.*?)"', rep.text)
        if price == None:
            price = u'暂无报价'
        else:
            price = price.group(1)
        print 'price:', price
        rlist.append(price)
        # item['price'] = price.group(1)
        result = soup.find('dl', attrs={"class": 'basic-parms-mod'})
        for a in result.find_all('dd'):
            # print a.string.strip().decode('utf-8')
            rlist.append(a.string.strip().decode('utf-8'))
            print '----'
        # Rental/sale counts come from a separate AJAX endpoint keyed by id
        id = re.search('view/(\d+)', rep.url)
        rent_url = 'https://ks.anjuke.com/v3/ajax/communityext/?commid=' + str(
            id.group(1)) + '&useflg=onlyForAjax'
        print rent_url
        response = requests.get(rent_url,
                                headers=self.headers,
                                verify=False,
                                timeout=2)
        print response.text
        content = json.loads(response.text)
        print content.get('comm_propnum').get('rentNum'), content.get(
            'comm_propnum').get('saleNum')
        rlist.append(content.get('comm_propnum').get('rentNum'))
        rlist.append(content.get('comm_propnum').get('saleNum'))
        # self.item_queue.put(rlist)
        return rlist
        # item['property-type'] = value_list[0].string.strip().replace(":",'')
        # item['property-cost'] = value_list[1].string.strip().replace(":",'')
        # item['area'] = value_list[2].string.strip().replace(":",'')
        # item['households'] = value_list[3].string.strip().replace(":",'')
        # item['build-years'] = value_list[4].string.strip().replace(":",'')
        # item['parking-nums'] = value_list[5].string.strip().replace(":",'')
        # item['cap-rate'] = value_list[6].string.strip().replace(":",'')
        # item['greeening-rate'] = value_list[7].string.strip().replace(":",'')
        # item['developer'] = value_list[8].string.strip().replace(":",'')
        # item['property-management'] = value_list[9].string.strip().replace(":",'')
        # print j.string.strip().replace(":",'')
        # for (k,v) in  item.items():
        # print "dict[%s]=" % k,v
    def write_to_csv(self, item):
        """Append one row (*item*) to the shared CSV file handle."""
        csv_write = unicodecsv.writer(self.csvfile,
                                      encoding='utf-8-sig',
                                      dialect='excel')
        csv_write.writerow(item)

    def write_to_csv2(self):
        """Write one queued item to ks.csv under the lock (writerows, so
        *item* is treated as a sequence of rows)."""
        if not self.item_queue.empty():
            self.mylock.acquire(10)
            with open('ks.csv', 'a') as csvfile:
                item = self.item_queue.get()
                csv_write = unicodecsv.writer(csvfile,
                                              encoding='utf-8-sig',
                                              dialect='excel')
                csv_write.writerows(item)
            self.mylock.release()

    def write_to_csv3(self):
        """Drain the item queue, writing each item as one CSV row."""
        while not self.item_queue.empty():
            item = self.item_queue.get()
            csv_write = unicodecsv.writer(self.csvfile,
                                          encoding='utf-8-sig',
                                          dialect='excel')
            self.wcount += 1
            print 'write No.', self.wcount, 'url'
            csv_write.writerow(item)

    def start2(self):
        """Multiprocess variant: spawn a Process per detail URL.

        NOTE(review): looks broken as written — `Process(target=
        self.get_detail, args=(new_url, self.item_queue))` passes two args
        but get_detail takes one (TypeError in the child), and `item` in the
        final loop is never assigned here (NameError); confirm before use.
        """
        num = 0
        self.get_villages()
        print 'current:urls num1:', self.urlmanager.new_urls_size()
        while self.urlmanager.has_new_url():
            num += 1
            new_url = self.urlmanager.get_new_url()
            try:
                print 'get No.', num, 'url'
                url_process = Process(target=self.get_detail,
                                      args=(new_url, self.item_queue))
                url_process.start()
            except:
                with open("anjuke.log", 'w+') as f:
                    f.write('current:urls num2:')
                    f.write(str(self.urlmanager.new_urls_size()))
                self.urlmanager.readd_new_url(new_url)
                self.urlmanager.save_urls_process_status(
                    self.urlmanager.new_urls, r'new_urls.txt')
                self.urlmanager.save_urls_process_status(
                    self.urlmanager.crawled_urls, r'crawled_urls.txt')
        while not self.item_queue.empty():
            print 'write No.', num, 'url'
            self.write_to_csv(item)
            # write_process = Process(target=self.write_to_csv2)
            # write_process.start()
    def start3(self):
        """Thread-pool variant: process URLs in batches of up to 100 with 12
        threads, saving progress and flushing the queue after each batch.
        Stdout is redirected to anjuke.log for the duration.

        NOTE(review): the local `requests = threadpool.makeRequests(...)`
        shadows the `requests` module name inside this method; the log file
        `fo` is never closed.
        """
        # self.get_villages()
        fo = open("anjuke.log", 'w+')
        s = sys.stdout
        sys.stdout = fo
        print 'current:urls num1:', self.urlmanager.new_urls_size()
        while self.urlmanager.has_new_url():
            newlist = []
            flag = 100  # batch size: take up to 100 URLs per round
            while flag:
                if self.urlmanager.has_new_url():
                    new_url = self.urlmanager.get_new_url()
                    newlist.append(new_url)
                    flag -= 1
                else:
                    break
            pool = threadpool.ThreadPool(12)
            requests = threadpool.makeRequests(self.get_detail, newlist)
            [pool.putRequest(req) for req in requests]

            pool.wait()
            self.urlmanager.save_urls_process_status(self.urlmanager.new_urls,
                                                     r'new_urls.txt')
            self.urlmanager.save_urls_process_status(
                self.urlmanager.crawled_urls, r'crawled_urls.txt')
            self.write_to_csv3()
        sys.stdout = s

    def start(self):
        """Sequential crawl: drain the URL manager, scraping and writing one
        CSV row per URL; state is saved before each fetch and on failure.
        Stdout is redirected to anjuke.log while running."""
        fo = open("anjuke.log", 'w+')
        s = sys.stdout
        sys.stdout = fo
        num = 0
        # self.get_villages()
        print 'current:urls num1:', self.urlmanager.new_urls_size()
        while self.urlmanager.has_new_url():
            new_url = self.urlmanager.get_new_url()
            self.urlmanager.save_urls_process_status(self.urlmanager.new_urls,
                                                     r'new_urls.txt')
            self.urlmanager.save_urls_process_status(
                self.urlmanager.crawled_urls, r'crawled_urls.txt')
            try:
                item = self.get_detail(new_url)
                num += 1
                print 'write No.', num, 'url'
                self.write_to_csv(item)
            except:
                # failed URL goes back into the frontier; state is re-saved
                print 'current:urls num2:', self.urlmanager.new_urls_size()
                self.urlmanager.readd_new_url(new_url)
                self.urlmanager.save_urls_process_status(
                    self.urlmanager.new_urls, r'new_urls.txt')
                self.urlmanager.save_urls_process_status(
                    self.urlmanager.crawled_urls, r'crawled_urls.txt')
        sys.stdout = s
Example #10
0
 def url_manager_proc(self, url_q, conn_q, root_url):
     """Manager-node process: seed the manager with *root_url*, dispatch
     URLs to worker nodes via *url_q*, and absorb URLs discovered by
     workers from *conn_q*.

     After more than 2000 URLs have been crawled it sends the 'end'
     sentinel and persists both URL sets before returning.
     """
     url_manager = UrlManager()
     url_manager.add_new_url(root_url)
     while True:
         if url_manager.has_new_url():
             new_url = url_manager.get_new_url()
             url_q.put(new_url)
             print('old_url=', url_manager.old_url_size())
             if url_manager.old_url_size() > 2000:
                 # Tell the worker nodes to finish, then save progress
                 url_q.put('end')
                 print('Manager notify ending!')
                 url_manager.save_progress('new_urls.txt',
                                           url_manager.new_urls)
                 url_manager.save_progress('old_urls.txt',
                                           url_manager.old_urls)
                 return
         try:
             if not conn_q.empty():
                 urls = conn_q.get()
                 url_manager.add_new_urls(urls)
         except Exception:
             # Fix: narrowed from BaseException, which also swallowed
             # KeyboardInterrupt/SystemExit and made the process unkillable.
             time.sleep(0.1)