Code Example #1
File: sinanews.py  Project: hsstock/crawler
    def get_page(self, code, url):
        # Fetch the Sina news-list page for one stock code and collect article items
        # into self.itemArray.
        self.itemArray = []
        res = requests.get(url, timeout=10)
        res.encoding = "gbk"
        try:
            res.raise_for_status()
            if res.status_code == 200:
                contentSoup = bs4.BeautifulSoup(res.text, 'lxml')
                elems = contentSoup.select('#js_ggzx > li,.li_point > ul > li,.col02_22 > ul > li')
                for elem in elems:
                    json = {}  # note: this dict shadows the json module name inside the loop
                    json['code'] = code
                    ele = elem.select('span')
                    json['date'] = dateutil.format_date(ele[0].getText()[1:-1])
                    s = json['date']
                    ele = elem.select('a')
                    json['title'] = ele[-1].getText()
                    loger.info("date:{},title:{}".format(s, json['title']))
                    json['href'] = ele[-1].attrs['href']
                    ret, content = self.get_content(json['href'])
                    if ret != -1:
                        time.sleep(4 * random.random())

                    if ret == 0:
                        json['content'] = content
                        self.itemArray.append(json)
        except Exception as err:
            time.sleep(4 * random.random())
            loger.warning(err)
        finally:
            res.close()
Code Example #2
File: sinanews.py  Project: hsstock/crawler
    def get_content(self, url):
        # Fetch one article page; return (0, content) on success,
        # or (-1, '') if the URL was already crawled or extraction failed.
        content = ''
        ret = -1

        self.urlExist = self.mongodbutil.urlIsExist(url)
        if self.urlExist:
            loger.info('This url:{} has existed'.format(url))
            return ret, content

        header = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        res = requests.get(url, headers=header, timeout=10)
        res.encoding = "utf-8"
        try:
            res.raise_for_status()
            if res.status_code == 200:
                soup = bs4.BeautifulSoup(res.text, 'lxml')
                elems = soup.select('#artibody,.entry-content')
                if len(elems) > 0:
                    content = elems[0].getText()
                    ret = 0
            # Record the URL so it is not fetched again on later runs.
            self.mongodbutil.insertUrls({"url": url})
        except Exception as err:
            loger.warning(err)
        finally:
            res.close()
        return ret, content
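Both methods above delegate URL de-duplication and persistence to a mongodbutil object that the excerpts do not include. Below is a minimal sketch of what such a helper might look like, assuming pymongo and assuming a database named "crawler" with "urls" and "items" collections (the class name, database name, and collection names are guesses, not taken from the hsstock/crawler project):

from pymongo import MongoClient

class MongodbUtil:
    # Hypothetical helper exposing the three calls used above:
    # urlIsExist(), insertUrls() and insertItems().
    def __init__(self, uri="mongodb://localhost:27017", db_name="crawler"):
        self.client = MongoClient(uri)
        self.db = self.client[db_name]

    def urlIsExist(self, url):
        # True if this article URL was crawled before.
        return self.db.urls.find_one({"url": url}) is not None

    def insertUrls(self, doc):
        # Remember a crawled URL, e.g. {"url": url}.
        self.db.urls.insert_one(doc)

    def insertItems(self, items):
        # Bulk-insert the news items collected by get_page().
        self.db.items.insert_many(items)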
Code Example #3
File: main.py  Project: hsstock/crawler
def scheduled_history_job():
    loger.info('history_scheduled_job..')
    if not working_history:
        sched.remove_job(timerid_history)
        start_crawl_history()
    else:
        loger.info('pre-history-timer is working')
Code Example #4
File: fb.py  Project: nelson0423/cb105g3-crawler
def process_30birds():
    login()
    urls = [
        "https://www.facebook.com/微笑山丘-223226418074079/",
        "https://www.facebook.com/30birdz/"
    ]
    for url in urls:
        url_reviews = url + "reviews/"
        driver.get(url_reviews)
        time.sleep(1)
        _len = 0
        while True:
            driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )  # handle lazy loading (simulate mouse-wheel scrolling via JavaScript)
            time.sleep(3)  # 2 seconds is sometimes not enough, so wait 3
            reviews = driver.find_elements_by_css_selector(
                "div[class='_5pbx userContent _3576']")
            logger.info("Loaded {} reviews".format(len(reviews)))
            if _len == len(reviews):
                break
            _len = len(reviews)  # same count as last round means nothing more to load
        for review in reviews:
            logger.info("id: {}, comment: {}".format(
                review.get_attribute("id"),
                review.find_element_by_tag_name("p").text))
Code Example #5
    def google_search_extract_pixnet_blog(self, camp_list):
        datas = list()

        def search_filter(url_list):
            for u in url_list:
                if u.find("pixnet.net/blog/post/") != -1: yield u

        delays = [9, 10, 5]
        for idx in range(len(camp_list)):
            if idx % random.choice(delays) == 0:
                time.sleep(30)
            camp = camp_list[idx]
            camp_title = camp["camp_title"]
            camp_site = camp["camp_site"]
            logger.info("idx: {}, camp_site: {}, camp_title: {}".format(idx, camp_site, camp_title))
            collect_cnt = 1
            max_start = 30
            # search_result = self.google_search("\"露營\"+\"痞客邦\"+\"" + camp_site + "\"", search_filter, collect_cnt,
            #                                    max_start)
            search_result = self.google_search("露營+痞客邦+" + camp_site, search_filter, collect_cnt,
                                               max_start)
            logger.debug("search_result: {}".format(search_result))
            for url in search_result:
                content = self.extract_pixnet(url)["text_content"]
                data = dict()
                data["camp_site"] = camp_site
                data["camp_title"] = camp_title
                data["pixnet_url"] = url
                data["content"] = content  # .replace("\"", "")
                datas.append(data)
        return datas
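The search_filter passed to google_search() above is just a generator over candidate URLs. A tiny standalone illustration of the same filtering pattern (the URLs below are made up for the example):

def search_filter(url_list):
    # Keep only Pixnet blog-post URLs.
    for u in url_list:
        if u.find("pixnet.net/blog/post/") != -1:
            yield u

urls = [
    "https://example.pixnet.net/blog/post/123456",  # made-up URL, kept
    "https://www.example.com/some-other-page",      # made-up URL, filtered out
]
print(list(search_filter(urls)))  # -> ['https://example.pixnet.net/blog/post/123456']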
Code Example #6
File: main.py  Project: hsstock/crawler
def scheduled_job():
    loger.info('scheduled_job..')
    if not working:
        sched.remove_job(timerid)
        start_crawl()
    else:
        loger.info('pre-timer is working')
Code Example #7
def process(url):
    """
    Convert the crawled content into JSON
    """
    total = []
    response = requests.get(url)
    html = BeautifulSoup(response.text)
    menus = html.select_one("#home-menu").select("li > a")
    cnt_area = 0
    bk = False
    for menu in menus:
        cnt_area = cnt_area + 1
        cnt_campsite = 0
        murl = menu["href"]
        logger.info("Area: {} ----------------------------".format(menu.text))
        logger.debug("murl: {}".format(murl))
        response = requests.get(murl)
        html = BeautifulSoup(response.text)
        nav = html.select_one("div.nav-links")  # pagination nav area
        if nav is not None:
            last_page_num = int(
                nav.select_one("a.page-numbers:nth-last-of-type(2)")
                ["href"].split("/")[-1])  # the second-to-last link is the last page
            logger.info("{} pages in total".format(last_page_num))
            for num in range(last_page_num):
                pnum = str(num + 1)
                logger.info("{} - page {} ----------------------------".format(
                    menu.text, pnum))
                page_url = murl + "/page/" + pnum
                logger.debug("page_url: {}".format(page_url))
                response = requests.get(page_url)
                html = BeautifulSoup(response.text)
                campsites = html.select("h2.entry-title-list > a")
                for campsite in campsites:
                    cnt_campsite = cnt_campsite + 1
                    row = dict()
                    # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                    row["location"] = menu.text
                    campsite_url = campsite["href"]
                    process_content(campsite_url, row)
                    logger.info("row: {}".format(row))
                    total.append(row)
                    if False and cnt_area == 1 and cnt_campsite == 10:  # limit the crawl count (False disables the limit)
                        bk = True  # Python has no labeled break, hence this workaround
                    if bk:
                        break
                # <<< end of page campsite for loop
                if bk:
                    break
            # <<< end of location page for loop
        if bk:
            break
    # <<< end of location menu for loop
    logger.info("total count: {}".format(len(total)))
    return total
Code Example #8
    def extract_fb_comment(self, camp_list):
        datas = list()
        for camp in camp_list:
            web_site = camp["web_site"]
            fb_url = ""
            for web in web_site:
                for v in web.values():
                    if v.find("facebook.com") != -1:
                        fb_url = v
                    if "" != fb_url:
                        break
                if "" != fb_url:
                    break
            if "" != fb_url:
                data = dict()
                data["camp_site"] = camp["camp_site"]
                data["camp_title"] = camp["camp_title"]
                data["fb_url"] = fb_url
                datas.append(data)
        driver = self.init_fb()
        delays = [7, 3, 5, 2, 4]
        for data in datas:
            try:
                url = data["fb_url"]
                url_reviews = url + "reviews/"
                logger.debug("url_reviews: {}".format(url_reviews))
                driver.get(url_reviews)
                time.sleep(random.choice(delays))
                _len = 0
                while True:
                    driver.execute_script(
                        "window.scrollTo(0, document.body.scrollHeight);")  # handle lazy loading (simulate mouse-wheel scrolling via JavaScript)
                    time.sleep(3)
                    reviews = driver.find_elements_by_css_selector("div[class='_5pbx userContent _3576']")
                    logger.info("Loaded {} reviews".format(len(reviews)))
                    if _len == len(reviews):
                        break
                    _len = len(reviews)  # same count as last round means nothing more to load
                comments = list()
                for review in reviews:
                    # logger.info(
                    #     "id: {}, comment: {}".format(review.get_attribute("id"),
                    #                                  review.find_element_by_tag_name("p").text))
                    comment = review.find_element_by_tag_name("p").text
                    if comment and "" != comment.strip():
                        comments.append(comment.strip())
                data["comments"] = comments
            except Exception as e:
                logger.error("Error: {}".format(e))
        return datas
Code Example #9
File: main.py  Project: hsstock/crawler
def start_crawl():
    '''
    Retrieve current news from the Sina site.
    :return:
    '''
    global working
    working = True
    loger.info('start crawl current news...')
    for market in MARKET:
        data = read_file(market)
        for indexs in data.index:
            market = data.loc[indexs].values[0][0:2]
            code = data.loc[indexs].values[0][3:]
            url = generate_url(market, code)

            loger.info('Current Time:{}, code:{}, url:{}'.format(
                datetime.datetime.now(), code, url))

            try:
                sinanews.get_page(code, url)
                items = sinanews.get_item_array()
                if len(items) > 0:
                    mongodbutil.insertItems(items)
                    time.sleep(4 * random.random())
                    loger.info("store items to mongodb ...")
                else:
                    loger.info("all items exists")
            except Exception as err:
                time.sleep(4 * random.random())
                loger.warning(err)
    working = False
    sched.add_job(scheduled_job, 'interval', seconds=1, id=timerid)
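start_crawl() above (and start_crawl_history() in Code Example #13) reads the symbol list through read_file(), and start_crawl() additionally builds each news-list URL with generate_url(); neither helper appears in these excerpts. Judging from the slicing values[0][0:2] and values[0][3:], each row's first column holds a symbol such as "HK.00700". A minimal sketch under that assumption; the file naming and the URL pattern are placeholders, not the project's real ones:

import pandas as pd

def read_file(market):
    # Assumed layout: one CSV per market whose first column holds symbols like "HK.00700".
    return pd.read_csv("codes_{}.csv".format(market), header=None)

def generate_url(market, code):
    # Placeholder pattern only; the real project assembles the Sina finance
    # news-list URL for this symbol here.
    return "https://news.example.com/list?market={}&code={}".format(market, code)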
Code Example #10
def process(keyword, prefix, collect_cnt):
    driver = Chrome("../chromedriver")
    driver.set_window_rect(10, 10, 1027, 768)
    img_dir = path_config["crawler"] + "/images"
    if not os.path.exists(img_dir):
        os.makedirs(img_dir)
    keyword = quote(keyword)
    logger.debug("keyword: {}, collect_cnt: {}".format(keyword, collect_cnt))
    ret = list()
    url_pattern = "https://www.google.com/search?q={}&source=lnms&tbm=isch&sa=X&ved=0ahUKEwi33-bootHhAhVXyIsBHXN5CAMQ_AUIDigB&biw=1920&bih=979"
    url = url_pattern.format(keyword)
    driver.get(url)
    _len = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);"
                              )  # handle lazy loading (simulate mouse-wheel scrolling via JavaScript)
        time.sleep(3)  # 2 seconds is sometimes not enough, so wait 3
        hSRGPd = driver.find_elements_by_css_selector("a[jsname='hSRGPd']")
        logger.info("Loaded {} results".format(len(hSRGPd)))
        if _len == len(hSRGPd):
            break
        _len = len(hSRGPd)  # same count as last round means nothing more to load
    g_urls = []
    for d in hSRGPd:
        g_url = d.get_attribute("href")
        g_urls.append(g_url)
    delay = [1, 2, 3, 1.5, 2.3, 3.2]
    for i in range(len(g_urls)):
        try:
            g_url = g_urls[i]
            # print("g_url=", g_url)
            driver.get(g_url)
            time.sleep(random.choice(delay))
            img_url = driver.find_element_by_css_selector(
                "img[class='irc_mi'][src^='http']").get_attribute("src")
            print("img_url=", img_url)
            fpath = img_dir + "/" + prefix + format(
                i, "03d") + "." + img_url.split(".")[-1]
            urlretrieve(img_url, fpath)
            if i > collect_cnt:
                break
        except Exception as e:
            print("Error: {}".format(e))
    return ret
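A call such as the following would collect roughly collect_cnt images for a keyword into path_config["crawler"] + "/images"; the keyword and prefix are arbitrary examples, and note that as written the function returns an empty list, so it is used for its side effect of saving the image files:

process("camping tent", "tent_", 20)  # keyword, filename prefix, rough collection limit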
Code Example #11
def proces_pixnet_blog(camp_list):
    """
    依JSON找出營地的痞客邦部落格
    :param camp_list: 營地資訊JSON
    :return:
    """
    datas = list()
    import crawler.test.google_search as google_search
    import crawler.test.pixnet as pixnet

    def search_filter(url_list):
        for u in url_list:
            if u.find("pixnet.net/blog") != -1: yield u

    delays = [9, 10, 2, 5]
    for idx in range(len(camp_list)):
        if idx % random.choice(delays) == 0:
            time.sleep(30)
        camp = camp_list[idx]
        camp_title = camp["camp_title"]
        camp_site = camp["camp_site"]
        logger.info("idx: {}, camp_site: {}, camp_title: {}".format(
            idx, camp_site, camp_title))
        collect_cnt = 3
        max_start = 30
        search_result = google_search.process(
            "\"露營\"+\"pixnet\"+\"" + camp_site + "\"", search_filter,
            collect_cnt, max_start)
        logger.debug("search_result: {}".format(search_result))
        for url in search_result:
            content = pixnet.process(url)["text_content"]
            data = dict()
            data["camp_site"] = camp_site
            data["camp_title"] = camp_title
            data["pixnet_url"] = url
            data["content"] = content  # .replace("\"", "")
            datas.append(data)
    return datas
Code Example #12
def process_content(content_url, row):
    try:
        logger.debug("content_url: {}".format(content_url))
        response = requests.get(content_url)
        html = BeautifulSoup(response.text)
        logger.info("entry-title: {}".format(
            html.select_one("h1.entry-title").text))
        row["camp_title"] = html.select_one("h1.entry-title").text
        text0 = [
            t.select_one("a").text for t in html.select_one("#text0").select(
                "div[class^='t-camp-']"  # class starts with t-camp-
                + ":not([class$='-none'])"  # and does not end with -none
                + ":not([class='t-camp-area'])"  # and is not t-camp-area
            )
        ]
        row["features"] = text0
        text1 = [
            t.select("span[class^=t-]")
            for t in html.select_one("#text1").select("li")
        ]
        merge_text1(text1, row)
    except Exception as e:
        logger.error("Error: {}, content_url: {}".format(e, content_url))
Code Example #13
File: main.py  Project: hsstock/crawler
def start_crawl_history():
    '''
    Retrieve historical news from the Sina site.
    :return:
    '''
    global working_history
    working_history = True
    loger.info('start crawl history news...')
    for market in MARKET:
        data = read_file(market)
        for indexs in data.index:
            market = data.loc[indexs].values[0][0:2]
            code = data.loc[indexs].values[0][3:]
            sinanewshistory.clear_item_array()
            loger.info('Current Time:{}, code:{}, market:{},history'.format(
                datetime.datetime.now(), code, market))

            try:
                if market == 'HK':
                    sinanewshistory.get_hk_page(market, code)
                if market == 'US':
                    sinanewshistory.get_us_page(market, code)
                if market == 'SZ' or market == 'SH':
                    sinanewshistory.get_chn_page(market, code)

                items = sinanewshistory.get_item_array()
                if len(items) > 0:
                    mongodbutil.insertItems(items)
                    time.sleep(4 * random.random())
                    loger.info("store items to mongodb ...")
                else:
                    loger.info("all items exists")
            except Exception as err:
                time.sleep(4 * random.random())
                loger.warning(err)
    working_history = False
    sched.add_job(scheduled_history_job,
                  'interval',
                  days=1,
                  id=timerid_history)
Code Example #14
def process(url):
    response = requests.get(url)
    items = json.loads(response.text)["items"]
    logger.debug(items)
    limit_cnt = 3  # limit the number of items processed (applied only when > 0)
    for idx in range(len(items)):
        item = items[idx]
        logger.info("{}.".format(idx + 1), item)
        logger.info("no: {}, store_id: {}, store_name: {}".format(
            idx + 1, item["store_id"], item["store_name"]))
        logger.info("address: {}, area: {}, city: {}".format(
            item["address"], item["area"], item["city"]))
        url = "https://icamping-prod.appspot.com/_ah/api/icamping_guest/v2/query_store_by_store_id?store_id=" + item[
            "store_id"]
        response = requests.get(url)
        content = json.loads(response.text)["items"][0]
        logger.info("description: {}".format(content["description"]))
        external_links = json.loads(content["external_links"])
        for ex_link in external_links:
            logger.info("link_name: {}, link: {}".format(
                ex_link["link_name"], ex_link["link"]))
        photo = json.loads(content["photo"])
        for p in photo:
            logger.info("gcs_url: {}".format(p["gcs_url"]))
        if limit_cnt > 0 and idx == limit_cnt - 1:
            break
        logger.info(
            "------------------------------------------------------------")
Code Example #15
    def extract_rvcamp(self, limit_count):
        total = []
        response = requests.get(self.__config["url_rvcamp"])
        html = BeautifulSoup(response.text)
        menus = html.select_one("#home-menu").select("li > a")
        cnt_area = 0
        bk = False
        extract_count = 0
        for menu in menus:
            cnt_area = cnt_area + 1
            cnt_campsite = 0
            murl = menu["href"]
            logger.info("Area: {} ----------------------------".format(menu.text))
            logger.debug("murl: {}".format(murl))
            response = requests.get(murl)
            html = BeautifulSoup(response.text)
            nav = html.select_one("div.nav-links")  # pagination nav area
            if nav is not None:
                last_page_num = int(
                    nav.select_one("a.page-numbers:nth-last-of-type(2)")["href"].split("/")[-1])  # the second-to-last link is the last page
                logger.info("{} pages in total".format(last_page_num))
                for num in range(last_page_num):
                    pnum = str(num + 1)
                    logger.info("{} - page {} ----------------------------".format(menu.text, pnum))
                    page_url = murl + "/page/" + pnum
                    logger.debug("page_url: {}".format(page_url))
                    response = requests.get(page_url)
                    html = BeautifulSoup(response.text)
                    campsites = html.select("h2.entry-title-list > a")
                    for campsite in campsites:
                        extract_count += 1
                        cnt_campsite = cnt_campsite + 1
                        row = dict()
                        # row["_id"] = "campsite_" + format(cnt_area, "02d") + "_" + format(cnt_campsite, "04d")
                        row["location"] = menu.text
                        campsite_url = campsite["href"]
                        try:
                            logger.debug("content_url: {}".format(campsite_url))
                            response = requests.get(campsite_url)
                            html = BeautifulSoup(response.text)
                            logger.info("entry-title: {}".format(html.select_one("h1.entry-title").text))
                            row["camp_title"] = html.select_one("h1.entry-title").text
                            text0 = [t.select_one("a").text for t in
                                     html.select_one("#text0").select(
                                         "div[class^='t-camp-']"  # class starts with t-camp-
                                         + ":not([class$='-none'])"  # and does not end with -none
                                         + ":not([class='t-camp-area'])"  # and is not t-camp-area
                                     )
                                     ]
                            row["features"] = text0
                            text1 = [t.select("span[class^=t-]") for t in html.select_one("#text1").select("li")]
                            self.__merge_rvcamp_text1(text1, row)
                        except Exception as e:
                            logger.error("Error: {}, campsite_url: {}".format(e, campsite_url))
                        logger.info("row: {}".format(row))
                        total.append(row)
                        # if False and cnt_area == 1 and cnt_campsite == 10:  # limit the crawl count (False disables the limit)
                        if extract_count == limit_count:
                            bk = True  # Python has no labeled break, hence this workaround
                        if bk:
                            break
                    # <<< end of page campsite for loop
                    if bk:
                        break
                # <<< end of location page for loop
            if bk:
                break
        # <<< end of location menu for loop
        logger.info("total count: {}".format(len(total)))
        return total  # json array
Code Example #16
File: main.py  Project: hsstock/crawler
                items = sinanewshistory.get_item_array()
                if len(items) > 0:
                    mongodbutil.insertItems(items)
                    time.sleep(4 * random.random())
                    loger.info("store items to mongodb ...")
                else:
                    loger.info("all items exists")
            except Exception as err:
                time.sleep(4 * random.random())
                loger.warning(err)
    working_history = False
    sched.add_job(scheduled_history_job,
                  'interval',
                  days=1,
                  id=timerid_history)


loger.info('Starting time: {}'.format(datetime.datetime.now()))
sched.add_job(scheduled_job,
              'interval',
              max_instances=2,
              seconds=1,
              id=timerid)
sched.add_job(scheduled_history_job,
              'interval',
              max_instances=2,
              days=1,
              id=timerid_history)
sched.start()
loger.info('Ending time: {}'.format(datetime.datetime.now()))
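The scheduler snippets in Code Examples #3, #6, #9, #13 and #16 share module-level state (sched, timerid, timerid_history, working, working_history) that none of the excerpts define. A minimal sketch of those globals, assuming APScheduler's BlockingScheduler so that sched.start() keeps the process alive; the scheduler class and the job id strings are assumptions, not taken from the project:

import datetime
import random
import time

from apscheduler.schedulers.blocking import BlockingScheduler

sched = BlockingScheduler()
timerid = 'crawl_job'                  # assumed id for the current-news interval job
timerid_history = 'crawl_history_job'  # assumed id for the daily history job
working = False           # True while start_crawl() is running
working_history = False   # True while start_crawl_history() is running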