Example #1
 def crawler(self):
     hasUrl = True
     while hasUrl:
         target = self.getTarget()
         # Check whether all targets have been fetched
         if target != "":
             # Pre-login handling
             self.crawler_run(target)
         else:
             hasUrl = False
     baseUtil.log("Crawler is over!!!!!!!!!!!!!")
Example #2
    def onStartBtnClick(self):

        photoId = self.picIdEdit.text().strip()
        outputPath = BaseUtil.ensureXlSuffix(self.outputDesEdit.text().strip())
        if photoId == "" or outputPath == "":
            self.mainWindow.popUpMessageBox(False, "Photo ID or output path must not be empty")
            return
        self.loading.show()
        self.setEnabled(False)
        self.ctrller.setPhotoId(photoId)
        self.ctrller.setOutputPath(outputPath)
        self.scrape_thread.start()
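
A note on the snippet above: it starts a self.scrape_thread that is not defined anywhere in these examples. Assuming the GUI is built on PyQt5, one plausible shape for that thread is a QThread wrapping the controller's start2Scrape from Example #3; the class and signal names below are illustrative, not the project's actual code.

from PyQt5.QtCore import QThread, pyqtSignal

class ScrapeThread(QThread):
    # Emitted when the scrape finishes so the window can re-enable its widgets.
    finished_with_status = pyqtSignal(bool, str)

    def __init__(self, ctrller, parent=None):
        super(ScrapeThread, self).__init__(parent)
        self.ctrller = ctrller

    def run(self):
        try:
            self.ctrller.start2Scrape()  # controller method shown in Example #3
            self.finished_with_status.emit(True, "done")
        except Exception as exc:
            self.finished_with_status.emit(False, str(exc))
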
Example #3
    def start2Scrape(self):
        driver = webdriver.PhantomJS(executable_path=ScrapeUtil.phantomjsUrl)
        driver.get(ScrapeUtil.getPhotoPageUrl(self.photoId))
        try:
            element = WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.ID, "content")))
        except Exception:
            print("Oops!", sys.exc_info()[0], "occurred.")
        finally:
            pageSource = driver.page_source
            driver.close()
            bsObj = ScrapeUtil.getBSObjByHTML(pageSource)
            totalLikedNum = int(bsObj.find(class_="v2_new_fav with_count").find("span", {"class":"value"}).get_text())
            photoName = bsObj.find(id="content").h2.get_text()
            users = []
            if totalLikedNum > 0:
                numOfPages = math.ceil(totalLikedNum / self.sizeOfOnePage)
                print(totalLikedNum)
                print(numOfPages)
                self.addUsersPages(users, self.photoId, numOfPages, self.sizeOfOnePage)
                print("length is", len(users))

        """
        with open('./test.csv', 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['PhotoName', 'Num', 'NickNames', 'Links'])
            if totalLikedNum == 0:
                writer.writerow([photoName, totalLikedNum])
            else:
                for i in range(len(users)):
                    if i == 0:
                        writer.writerow([photoName, totalLikedNum, users[i].nickName, ScrapeUtil.getUserPageUrlTpl(users[i].userId)])
                    if i > 0:
                        writer.writerow(['', '', users[i].nickName, ScrapeUtil.getUserPageUrlTpl(users[i].userId)])
        """
        path = self.outputPath
        if os.path.exists(path):
            os.remove(path)
        mywb = openpyxl.Workbook()
        mysheet = mywb['Sheet']
        if totalLikedNum == 0:
            BaseUtil.writerow2xl(mysheet, 1, [photoName, totalLikedNum])
        else:
            for i in range(len(users)):
                j = i + 1
                if j == 1:
                    BaseUtil.writerow2xl(mysheet, j, [photoName, totalLikedNum, users[i].nickName, ScrapeUtil.getUserPageUrlTpl(users[i].userId)])                
                if j > 1:
                    BaseUtil.writerow2xl(mysheet, j, ['', '', users[i].nickName, ScrapeUtil.getUserPageUrlTpl(users[i].userId)])
                mysheet.cell(row=j, column=3).hyperlink = ScrapeUtil.getUserPageUrlTpl(users[i].userId)
        mywb.save(path)
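
Example #3 relies on a BaseUtil.writerow2xl helper that is not reproduced in these snippets. A minimal sketch of what such a helper might look like with openpyxl is shown below; the real signature in BaseUtil may differ.

import openpyxl

def writerow2xl(sheet, row_index, values):
    # Write each value into consecutive columns of the given 1-based row.
    for col_index, value in enumerate(values, start=1):
        sheet.cell(row=row_index, column=col_index, value=value)

# Usage mirroring Example #3:
wb = openpyxl.Workbook()
writerow2xl(wb['Sheet'], 1, ['PhotoName', 'Num', 'NickNames', 'Links'])
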
Example #4
def sleep(seconds):
	BaseUtil.msg("sleep %d seconds" % seconds)
	DatetimeObj.sleep(seconds)
Example #5
    def analysisPageInfo(self, target, url):
        info_rule = target.get("data_rule").get("info_rule")
        baseUtil.log("Load url")
        try:
            baseUtil.driver.get(url)
        except Exception:
            baseUtil.log("Load Time out,will return!")
            return None
        baseUtil.log("Loaded url")
        time.sleep(6)
        # Comment collection - data loading START
        comment_rule = target.get("comment_rule")
        if int(comment_rule.get("need_comment")) == 1:
            if int(comment_rule.get("comment_page_type")) == 2:
                for i in xrange(int(comment_rule.get("max_comment_page"))):
                    # If a 'load more' button exists, click it; otherwise scroll the page
                    try:
                        if comment_rule.get('comment_page_key') and len(
                                baseUtil.driver.find_elements_by_css_selector(
                                    comment_rule.get(
                                        "comment_page_key"))) == 0:
                            js = "var p = document.body.scrollTop=100000"
                            baseUtil.driver.execute_script(js)
                            baseUtil.log("Auth Comment scrollTop")
                        else:
                            baseUtil.driver.find_element_by_css_selector(
                                comment_rule.get("comment_page_key")).click()
                            baseUtil.log("Click Comment scrollTop")
                    except:
                        baseUtil.log("No found Click target for " +
                                     comment_rule.get('comment_page_key'))
                    time.sleep(2)
        else:
            baseUtil.log("Skip Commont!")
        # 评论采集-数据加载  END
        html = baseUtil.driver.page_source
        doc = pq(html)
        save_data = {}
        save_data['href'] = url
        save_data['crawler_time'] = int(time.time())
        save_data['site_name'] = target.get('target_name')
        save_data['site_id'] = target.get('id')
        save_data['source_type'] = target.get('source_type')
        # Data extraction
        for k, v in info_rule.items():
            k_v = None
            k_v_find = doc.find(v.get("find"))
            if v.get("getType") == 'text':
                k_v = k_v_find.text()
            elif v.get("getType") == 'attr':
                if v.get("attr_key"):
                    k_v = k_v_find.attr(v.get("attr_key"))
                else:
                    baseUtil.log("[ERROR]:data rule is error  target is :" +
                                 target.get("target_url"))
            elif v.get("getType") == 'html':
                k_v = k_v_find.html()

            if v.get("formart") == 'time_0' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M"))
            elif v.get("formart") == 'time_1' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M:%S"))
            elif v.get("formart") == 'time_2' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d"))
            elif v.get("formart") == 'time_3' and k_v != '':
                k_v = int(k_v) / 1000
            elif v.get("formart") == 'time_4' and k_v != '':
                k_v = k_v.replace('年', '-')
                k_v = k_v.replace('月', '-')
                k_v = k_v.replace('日', '')
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d %H:%M:%S"))
            elif v.get("formart") == 'time_5' and k_v != '':
                k_v = time.mktime(time.strptime(k_v, "%Y/%m/%d %H:%M"))
            elif v.get("formart") == 'time_6' and k_v != '':
                k_v = k_v.replace('年', '-')
                k_v = k_v.replace('月', '-')
                k_v = k_v.replace('日', '')
                k_v = time.mktime(time.strptime(k_v, "%Y-%m-%d%H:%M"))
            elif v.get("formart") == 'str_int' and k_v != '':
                k_v = filter(lambda ch: ch in '0123456789.', k_v)
            elif v.get("formart") == 'formart_url' and k_v != '':
                k_v = baseUtil.formartURL(target.get("target_url"), k_v)
            save_data[k] = k_v
        # Comment data extraction
        save_data['comments'] = []
        if int(comment_rule.get("need_comment")) == 1:
            comment_lines = doc.find(comment_rule.get('comment_line'))
            for comment_line in comment_lines.items():
                comment_line_data = {}
                for k, comment in comment_rule.get("info_rule").items():
                    if k and comment.get("find"):
                        comment_find = comment_line.find(comment.get("find"))
                        c_v = None
                        if comment.get("getType") == 'text':
                            c_v = comment_find.text()
                        elif comment.get("getType") == 'attr':
                            if comment.get("attr_key"):
                                c_v = comment_find.attr(
                                    comment.get("attr_key"))
                            else:
                                baseUtil.log(
                                    "[ERROR]: comment rule is invalid, target is: "
                                    + target.get("target_url"))
                        elif comment.get("getType") == 'html':
                            c_v = comment_find.html()

                        if comment.get(
                                "formart") == 'formart_url' and c_v != '':
                            c_v = baseUtil.formartURL(target.get("target_url"),
                                                      c_v)
                        comment_line_data[k] = c_v
                comment_line_data['crawler_time'] = time.time()
                save_data['comments'].append(comment_line_data)

        if save_data['content']:
            save_data['create_time'] = save_data.get('create_time') and int(
                save_data.get('create_time')) or int(time.time())
            baseUtil.saveData(target, save_data)
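
The chain of "formart" branches above maps each format code to either a strptime pattern or a small string cleanup. As an illustration only (not the project's code, and with hypothetical names), the same dispatch can be written as a lookup table plus one helper:

import time

# Hypothetical table for the strptime-based "formart" codes used above.
STRPTIME_PATTERNS = {
    'time_0': "%Y-%m-%d %H:%M",
    'time_1': "%Y-%m-%d %H:%M:%S",
    'time_2': "%Y-%m-%d",
    'time_5': "%Y/%m/%d %H:%M",
}

def normalize_time(raw, fmt_code):
    # Convert a raw date string (or millisecond timestamp) to a Unix timestamp.
    if fmt_code == 'time_3':
        return int(raw) / 1000  # value is already a millisecond timestamp
    if fmt_code in ('time_4', 'time_6'):
        # Strip Chinese date markers, e.g. "2018年1月2日 03:04:05" -> "2018-1-2 03:04:05"
        raw = raw.replace('年', '-').replace('月', '-').replace('日', '')
        pattern = "%Y-%m-%d %H:%M:%S" if fmt_code == 'time_4' else "%Y-%m-%d%H:%M"
        return time.mktime(time.strptime(raw, pattern))
    return time.mktime(time.strptime(raw, STRPTIME_PATTERNS[fmt_code]))
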
Example #6
 def crawler_run(self, target):
     baseUtil.simulationLogin(target['headers'], target['login_info'])
     self.getAnalysisHtml(target)
Example #7
 def getAnalysisHtml(self, target):
     baseUtil.log("Analysis begin…………")
     info_urls = []
     # Page types 2||3 load by scrolling: load all paginated data first, then extract the page source
     if int(target.get("page_type")) == 1 or int(
             target.get("page_type")) == 2:
         url_params_str = baseUtil.formartParamsToUrl(
             target.get('url_params'))
         print target.get("target_url")
         baseUtil.driver.get(target.get("target_url") + url_params_str)
         time.sleep(16)
         baseUtil.log("Get html…………")
         try:
             for i in xrange(target.get('max_crawler_page')):
                 # Page type 2 never has a "load more" button, so scroll directly
                 if int(target.get("page_type")) == 1:
                     js = "var p = document.body.scrollTop=100000"
                     baseUtil.driver.execute_script(js)
                     baseUtil.log("Auto scroll")
                 else:
                     # If a "load more" button exists, click it; otherwise scroll
                     if len(
                             baseUtil.driver.find_elements_by_css_selector(
                                 target.get("page_params_key"))) == 0:
                         js = "var p = document.body.scrollTop=100000"
                         baseUtil.driver.execute_script(js)
                         baseUtil.log("Auth scrollTop")
                     else:
                         baseUtil.driver.find_element_by_css_selector(
                             target.get("page_params_key")).click()
                         baseUtil.log("Click scrollTop")
                 time.sleep(6)
         except:
             baseUtil.log('[ERROR] not found scroll selector')
         # Extract the page source after the paginated data has loaded
         html = baseUtil.driver.page_source
         #print html
         doc = pq(html)
         if target.get('data_rule').get(
                 'get_info_address_type') == 'tag_href':
             datas_href_items = doc.find(
                 target.get('data_rule').get('get_info_address_value'))
             for data_href in datas_href_items.items():
                 info_urls.append(data_href.attr("href"))
     # Page type 1 loads each paginated page in a loop and extracts its source separately
     elif int(target.get("page_type")) == 0:
         url_params = target.get('url_params')
         for i in xrange(target.get('max_crawler_page')):
             url_params[target.get('page_params_key')] = i
             url_params_str = baseUtil.formartParamsToUrl(url_params)
             baseUtil.driver.get(target.get("target_url") + url_params_str)
             time.sleep(6)
             baseUtil.log("Get html…………")
             # Extract the page source after the paginated data has loaded
             html = baseUtil.driver.page_source
             doc = pq(html)
             if target.get('data_rule').get(
                     'get_info_address_type') == 'tag_href':
                 datas_href_items = doc.find(
                     target.get('data_rule').get('get_info_address_value'))
                 for data_href in datas_href_items.items():
                     info_urls.append(data_href.attr("href"))
     for i in xrange(0, len(info_urls)):
         info_url = info_urls[i]
         host = baseUtil.getHostInURL(target.get("target_url"))
         if "http" in info_url:
             info_url_host = baseUtil.getHostInURL(info_url)
             if info_url_host.find(info_url_host) == -1:
                 continue
         elif info_url[:2] == '//':
             info_url = "http:" + "" + info_url
         elif info_url[:1] == '/':
             info_url = "http://" + host + "" + info_url
         else:
             info_url = "http://" + "" + info_url
         baseUtil.log("target_url:" + info_url)
         if baseUtil.checkRepeat(info_url):
             baseUtil.log("Pass checkRepeat")
             self.analysisPageInfo(target, info_url)
             time.sleep(2)
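
The URL handling at the end of Example #7 (absolute, protocol-relative, and root-relative hrefs) can also be expressed with the standard library's urljoin, which covers the same cases. A sketch under that assumption, with an illustrative helper name:

try:
    from urlparse import urljoin, urlparse       # Python 2, matching the xrange/print usage above
except ImportError:
    from urllib.parse import urljoin, urlparse   # Python 3

def normalize_info_url(base_url, href):
    # Resolve a scraped href against the page it came from; return None for off-site links.
    absolute = urljoin(base_url, href)           # handles '/path', '//host/path', and full URLs
    if urlparse(absolute).netloc != urlparse(base_url).netloc:
        return None
    return absolute
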
Example #8
#!/usr/bin/python

# To enable debug => set debug=True in BaseUtil

import os
import sys
import BaseUtil

from BaseUtil import msg
from BaseUtil import error
from BaseUtil import os_cmd
from AWS import AWSResourceManager
from AWS import add_sudoer

BaseUtil.set_debug(True)

mgr = AWSResourceManager('us-east-1') 

instance = mgr.start_instance('i-ba73ff54')

ip_address = instance.ip_address

# Poll until port 12345 on the started instance is reachable
while True:
    (s, o) = os_cmd("checkport.py " + ip_address + " 12345")
    if s == 0:
        break
    BaseUtil.sleep(20)

if s > 0:
	msg("The amazon VM is not reachable - please make sure AWS instances are running and "