Example #1
    def getHtml(self, url, referer="https://www.baidu.com/"):
        _result = ""
        try:
            my_dc = DesiredCapabilities.PHANTOMJS.copy()
            my_dc["browserName"] = "chrome"
            my_dc["platform"] = "mac"
            my_dc["version"] = "63.0.3239.84"
            my_dc["phantomjs.page.settings.loadImages"] = False
            my_dc["phantomjs.page.settings.userAgent"] = HtmlURLUtil.__USER_AGENT

            service_args = ["--load-images=false", "--disk-cache=false",
                            "--ignore-ssl-errors=true"]
            # "--webdriver-logfile=webdriver.log","--webdriver-loglevel=INFO"
            for head, value in self.headers.iteritems():
                my_dc["phantomjs.page.customHeaders.{}".format(head)] = value

            my_dc["phantomjs.page.customHeaders.Referer"] = referer
            self.driver = WebDriver(desired_capabilities=my_dc, service_args=service_args)
            self.driver.set_script_timeout(20)
            self.driver.set_page_load_timeout(30)
            self.driver.implicitly_wait(5)
            self.driver.set_window_size(2560, 1066)

            self.driver.get(url)
            # Save a snapshot image of the page
            # self.driver.save_screenshot(md5_util.md5(url)+".png")
            _result = self.driver.page_source
        except:
            log.getLogger().exception("HtmlURLUtil  getHtml error...")
            # self.driver.close()
            # only quit if the driver was actually created, otherwise this would raise again
            if getattr(self, "driver", None):
                self.driver.quit()
        return _result
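
A minimal usage sketch for getHtml, assuming HtmlURLUtil can be constructed without arguments and that closeWebDriver releases the PhantomJS driver, as in Examples #17 and #18 below:

    from crawler.util.html_util import HtmlURLUtil

    html_util = HtmlURLUtil()
    html = html_util.getHtml("https://www.example.com/", referer="https://www.baidu.com/")
    if html:  # getHtml returns "" when the page could not be loaded
        print(len(html))
    html_util.closeWebDriver()
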
Example #2
 def run(self):
     while True:
         try:
             self.crawler.action()
             time.sleep(30)
         except:
             self.crawler.mysql.close()
             log.getLogger().exception("MyCrawlerThread exception...")
Example #3
 def action(self):
     try:
         os.system(self.action_str)
     except:
         log.getLogger().exception("action impl ....action_str:%s" %
                                   self.action_str)
         raise ConsumberException(
             "consumer action exception action_str=%s" % self.action_str)
Example #4
 def close(self):
     """
     提交事务动作
     :return:
     """
     try:
         self.connection.close()
     except:
         log.getLogger().exception("close exception .....")
Example #5
 def excuteNotCommit(self, query, params=None):
     """
     :param query:
     :param params: 参数建议同  queryDict
     :return:
     """
     try:
         cursor = self.connection.cursor()
         cursor.execute(query, params)
     except:
         log.getLogger().exception("excute not commit exception .....")
         self.connection.rollback()
Example #6
 def excuteManyNotCommit(self, query, params):
     """
     :param query:
     :param params: 参数建议同  queryDict
     :return:
     """
     try:
         cursor = self.connection.cursor()
         cursor.executemany(query, params)
     except:
         self.connection.rollback()
         log.getLogger().exception("excute commit exception .....")
Example #7
 def getTLD(self, url):
     """
     获取域名对象
     :param url:
     :return:
     """
     try:
         if not url:
             return None
         web = urllib.splitquery(url)[0]
         return tld.get_tld(web)
     except:
         log.getLogger().exception("getTLD ...%s" % url)
     return None
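
A brief usage sketch, assuming html_util is an HtmlURLUtil instance as in Example #1; the exact value returned depends on the installed tld package version:

    hostname = html_util.getTLD("http://www.example.com/news/list?page=2")
    if hostname is None:
        # getTLD swallows parsing errors (and empty URLs) and returns None
        log.getLogger().info("could not extract a domain from the url")
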
Example #8
 def queryNotClose(self, query, params=None, cursor_class=DictCursor):
     """
     带参数查询
     :param query:
     :param params:  推荐使用dict类型的,这样可读性最高
     :param cursor_class:  游标类,这里默认为字典游标
     :return:
     """
     try:
         cursor = self.connection.cursor(cursorclass=cursor_class)
         cursor.execute(query, params)
         result = cursor.fetchall()
     except:
         log.getLogger().exception("query dict exception .....")
     return result
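
A hedged usage sketch (the `mysql` wrapper instance name is illustrative). The %s placeholders follow MySQLdb/PyMySQL conventions, and the dict form matches the %(name)s placeholders used in Example #18:

    rows = mysql.queryNotClose("select * from web_url_table where used=%s limit 0,%s",
                               [0, 20])
    for row in rows or []:   # rows is None if the query failed
        print(row["url"])    # DictCursor rows are dicts keyed by column name

    # dict parameters with %(name)s placeholders, as recommended in the docstring
    rows = mysql.queryNotClose("select * from seed_url_table where status=%(status)s",
                               {"status": 0})
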
Example #9
 def writeWebContentToFile(self, webcontent, filepath):
     if not webcontent:
         return
     reload(sys)
     sys.setdefaultencoding("utf-8")
     f = None
     try:
         _dir = os.path.dirname(filepath)
         if not os.path.exists(_dir):
             os.makedirs(_dir)
         f = open(filepath, "w")
         f.write(webcontent)
         f.flush()
     except:
         log.getLogger().exception("htmlutil writeWebContentToFile ...")
     finally:
         # only close the file if it was successfully opened
         if f:
             f.close()
Example #10
 def excuteCommit(self, query, params=None):
     """
     :param query:
     :param params: 参数建议同  queryDict
     :return:
     """
     try:
         cursor = self.connection.cursor()
         cursor.execute(query, params)
         self.connection.commit()
     except:
         self.connection.rollback()
         log.getLogger().exception("excute commit exception .....")
     finally:
         cursor.close()
         self.connection.close()
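
A short usage sketch (the `mysql` wrapper instance name is illustrative). Because the finally block closes the connection, one wrapper instance serves a single committed statement:

    mysql.excuteCommit("update seed_url_table set status=%s where id=%s", [1, 42])
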
Example #11
def mvFile(source_file, des_file):
    """
    将source_file移动到des_file
    :param source_file:
    :param des_file:
    :return:
    """
    try:
        _dir = os.path.dirname(des_file)
        if not os.path.exists(_dir):
            os.makedirs(_dir)
        shutil.move(source_file, des_file)
        return True
    except:
        log.getLogger().exception("mvFile exception ...%s %s" %
                                  (source_file, des_file))
    return False
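
A brief usage sketch (paths are illustrative); mvFile creates the destination directory if needed and returns True/False rather than raising:

    if not mvFile("/data/crawler/tmp/page.html", "/data/crawler/pages/page.html"):
        log.getLogger().info("move failed, source file left in place")
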
Example #12
    def action(self):
        """
        crawler主入口,保存需要爬取的url到web_url_table表中,
        并转化到action_queue表中让生产者去取用
        只有在web_url_table里边没有链接的时候才根据提供的top_url取新的链接
        该方法使用定时任务或者线程单独调用
        :return:
        """
        # 先查找种子库,根据种子库爬取
        sql = """
           select * from seed_url_table where status=0 
           ORDER by update_time desc,crawler_num ASC limit 0,%s FOR UPDATE 
        """
        try:
            result = self.mysql.queryNotClose(sql, [self.url_num])

            if result:  # seeds exist, so copy them into web_url_table
                for seed in result:
                    self.saveUrlTableByUrl(seed["url"], seed["content_type"],
                                           seed["title"], seed["referer"])
                    update_crawler_num = """
                        update seed_url_table set crawler_num = crawler_num + 1,update_time=%s where id=%s
                    """
                    self.mysql.excuteNotCommit(update_crawler_num,
                                               [util.now(), seed["id"]])
            # Pick url_num rows from web_url_table and put them into the queue
            sql = """
                      select * from web_url_table where used=0
                      ORDER BY create_time desc limit 0,%s FOR UPDATE 
                  """
            result = self.mysql.queryNotClose(sql, [self.url_num])
            if result:
                self.parseToActionQueue(result)
            self.mysql.connection.commit()
        except:
            log.getLogger().exception("mycrawler action ...")
        finally:
            # self.mysql.close()
            pass
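
The docstring says action() is meant to be driven by a scheduled task or a separate thread; Example #2 above is the thread variant, whose run() loop calls crawler.action() every 30 seconds. A minimal direct invocation (MyCrawler() taking no constructor arguments matches Example #17 below):

    from crawler.mycrawler import MyCrawler

    crawler = MyCrawler()
    crawler.action()  # one crawl round: refill web_url_table from seeds, then enqueue a batch
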
Example #13
def mergeFile(file_path, new_file, del_old=True):
    """
    将文件同级目录下的所有文件合并
    :param file_path:  要合并的文件目录的内容或者文件同级文件,必须有rw权限
    :param new_file: 合并产生的新文件
    :param del_old: 是否删除被合并的文件
    :return:
    """
    try:
        if os.path.isfile(file_path):
            file_path = os.path.dirname(file_path)

        if not os.path.isdir(file_path):
            raise Exception("Not a directory: %s, please check the path" % file_path)
        if not os.access(file_path, os.W_OK):
            raise Exception("Directory %s is not writable, please check its permissions" % file_path)
        if os.path.isdir(new_file):
            raise Exception("Target file %s is a directory and cannot be merged into" % new_file)

        pdir = os.path.dirname(new_file)
        if not os.path.exists(pdir):
            os.makedirs(pdir)

        files = os.listdir(file_path)
        newfile = None
        try:
            newfile = file(new_file, "a")
            while len(files) >= 1:
                f = files.pop()
                if f.startswith("."):  # skip hidden files
                    continue
                f = file_path + os.sep + f
                if not os.path.isfile(f):  # skip directories
                    continue
                if not os.access(f, os.W_OK):  # skip files that are not writable
                    continue
                _f = None
                try:
                    _f = file(f, "r")
                    newfile.write(_f.read())
                    if del_old:
                        os.remove(f)  # delete the source file after merging
                    else:
                        # make the file read-only so it is not merged again next time
                        os.chmod(f, stat.S_IRUSR)
                except:
                    log.getLogger().exception("mergeFile ...")
                finally:
                    if _f:
                        _f.close()
        except:
            log.getLogger().exception("mergeFile ...")
            return False
        finally:
            if newfile:
                newfile.close()
    except:
        log.getLogger().exception("mergeFile ...")
        return False
    return True
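
A brief usage sketch (paths are illustrative). With del_old=False the merged source files are made read-only instead of being deleted, so a subsequent run skips them:

    if mergeFile("/data/crawler/pages/", "/data/crawler/merged/all_pages.txt", del_old=False):
        log.getLogger().info("merge finished")
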
Example #14
 def fail(self):
     log.getLogger().info("fail......")
Example #15
 def success(self):
     log.getLogger().info("success......")
Example #16
 def action(self):
     log.getLogger().info("action......action=%s,params=%s" %
                          (self.action_str, self.params))
     raise Exception(self.action_str)
Example #17
import os
import sys

import base.config.common_config as common_config
import crawler.util.uuid_util as myuuid
import log.common_log as log
from base.exception.consumer_exception import ConsumberException
from crawler.mycrawler import MyCrawler
from crawler.util.html_util import HtmlURLUtil

if __name__ == "__main__":
    """
      爬取链接动作,供消费者(ConsumerAction)调用
      sys.argv = [py_file_path,url,title,refere]
    """
    reload(sys)
    sys.setdefaultencoding("utf-8")

    log.getLogger().info(sys.argv)

    try:
        if not sys.argv or len(sys.argv) <= 3:
            raise ConsumberException("Please provide the url, title and referer to crawl")

        html_util = HtmlURLUtil()
        # url, title, referer, save_path
        url = sys.argv[1]
        title = sys.argv[2]
        referer = sys.argv[3]

        html_result = html_util.getHtml(url)
        my_crawler = MyCrawler()

        file_path = common_config.CRAWLER_SAVE_PATH + os.sep + "tmp" + os.sep + myuuid.getUUID(
Example #18
 def saveUrlTableByUrl(self,
                       url,
                       charset="utf-8",
                       title=None,
                       referer=None):
     """
     爬取url并获取到该url页面的所有a标签
     保存到web_url_table表中
     :param url:
     :param charset:
     :param title:
     :param referer:
     :return:
     """
     sql = """
         insert into web_url_table (`url`,`title`,`content_type`,`referer`,
         `hostname`,`params`,`md5`,`url_type`,`used`,`create_time`
         ,`update_time`) VALUES (%(url)s,%(title)s,%(content_type)s,%(referer)s,
         %(hostname)s,%(params)s,%(md5)s,%(url_type)s,%(used)s,
         %(create_time)s,%(update_time)s) ON DUPLICATE KEY UPDATE md5=md5
     """
     params = []
     now = util.now()
     _md5 = md5(url)
     self.html_util = html_util = HtmlURLUtil()
     try:
         params.append({
             "url": url,
             "title": title,
             "content_type": charset,
             "referer": referer,
             "hostname": html_util.getTLD(url),
             "params": html_util.getSortQS(url),
             "md5": _md5,
             "url_type": "0",
             "used": "0",
             "create_time": now,
             "update_time": now
         })
         douban = html_util.getHtml(url)
         # Write the page content to a file
         file_path = common_config.CRAWLER_SAVE_PATH + os.sep + "tmp" + os.sep + myuuid.getUUID(
         ).__str__()
         self.appendContentToFile(url, title, url, douban, file_path)
         # Find all <a> tags on the page
         eles = html_util.getElementsByTagName("a")
         hsn = html_util.getTLD(url)
         _charset = html_util.getCharset(douban)
         if eles:
             for el in eles:
                 sub_url = el.get_attribute('href')
                 # skip empty links and javascript: pseudo-links
                 if not sub_url or sub_url.count("javascript"):
                     continue
                 sub_md5 = md5(sub_url)
                 if not util.dictListContains(params, "md5", sub_md5):
                     params.append({
                         "url": sub_url,
                         "title": html_util.driver.title,
                         "content_type": _charset,
                         "referer": url,
                         "hostname": html_util.getTLD(sub_url),
                         "params": str(html_util.getSortQS(sub_url)),
                         "md5": sub_md5,
                         "url_type": 0 if hsn == html_util.getTLD(sub_url) else 1,
                         "used": "0",
                         "create_time": now,
                         "update_time": now
                     })
         self.mysql.excuteManyNotCommit(sql, params)
     except:
         log.getLogger().exception("mycrawler saveSeedWebUrlToMysql ...")
     finally:
         html_util.closeWebDriver()
     return params