Ejemplo n.º 1
0
 def get_consumer():
     """Return the shared ``Consumer`` instance, creating it on first use.

     The instance is cached on ``Consumer.last_obj``. When construction
     fails the cache is left as ``None``, so the next call tries again.

     :return: the cached ``Consumer``, or ``None`` if it could not be created
     """
     if not Consumer.last_obj:
         try:
             Consumer.last_obj = Consumer()
         except Exception as e:
             # Catch Exception rather than BaseException so KeyboardInterrupt
             # and SystemExit still propagate during a connection attempt.
             hlog.debug("消息队列初始化连接异常")
             hlog.debug(str(e))
             Consumer.last_obj = None
         else:
             hlog.info("消息队列连接完成")
     return Consumer.last_obj
Ejemplo n.º 2
0
    def __init__(self):
        """Log the message-queue settings and create the Kafka consumer.

        Host, port, group and topic are read from ``mq_config``; the
        resulting ``KafkaConsumer`` is stored on ``self.consumer``.
        """
        hlog.info("初始化消息队列连接")

        # Record each connection setting under an "mq_"-prefixed name.
        for setting in ('host', 'port', 'group', 'topic'):
            hlog.var('mq_' + setting, getattr(mq_config, setting))

        broker = '{}:{}'.format(mq_config.host, mq_config.port)
        self.consumer = KafkaConsumer(
            mq_config.topic,
            bootstrap_servers=[broker],
            group_id=mq_config.group)
Ejemplo n.º 3
0
def read_file(filename):
    """Read a file and return its raw bytes.

    :param filename: path of the file to read
    :return: the file content as ``bytes``, or ``None`` (after logging an
        error) when the path does not exist or is not a regular file
    """
    hlog.info('read file %s' % filename)
    path = Path(filename)
    # is_file() is already False for a missing path, so a separate
    # exists() check is not needed.
    if not path.is_file():
        hlog.error('文件 %s 不存在' % filename)
        return None

    return path.read_bytes()
Ejemplo n.º 4
0
def get_start_url():
    """
    Fetch the start URL from the source API configured in ``api_config``.

    Any failure (request error, non-200 status, invalid JSON, unexpected
    payload shape or a non-"SUCCESS" code) is logged and reported through
    the return value instead of raising.

    :return: ``(url, num)`` - the start URL and the number of pages to
        crawl; ``("", 0)`` when the URL could not be obtained
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    headers = {
        "content-type": "application/json"
    }

    url = "http://%s:%s%s" % (
        api_config.source_host,
        api_config.source_port,
        api_config.source_uri
    )
    hlog.var("url", url)

    fetched = False
    start_url = ""
    try:
        # NOTE(review): no timeout is passed, so this call can block
        # indefinitely if the server never answers.
        response = requests.get(url=url, headers=headers)
        if 200 == response.status_code:
            response_json = json.loads(response.text)
            if "SUCCESS" == response_json["code"]:
                start_url = response_json["result"]
                fetched = True
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Connection problems, a non-JSON body or a payload without the
        # expected keys all fall through to the failure path below.
        hlog.debug(str(e))

    if fetched:
        hlog.info("获取起始url成功")
        hlog.exit_func(func_name)
        # The API is meant to return
        #   {"code": "SUCCESS", "message": ...,
        #    "result": {"url": "https://www.zhipin.com/c101270100-p100199/",
        #               "num": 3}}
        # but currently returns the URL string directly in "result":
        #   {"code": "SUCCESS", "message": ...,
        #    "result": "https://www.zhipin.com/c101270100-p100199/"}
        # so the crawl count is fixed at 3 here.
        return start_url, 3

    hlog.debug("获取起始url失败,请检查网络")
    hlog.exit_func(func_name)
    return "", 0
Ejemplo n.º 5
0
def get_film_list():
    """
    Load every film and return it as a list of plain dictionaries.

    Each dictionary holds the film's own columns plus the URL lists of its
    related people, species, locations and vehicles as returned by
    ``utils.get_urls``.

    :return: list of film dictionaries
    """

    from utils import get_urls

    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    films = []
    for record in session.query(Film).all():
        film_id = record.id
        hlog.var('film_id', film_id)

        # Related URL lists, looked up in the same order as before.
        locations = get_urls('Location', film_id)
        people = get_urls('People', film_id)
        species = get_urls('Specie', film_id)
        vehicles = get_urls('Vehicle', film_id)

        entry = {
            column: getattr(record, column)
            for column in (
                "id",
                "title",
                "description",
                "director",
                "producer",
                "release_date",
                "rt_score",
                "url",
            )
        }
        entry["people"] = people
        entry["species"] = species
        entry["locations"] = locations
        entry["vehicles"] = vehicles

        films.append(entry)

    hlog.info("读取电影信息成功。")
    hlog.exit_func(func_name)

    return films
Ejemplo n.º 6
0
def get_platfrom(url):
    """
    Extract the main part of a site's domain name from a URL.

    The host is taken as the third "/"-separated piece of the URL and the
    result is the second "."-separated label of that host, e.g.
    "https://www.zhipin.com/c101" gives "zhipin".

    :param url: URL to inspect
    :return: the extracted label, or "" when the URL cannot be parsed
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)
    hlog.var("url", url)

    # Default result so the failure path returns a value instead of
    # hitting an unbound local.
    platfrom = ""
    try:
        domain = url.split("/")[2]
        platfrom = domain.split(".")[1]
    except (AttributeError, IndexError, TypeError):
        hlog.debug("获取网站域名主体失败")
    else:
        # Only report success when parsing actually succeeded.
        hlog.var("platfrom", platfrom)
        hlog.info("获取网站域名主体成功")

    hlog.exit_func(func_name)
    return platfrom
Ejemplo n.º 7
0
def send_data(source_url, htmlString, platform):
    """
    Post a crawled page to the target API configured in ``api_config``.

    :param source_url: the URL that was crawled
    :param htmlString: the crawled result as a string; an empty string
        means there is nothing to send
    :param platform: platform the page belongs to
    :return: None
    """
    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    hlog.var("source_url", source_url)

    if htmlString != "":
        spider_uuid = uuid.uuid1()
        hlog.var("spider_uuid", spider_uuid)

        # The HTML travels as base64 text of its UTF-8 bytes.
        html_b64 = base64.b64encode(htmlString.encode("utf-8")).decode("utf-8")

        payload = {
            "url": source_url,
            "spiderUuid": str(spider_uuid),
            "platform": platform,
            "htmlString": html_b64
        }

        target = "http://%s:%s%s" % (
            api_config.target_host,
            api_config.target_port,
            api_config.target_uri
        )

        response = requests.post(
            url=target,
            data=json.dumps(payload),
            headers={"content-type": "application/json"},
        )
        hlog.info("发送结果完成,返回状态%s" % response.status_code)

    hlog.exit_func(func_name)
Ejemplo n.º 8
0
def crawl_url(url):
    """
    Crawl a URL by running the external ``node spider.js`` script.

    The script's standard output is parsed as JSON; its "code" field is
    logged and, when it equals "success", the "data" field is returned.

    :param url: URL to crawl
    :return: the HTML string from the script's "data" field, or "" when
        the script cannot be run, prints invalid output or reports failure
    """
    # Local import so this block does not depend on the file's import list.
    import subprocess

    func_name = inspect.stack()[0][3]
    hlog.enter_func(func_name)

    html = ""
    try:
        # Pass the URL as a single argument without a shell, so characters
        # such as "&", "?", ";" or spaces cannot split or alter the command.
        completed = subprocess.run(
            ["node", "spider.js", url],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        result = json.loads(completed.stdout)
        code = result["code"]
        data = result["data"] if "success" == code else ""
    except (OSError, ValueError, KeyError, TypeError) as e:
        # node missing, empty/non-JSON output or an unexpected payload shape.
        hlog.debug(str(e))
        hlog.debug("爬虫爬取有误")
    else:
        hlog.info("爬取完成,返回状态%s" % code)
        if "success" == code:
            html = data
        else:
            hlog.debug("爬虫爬取有误")

    hlog.exit_func(func_name)
    return html
Ejemplo n.º 9
0
def send(title, text, att_map=None, receivers=None, name=None):
    """Send a plain-text e-mail, optionally with attachments, to each receiver.

    The message is sent once per receiver over a single SMTP session. The
    connection is first tried on ``config.mail_port1`` and, if the server
    disconnects, on ``config.mail_port2``. A failure to deliver to one
    receiver is logged and does not stop delivery to the others.

    :param title: subject of the mail
    :param text: plain-text body
    :param att_map: optional mapping of attachment filename to attachment
        content; ``None`` or empty means no attachments
    :param receivers: iterable of recipient addresses; defaults to
        ``config.receivers``
    :param name: value for the From header; defaults to ``config.sender``
    """
    name = name if name else config.sender
    receivers = receivers if receivers else config.receivers
    att_map = att_map if att_map else {}

    message = MIMEMultipart()
    message['Subject'] = Header(title, 'utf-8')
    message['From'] = Header(name, 'utf-8')
    message.attach(MIMEText(text, 'plain', 'utf-8'))

    # Attach files once, before the receiver loop; attaching inside the
    # loop would duplicate every attachment for each additional receiver.
    for filename, content in att_map.items():
        att = MIMEText(content, 'base64', 'utf-8')
        att["Content-Type"] = 'application/octet-stream'
        att.add_header('Content-Disposition', 'attachment', filename=('gbk', '', filename))
        message.attach(att)

    smtp_obj = smtplib.SMTP()

    try:
        hlog.info('connect to %s:%s' % (config.mail_host, config.mail_port1))
        smtp_obj.connect(config.mail_host, config.mail_port1)
    except smtplib.SMTPServerDisconnected as e:
        hlog.info(e)
        hlog.info('try another port')
        hlog.info('connect to %s:%s' % (config.mail_host, config.mail_port2))
        smtp_obj.connect(config.mail_host, config.mail_port2)

    try:
        # Log only the account name; the password must not reach the log.
        hlog.info('login with %s' % config.mail_user)
        smtp_obj.login(config.mail_user, config.mail_pass)

        for receiver in receivers:
            # Assigning a header appends rather than replaces, so drop the
            # previous To header first (a no-op on the first iteration).
            del message['To']
            message['To'] = Header(receiver, 'utf-8')

            try:
                smtp_obj.sendmail(
                    config.sender, receiver, message.as_string())
                hlog.info('send success')
            except smtplib.SMTPException as e:
                hlog.error(e)
    finally:
        # Always close the session, even if login or sending raised.
        smtp_obj.quit()