Example #1
    def is_crawl_success(self,url, content):
        content = util.html_encode(content)

        if content.find("站长帮手网") > 0:
            return True
        if content.find("暂无数据") > 0:
            return True
        if content.find("为无效的域名格式") > 0:
            return True

        if content.find("HTTP Error 400. The request URL is invalid.") > 0:
            return True

        if content.find("您的查询量比较大") > 0:
            logger.info("您的查询量比较大")

            # Cap the throwaway-account pool at 100 and keep retrying the
            # registration POST until one attempt succeeds.
            if len(login_users) < 100:
                while True:
                    opener = urllib2.build_opener()
                    username = util.id_generator(10)
                    data = {
                        "username":username,
                        "password": "******",
                        "confirmpassword": "******",
                        "opaction":"reg",
                        "qq":"",
                        "isqqopen":"1",
                        "email":"*****@*****.**" % username
                    }

                    data = urllib.urlencode(data)
                    logger.info(data)
                    headers = {
                        "Referer": "http://my.links.cn/reg.asp"
                    }
                    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
                    headers['User-Agent'] = user_agent

                    try:
                        request= urllib2.Request("http://my.links.cn/regpost.asp", data, headers)
                        r = opener.open(request, timeout=30)
                        try:
                            content = util.html_encode(r.read())
                            #logger.info(content)
                            login_users.append({"name":username, "pwd":"ann123456", "date":datetime.datetime.now()})
                            logger.info(login_users)
                            break
                        except Exception:
                            traceback.print_exc()
                    except Exception:
                        traceback.print_exc()
            return False
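
Every snippet on this page runs raw response bytes through util.html_encode before matching or parsing them. That helper is not shown here; a minimal sketch of what it plausibly does (guess the page charset and return decoded text), with chardet and the utf-8 fallback being assumptions, could look like this:

# Hypothetical sketch of util.html_encode, inferred from how it is called in
# these examples; the project's real helper may differ.
import chardet

def html_encode(data):
    """Decode raw HTML bytes to unicode, guessing the charset."""
    if isinstance(data, unicode):
        return data
    guess = chardet.detect(data)  # e.g. {'encoding': 'GB2312', 'confidence': 0.9}
    encoding = guess.get("encoding") or "utf-8"
    try:
        return data.decode(encoding, "replace")
    except LookupError:
        # chardet reported a codec Python does not know; fall back to utf-8.
        return data.decode("utf-8", "replace")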
Example #2
def handle_result(response, website):
    global total
    if response.error:
        # url = website['link']
        delete(website)
    else:
        try:
            if response.code != 200:
                delete(website)
            else:
                # html = unicode(response.body,encoding="utf-8",errors='replace')
                html = util.html_encode(response.body)
                doc = pq(html)
                metas = doc('meta')
                description = None
                keywords = None
                # Pull the keywords and description out of the page's <meta> tags.
                for meta in metas:
                    name = pq(meta).attr('name')
                    content = pq(meta).attr('content')
                    if name == 'keywords':
                        keywords = content
                    if name == 'description':
                        description = content

                update(description, keywords, website)

        except:
            traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
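
The response.error / response.code / response.body / effective_url attributes used by handle_result (and by handle_news_result in Example #7) match Tornado's HTTPResponse, so these callbacks are presumably driven by an AsyncHTTPClient loop. A rough sketch of the begin() driver they call back into, assuming the pre-6.0 Tornado callback API and an unshown websites list, could be:

# Hypothetical driver for the callback above; "websites", delete() and
# update() live in the surrounding (unshown) module, and the batch size of 10
# is an arbitrary assumption.
from tornado import ioloop
from tornado.httpclient import AsyncHTTPClient

def begin():
    global total
    batch = [websites.pop() for _ in range(min(10, len(websites)))]
    if not batch:
        return
    total = len(batch)
    client = AsyncHTTPClient()
    for website in batch:
        client.fetch(website["link"],
                     callback=lambda r, w=website: handle_result(r, w))

# begin()
# ioloop.IOLoop.instance().start()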
Example #3
    def login(self, url, redirect=True):
        global login_users
        #logger.info(login_users)
        # Drop pooled accounts older than five minutes, then fall back to the
        # default account if the pool ends up empty.
        _login_users = []
        for user in login_users:
            date = user["date"]
            if (datetime.datetime.now() - date).total_seconds() < 5*60:
                _login_users.append(user)
        login_users = _login_users
        if len(login_users) == 0:
            login_users.append({"name":"ann201","pwd":"ann123456", "date":datetime.datetime.now()})

        #logger.info(login_users)

        retries = 0
        while True:
            retries += 1
            if retries > 3:
                break

            self.init_http_session(url)

            # Pick a random pooled account; retry defensively in case the pool
            # shrinks between the randint() call and the lookup.
            while True:
                try:
                    idx = random.randint(0, len(login_users) - 1)
                    login_user = login_users[idx]
                    logger.info(login_user)
                    break
                except:
                    pass

            data = {
                "backurl": "http://beian.links.cn",
                "bsave": "1",
                "opaction": "login",
                "username": login_user["name"],
                "password": login_user["pwd"],
            }

            data = urllib.urlencode(data)
            headers = {
                "Referer": "http://beian.links.cn"
            }
            user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
            headers['User-Agent'] = user_agent

            try:
                request= urllib2.Request("http://my.links.cn/checklogin.asp", data, headers)
                r = self.opener.open(request, timeout=30)
            except:
                continue

            try:
                content = util.html_encode(r.read())
            except:
                continue

            #logger.info(content)
            if content.find("loaduserinfo") > 0:
                break
Example #4
def crawler(company_id, link):
    retry_time = 0
    while True:
        result = news_crawler.crawl(link, agent=False)
        if result['get'] == 'success':
            #logger.info(result["content"])
            html = util.html_encode(result["content"])
            #logger.info(html)
            contents = extract.extractContents(link, html)

            title = extract.extractTitle(html)
            date = extractArticlePublishedDate.extractArticlePublishedDate(
                link, html)

            dnews = {
                "companyId": company_id,
                "date": date,
                "title": title,
                "link": link,
                "createTime": datetime.datetime.now(),
                "source": 13001
            }

            # Flatten the extracted paragraphs and images into ranked content
            # records for the article document.
            dcontents = []
            rank = 1
            for c in contents:
                if c["type"] == "text":
                    dc = {
                        "rank": rank,
                        "content": c["data"],
                        "image": "",
                        "image_src": "",
                    }
                else:
                    dc = {
                        "rank": rank,
                        "content": "",
                        "image": "",
                        "image_src": c["data"],
                    }
                dcontents.append(dc)
                rank += 1
            dnews["contents"] = dcontents

            logger.info(dnews)

            mongo = db.connect_mongo()
            _id = mongo.article.news.insert_one(dnews).inserted_id
            mongo.close()

            return _id

        retry_time += 1
        if retry_time > 10:
            break

    return None
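
crawler() returns the Mongo _id of the stored article, or None once more than ten crawl attempts have failed. A hypothetical invocation (the company id and URL are placeholders):

# Placeholder arguments; news_crawler, extract and db come from the
# surrounding module as in the snippet above.
article_id = crawler(8001, "http://www.example.com/news/20180101.html")
if article_id is None:
    logger.info("crawl failed after 10 retries")
else:
    logger.info("stored article %s" % article_id)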
Example #5
def get_meta_info(url):
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
    headers = {
        'User-Agent': user_agent,
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip'
    }
    try:
        request = urllib2.Request(url, None, headers)
    except:
        return None
    opener = urllib2.build_opener()
    retries = 0
    while True:
        try:
            r = opener.open(request, timeout=17)
            # Transparently decompress gzip-encoded responses before decoding.
            if r.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(r.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
            else:
                data = r.read()
            content = util.html_encode(data)
            redirect_url = url_helper.url_normalize(r.geturl())
            #logger.info(redirect_url)
            #logger.info(content)
            d = pq(html.fromstring(content))
            title = d("title").text()
            #logger.info(title)
            keywords = d("meta[name='keywords']").attr("content")
            if keywords is None:
                keywords = d("meta[name='Keywords']").attr("content")
            #logger.info(keywords)
            description = d("meta[name='description']").attr("content")
            if description is None:
                description = d("meta[name='Description']").attr("content")
            #logger.info(description)

            flag, domain = url_helper.get_domain(url)
            if flag is not True:
                domain = None
            return {
                "url": url,
                "redirect_url": redirect_url,
                "domain": domain,
                "title": title,
                "tags": keywords,
                "description": description,
                "httpcode": 200
            }
        except:
            retries += 1
        if retries >= 3:
            return None
    return None
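
On success get_meta_info() returns the dict built above (httpcode is always 200 on that path); after three failed attempts it returns None. A small usage sketch with a placeholder URL:

# Placeholder URL; util and url_helper must be importable as above.
info = get_meta_info("http://www.example.com/")
if info is not None:
    logger.info("%s -> %s (%s)" % (info["url"], info["redirect_url"], info["title"]))
    logger.info("keywords: %s, description: %s" % (info["tags"], info["description"]))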
Example #6
def process(search_name, from_doc_id, content):
    d = pq(util.html_encode(content))

    divs = d('div.app')
    for div in divs:
        e = pq(div)
        a = e('a.app-name')
        name = a.text().strip()
        #logger.info(name)
        href = a.attr("href")
        #logger.info(href)
        result = util.re_get_result(r"docid=(\d*)", href)
        if result:
            (docid_str,) = result
            try:
                docid = long(docid_str)
            except:
                continue
        else:
            continue

        data = e('a.inst-btn')
        if len(data) == 0:
            data = e('a.inst-btn-big')
        if len(data) == 0:
            continue
        app_type = data.attr("data_detail_type")
        apkname = data.attr("data_package")
        version = data.attr("data_versionname")
        size = None
        try:
            size = long(data.attr("data_size"))
        except:
            pass

        item = {
            "key_int": docid,
            "search_name": search_name,
            "name": name,
            "link": "http://shouji.baidu.com/software/%s.html" % docid,
            "type": type,
            "apkname": apkname,
            "version": version,
            "size": size
        }
        #logger.info(json.dumps(item, ensure_ascii=False, cls=util.CJsonEncoder))

        try:
            android.save_baidu_search(collection_search, item)
        except Exception as e:
            logger.info(e)
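
util.re_get_result is used above as "return the regex groups, or something falsy when there is no match". The helper is not shown on this page; a minimal stand-in consistent with that call site, purely as an assumption, would be:

# Hypothetical stand-in for util.re_get_result, inferred from the call above;
# the real helper may behave differently.
import re

def re_get_result(pattern, text):
    """Return the match groups for pattern in text, or None if there is no match."""
    m = re.search(pattern, text)
    if m:
        return m.groups()
    return None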
Example #7
def handle_news_result(response, company, summary):
    global total
    if response.error:
        # logger.info("Error: %s, %s" % (response.error,response.request.url))
        if response.code == 404:
            content = response.body
            if content.find("404.jpg") > 0:
                total -= 1
                if total <= 0:
                    begin()
        request(
            response.request.url,
            lambda r, company=company: handle_news_result(r, company, summary))
        # logger.info('erroring .....')
        return

    logger.info(summary["title"])

    # If the request was redirected away from toutiao.com, the article body
    # lives on the original publisher's site rather than on Toutiao itself.
    content_from_toutiao = True
    if response.effective_url != response.request.url:
        if 'toutiao.com' not in response.effective_url:
            #logger.info(response.effective_url)
            logger.info(response.request.url)
            logger.info('url changed .....')
            content_from_toutiao = False

    try:
        content = util.html_encode(response.body)
        if content_from_toutiao:
            # The ICP licence number below appears in the footer of genuine
            # Toutiao pages, so its presence confirms the body really came
            # from toutiao.com rather than an error or redirect stub.
            if content.find(u"京ICP备12025439号") > 0:
                save_news(company, response.request.url, summary, content,
                          content_from_toutiao)
            else:
                request(response.request.url,
                        lambda r, company=company: handle_news_result(
                            r, company, summary))
                return
        else:
            save_news(company, response.request.url, summary, content,
                      content_from_toutiao)
    except:
        traceback.print_exc()

    total -= 1
    if total <= 0:
        begin()
Example #8
def process_news(content, news_crawler, news_key, company_key_int, title,
                 news_date, tags):
    d = pq(html.fromstring(content))
    # The wrapper page embeds the real article in an <iframe>; follow its src.
    actual_news_url = d("iframe").attr("src")
    if actual_news_url is None:
        return
    if not actual_news_url.startswith("http"):
        return

    logger.info("actual_news_url: %s", actual_news_url)

    retry_time = 0
    while True:
        result = news_crawler.crawl(actual_news_url, agent=True)
        if result['get'] == 'success' and result.get("code") == 200:
            #logger.info(result["content"])
            news_content = util.html_encode(result["content"])
            try:
                collection_content = {
                    "date": datetime.datetime.now(),
                    "source": SOURCE,
                    "type": TYPE,
                    "url": actual_news_url,
                    "key": news_key,
                    "key_int": int(news_key),
                    "content": news_content,
                    "company_key_int": company_key_int,
                    "title": title,
                    "news_date": news_date,
                    "original_tags": tags
                }
                collection.insert_one(collection_content)
                break
            except Exception as ex:
                #logger.exception(ex)
                pass
        retry_time += 1
        if retry_time > 10:
            break
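
Both this snippet and Example #4 treat news_crawler.crawl() as "fetch the URL and report a dict with get, code and content". The real crawler is not shown here; a minimal stand-in with only that contract (the 'fail' value and the agent handling are guesses) might be:

# Hypothetical stand-in for news_crawler.crawl(); the real implementation
# (proxies, retries, alternate user agents) is not shown on this page.
import urllib2

def crawl(link, agent=False):
    headers = {}
    if agent:
        headers["User-Agent"] = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) "
                                 "AppleWebKit/600.8.9 (KHTML, like Gecko) "
                                 "Version/8.0.8 Safari/600.8.9")
    try:
        r = urllib2.urlopen(urllib2.Request(link, None, headers), timeout=30)
        return {"get": "success", "code": r.getcode(), "content": r.read()}
    except Exception:
        return {"get": "fail", "code": None, "content": None}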
Example #9
    for i in range(204, 210):
        opener = urllib2.build_opener()
        username = "******" % i
        data = {
            "username": username,
            "password": "******",
            "confirmpassword": "******",
            "opaction": "reg",
            "qq": "",
            "isqqopen": "1",
            "email": "*****@*****.**" % username
        }

        data = urllib.urlencode(data)
        logger.info(data)
        headers = {"Referer": "http://my.links.cn/reg.asp"}
        user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/600.8.9 (KHTML, like Gecko) Version/8.0.8 Safari/600.8.9'
        headers['User-Agent'] = user_agent

        try:
            request = urllib2.Request("http://my.links.cn/regpost.asp", data,
                                      headers)
            r = opener.open(request, timeout=60)
            try:
                content = util.html_encode(r.read())
                #logger.info(content)
            except Exception:
                traceback.print_exc()
        except Exception:
            traceback.print_exc()