コード例 #1
0
def fetch_theverge():
    """Crawl theverge.com camera reviews, save each page locally and
    bulk-insert the page records into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBThevergeCom", "pages")
    url = "https://www.theverge.com/camera-review"
    doc = []
    html = HttpHelper.fetch(url)
    soup = BeautifulSoup(html[1], "lxml")
    total = 1

    for river in soup.find_all("div", attrs={"class": "c-compact-river"}):
        try:
            links = river.find_all(
                "a", attrs={"class": "c-entry-box--compact__image-wrapper"})
            for link in links:
                saved = HttpHelper.fetchAndSave(link['href'], "utf-8",
                                                "D:/pages/theverge.com")
                doc.append({
                    "filename": saved,
                    "url": link['href'],
                    "state": "fetched",
                    "domain": "www.theverge.com"
                })
                print(total)
                total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doc)
コード例 #2
0
def for_blank_des():
    """Fill in missing descriptions for 'built' records.

    For every record in state 'built' whose description is empty, derive a
    ~35-word summary from its content. Currently a dry run: the generated
    summary is only printed; the Mongo update is left commented out.
    """
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg",
                             "supplement_copy", "url")
    doclist = []
    nlp = NLPHelper
    total = 0

    # Drain the paged cursor fully into memory before processing.
    while True:
        slist = collection.nextPage(100)
        # FIX: compare to None with `is` (PEP 8), not `==`.
        if slist is None or len(slist) == 0:
            break
        doclist.extend(slist)

    for i in doclist:
        # Only 'built' records with a blank description need work.
        if i['state'] != "built" or i['description'] != "":
            continue
        description = nlp.getSummary(i['content'], wordCount=35)
        print(i['title'])
        print(description)
        # Intended (currently disabled) write-back:
        '''doc.append(
            {"_id": i['_id'], "cat": i['cat'], "fileName": i['fileName'], "url": i['url'], "host": i['host'],
             "state": "built", "title": i['title'], "content": i['content'], "description": description,
             "attrlist": i['attrlist'], "contenthtml": i['contenthtml']})
        collection.updateOne(doc)'''
        # FIX: removed the dead `doc` list and `doc.clear()` — nothing was
        # ever appended while the update above is commented out.
        total += 1

    print(total)
コード例 #3
0
def Mongo2Csv():
    """Create a post for each 'pass' record in the bloodglucosemeter
    collection and mark it 'posted' in Mongo.

    NOTE(review): despite the name, nothing is exported to CSV here —
    confirm the intended behavior against the caller.
    """
    doc = []
    try:
        pdCollection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "bloodglucosemeter")

        total = 0
        while True:
            pdList = pdCollection.nextPage(10)
            # FIX: compare to None with `is` (PEP 8).
            if pdList is None or len(pdList) == 0:
                break

            for pd in pdList:
                if pd['state'] != 'pass':
                    continue

                newID = createPost(pd)
                if newID is not None:
                    doc.append({"_id": pd['_id'], "ID": newID, "brand": pd['brand'], "url": pd['url'],
                                "state": "posted", "price": pd['price'], "title": pd['title'], "brand_a": pd['brand_a']
                                , "inner_des": pd['inner_des']})
                    print(doc[0]['ID'])
                    # NOTE(review): updateOne is handed the whole list, not
                    # the dict — siblings pass a dict; confirm the helper
                    # accepts a list.
                    pdCollection.updateOne(doc)
                    doc.clear()
                    print("create post ok")
                else:
                    print("create post error")

                total += 1
                print('total=' + str(total) + ', title=' + pd['title'])
                # NOTE(review): returns after the very first processed
                # record — looks like a debugging leftover; remove this
                # `return` to process the whole collection.
                return

        # FIX: corrected typo "Creawte" in the completion message.
        print('Create all posts ok')

    except Exception as err:
        print(err)
コード例 #4
0
def fetch_techradar():
    """Crawl the techradar.com car-tech review listing, saving each review
    page to disk and bulk-inserting the records into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTechradarCom",
                             "pages")
    entrance = "https://www.techradar.com/reviews/car-tech?"
    doclist = []
    total = 1
    response = HttpHelper.fetch(entrance)
    soup = BeautifulSoup(response[1], "lxml")

    listing = soup.find("div", attrs={"class": "listingResults"})
    results = listing.find_all(
        "div", attrs={"class": re.compile("^listingResult small result*")})
    for result in results:
        try:
            link = result.find("a")
            page_url = link['href']
            saved = HttpHelper.fetchAndSave(page_url, "utf-8",
                                            "D:/pages/techradar.com")
            doclist.append({
                "filename": saved,
                "url": page_url,
                "state": "fetched",
                "domain": "www.techradar.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)
コード例 #5
0
def fetch_highsnobiety():
    """Crawl highsnobiety.com style articles from a Selenium-rendered page
    and record the saved pages in Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBHighsnobietyCom",
                             "pages")
    entrance = "https://www.highsnobiety.com/style/"
    driver = webdriver.Chrome(
        'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
    driver.get(entrance)
    print("waiting for u")
    doclist = []  # set a breakpoint here (let the page finish rendering)
    total = 1
    rendered = driver.page_source.encode('utf-8')
    soup = BeautifulSoup(rendered, "lxml")
    container = soup.find("div", attrs={"class": "sub-contents__item"})
    for article in container.find_all("article"):
        try:
            link = article.find("a")
            page_url = link['href']
            saved = HttpHelper.fetchAndSave(page_url, "utf-8",
                                            "D:/pages/highsnobiety.com")
            doclist.append({
                "filename": saved,
                "url": page_url,
                "state": "fetched",
                "domain": "www.highsnobiety.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)
コード例 #6
0
def fetch_fashionista():
    """Crawl fashionista.com style articles from a Selenium-rendered page
    and record the saved pages in Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBFashionistaCom",
                             "pages")
    entrance = "https://fashionista.com/style"
    driver = webdriver.Chrome(
        'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
    driver.get(entrance)
    print("waiting for u")
    doclist = []  # set a breakpoint here (let the page finish rendering)
    total = 1
    rendered = driver.page_source.encode('utf-8')
    soup = BeautifulSoup(rendered, "lxml")
    cards = soup.find_all(
        "article",
        attrs={"class": "m-card mm-card--landscape-image mm-card--type-list"})
    for card in cards:
        try:
            link = card.find(
                "a", attrs={"class": "m-card--image-link m-background-image"})
            page_url = "https://fashionista.com" + link['href']
            saved = HttpHelper.fetchAndSave(page_url, "utf-8",
                                            "D:/pages/fashionista.com")
            doclist.append({
                "filename": saved,
                "url": page_url,
                "state": "fetched",
                "domain": "www.fashionista.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)
コード例 #7
0
def test():
    """Push locally saved 'fetched' pages to the crawler queue.

    Reads each page's saved HTML from disk, sends it via sendPage() with a
    routing task, then marks the record 'sended' in Mongo.
    """
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, "pages")
    doclist = []
    total = 1

    while True:
        slist = collection.nextPage(100)
        # FIX: compare to None with `is` (PEP 8).
        if slist is None or len(slist) == 0:
            break
        doclist.extend(slist)

    for page in doclist:
        try:
            if page['state'] != 'fetched':
                continue
            # Saved pages are sharded by the first character of the filename.
            prefix = page['filename'][0:1]
            filepath = DOMAIN + prefix + "/" + page['filename']
            with open(filepath, encoding="utf-8") as fp:
                html = fp.read()
            # FIX: removed the redundant fp.close() — the `with` block
            # already closed the file.
            # dx.com 225, banggood.com 224, tomtop 195, gearbest 256
            task = {
                "id": "id",
                "url": page['url'],
                'topic': 'crawler_data_p123',
                'routingKey': '256'
            }
            sendPage(task, html)
            page['state'] = 'sended'
            collection.updateOne(page)
            print(total)
            total += 1
        except Exception as err:
            print(err)
コード例 #8
0
def test2():
    """Reset every page in the ZDBGearbestCom collection to state 'fetched'."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBGearbestCom", "pages")
    while True:
        slist = collection.nextPage(10)
        # FIX: guard against a None page like the sibling loops do —
        # len(None) would raise TypeError.
        if slist is None or len(slist) == 0:
            break

        for article in slist:
            article['state'] = "fetched"
            collection.updateOne(article)
コード例 #9
0
def parseAllBlog():
    """Parse saved blog HTML files into structured docs and store them.

    For each blog record in state 'PARSED', reads its saved HTML file,
    runs parseBlog() on it, and — when parsing succeeded and at least one
    title variant was found — attaches the parsed fields under 'doc' and
    updates the record in Mongo.

    NOTE(review): the guard checks state == 'PARSED' and then sets state
    back to 'PARSED', so already-processed blogs are re-parsed on every
    run; 'FETCHED' (the state fetchAllBlog() writes) was probably
    intended — confirm.
    """
    try:
        collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                 'blog')
        total = 0
        while True:
            blogList = collection.nextPage(100)
            if len(blogList) == 0:
                break
            for blog in blogList:
                if blog['state'] == 'PARSED':
                    filePath = HttpHelper.getFullPath(HTML_ROOT_PATH,
                                                      blog['fileName'])
                    with open(filePath, 'r', encoding='utf-8') as file:
                        html = file.read()
                    # parseBlog() yields a 12-tuple; all fields are kept in
                    # the stored doc below.
                    found, title, desc, ogTitle, ogDesc, twTitle, twDesc, keywords, content, author, summary, summaryKeywords = parseBlog(
                        html)
                    # Accept the page only when parsing succeeded AND at
                    # least one of the three title variants is present.
                    if found and (title != None or ogTitle != None
                                  or twTitle != None):
                        doc = {
                            'title': title,
                            'ogTitle': ogTitle,
                            'twTitle': twTitle,
                            'desc': desc,
                            'ogDesc': ogDesc,
                            'twDesc': twDesc,
                            'keywords': keywords,
                            'content': content,
                            'author': author,
                            'summary': summary,
                            'summaryKeywords': summaryKeywords
                        }
                        blog['doc'] = doc
                        blog['state'] = 'PARSED'
                        collection.updateOne(blog)
                        print("ok")
                    else:
                        # NOTE(review): one unparsable blog aborts the whole
                        # current batch via this break — confirm intended.
                        print("error")
                        break

                    total += 1
                    print("url=" + blog['url'])
                    print("total=" + str(total))
    except Exception as err:
        print(err)
    finally:
        print("exit")
コード例 #10
0
def fetch_gearbest():
    """Crawl up to 10 listing pages of gearbest health-care products,
    saving each product page to disk and bulk-inserting the records
    into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBGearbestCom", "pages")
    entrance = "https://www.gearbest.com/health-care-c_11689/"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, "lxml")
        proUl = soup.find_all("ul",
                              attrs={"class": "clearfix js_seachResultList"})
        for proList in proUl:
            li = proList.find_all("li")
            for i in li:
                try:
                    photo = i.find_all(
                        "a",
                        attrs={
                            "class":
                            "icon-loading gbGoodsItem_thumb js-selectItemd"
                        })
                    for j in photo:
                        url = j['href']
                        filename = HttpHelper.fetchAndSave(
                            url, "utf-8", "D:/pages/gearbest.com")
                        doc.append({
                            "filename": filename,
                            "url": url,
                            "state": "fetched"
                        })
                        print(total)
                        total += 1
                except Exception as err:
                    print(err)
        a = soup.find_all("a", attrs={"class": "pageNext"})
        # BUG FIX: find_all() returns a (possibly empty) list, never None,
        # so the old `a is None` check could not fire and a[-1] raised
        # IndexError on the last page. Test for emptiness instead.
        if not a:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        entrance = a[-1]['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc:
        collection.insertMany(doc)
コード例 #11
0
def Resolve():
    """Parse every locally saved page in state 'fetched' into an article
    document and write it back to Mongo with state 'pass'."""
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, "pages")
    doclist = []
    while True:
        slist = collection.nextPage(100)
        # FIX: compare to None with `is` (PEP 8).
        if slist is None or len(slist) == 0:
            break
        doclist.extend(slist)

    for article in doclist:
        try:
            if article['state'] != "fetched":
                continue
            # Saved pages are sharded by the first character of the filename.
            prefix = article['filename'][0:1]
            filepath = DOMAIN + prefix + "/" + article['filename']
            with open(filepath, encoding="utf-8") as fp:
                html = fp.read()
            # FIX: removed the redundant fp.close() — the `with` block
            # already closed the file.
            p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12 = parseBlog(
                html)
            md5 = CryptHelper.getMD5Hash(article['url'])
            key = UrlHelper.getHostPath(article['url'])[1]
            # Prefer p3, fall back to p11 (positional fields of parseBlog —
            # exact semantics not visible here).
            excerpt = p3 if p3 else p11
            doc = {
                "_id": article['_id'],
                "filename": article['filename'],
                "url": article['url'],
                "state": "pass",
                "domain": article['domain'],
                'md5': md5,
                'title': p2,
                'excerpt': excerpt,
                'content': p9,
                'author': article['domain'],
                'categories': CATEGORY,
                'tags': "",
                'status': 0,
                'key': key
            }
            collection.updateOne(doc)
            # FIX: removed doc.clear() — doc is rebuilt each iteration and
            # clearing it after the update was dead code.

        except Exception as err:
            print(err)
    '''for i in doc:
コード例 #12
0
ファイル: article.py プロジェクト: yang11037/Pycharm_Project
def importAllArticle(MONGO_HOST, MONGO_DATABASE_NAME, IMPORT_URL):
    """POST every 'pass' article to IMPORT_URL and mark it 'sended'.

    Args:
        MONGO_HOST: Mongo server host.
        MONGO_DATABASE_NAME: database holding the 'pages' collection.
        IMPORT_URL: HTTP endpoint that accepts a list of article docs.
    """
    try:
        articleCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                        'pages')
        total = 0
        while True:
            articleList = articleCollection.nextPage(10)
            # FIX: also guard a None page, as sibling loops do.
            if articleList is None or len(articleList) == 0:
                break

            total += len(articleList)
            print("total=" + str(total))
            newArticleList = []
            for article in articleList:
                if article['state'] != "pass":
                    continue
                doc = {
                    'id': article['md5'],
                    'title': article['title'],
                    'excerpt': article['excerpt'],
                    'content': "",
                    'author': article['domain'],
                    'domain': article['domain'],
                    'categories': article['categories'],
                    'tags': article['tags'],
                    'url': article['url'],
                    'status': article['status'],
                    'key': article['key'],
                }
                newArticleList.append(doc)

                # NOTE: each article is posted individually (the list is
                # cleared right after) even though the endpoint takes a list.
                errorCode, rsp = HttpHelper.post(IMPORT_URL, newArticleList)
                # FIX: `is not None` instead of `!= None` (PEP 8).
                if errorCode == "OK" and rsp is not None and 'isOk' in rsp and rsp[
                        'isOk'] == True:
                    print("import article ok")
                else:
                    print("import article error")
                newArticleList.clear()
                article['state'] = "sended"
                articleCollection.updateOne(article)

    except Exception as err:
        print(err)
    finally:
        print("exit")
コード例 #13
0
def test3():
    """Build a 'contenthtml' blob from each completed record's attribute
    list and advance its state to 'built'."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg",
                             "supplement_copy", "url")
    doclist = []
    total = 0

    while True:
        slist = collection.nextPage(100)
        # FIX: compare to None with `is` (PEP 8).
        if slist is None or len(slist) == 0:
            break
        doclist.extend(slist)

    for i in doclist:
        if i['state'] != "completed":  # only touch data processed by test2()
            continue

        # BUG FIX: the div's class attribute was missing its closing quote
        # ('<div class="div-content>'), producing broken HTML. Also build
        # via join instead of repeated string concatenation.
        parts = []
        for j in i['attrlist']:
            parts.append('<h3 class="h3-subtitle">' + j['subtitle'] + '</h3><br/>'
                         + '<div class="div-content">' + j['innerhtml'] + '</div><br/>')
        contenthtml = "".join(parts)

        doc = {
            "_id": i['_id'],
            "cat": i['cat'],
            "fileName": i['fileName'],
            "url": i['url'],
            "host": i['host'],
            "state": "built",
            "title": i['title'],
            "content": i['content'],
            "description": i['description'],
            "attrlist": i['attrlist'],
            "contenthtml": contenthtml
        }

        collection.updateOne(doc)
        # FIX: removed doc.clear() — doc is rebuilt every iteration.

        total += 1
        print(total)
コード例 #14
0
def fetch_banggood():
    """Crawl up to 10 pages of banggood indoor-lighting listings, saving
    each product page and bulk-inserting the records into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBBgoodCom", "pages")
    entrance = "https://www.banggood.com/Wholesale-Indoor-Lighting-c-2514.html"
    pageNum = 1
    total = 1
    doc = []

    while pageNum < 11 and entrance is not None:
        page_html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(page_html, 'lxml')
        goods = soup.find("ul", attrs={"class": "goodlist_1"})
        for item in goods.find_all("li"):
            try:
                for wrap in item.find_all("span", attrs={"class": "img"}):
                    link = wrap.find("a")
                    product_url = link['href']
                    saved = HttpHelper.fetchAndSave(
                        product_url, 'utf-8', 'D:/pages/banggood.com')
                    doc.append({
                        "filename": saved,
                        "url": product_url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
            except Exception as err:
                print(err)
        pager = soup.find("div", attrs={"class": "page_num"})
        nxt = pager.find("a", attrs={"id": "listNextPage"})
        # No next-page link means we reached the end of the listing.
        entrance = nxt['href'] if nxt is not None else None
        print("NO." + str(pageNum))
        pageNum += 1
    if doc:
        collection.insertMany(doc)
コード例 #15
0
def fetch_tomtop():
    """Crawl up to 10 pages of tomtop vehicle-infotainment listings,
    saving each product page and bulk-inserting the records into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTomtopCom2", "pages")
    entrance = "https://www.tomtop.com/vehicle-infotainment-11035/"
    pageNum = 1
    total = 1
    doc = []

    while pageNum < 11 and entrance is not None:
        page_html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(page_html, 'lxml')
        products = soup.find("ul", attrs={"class": "lbBox categoryProductList"})
        for item in products.find_all("li"):
            try:
                for wrap in item.find_all("div", attrs={"class": "productImg"}):
                    link = wrap.find("a")
                    product_url = "https://www.tomtop.com" + link['href']
                    saved = HttpHelper.fetchAndSave(product_url, 'utf-8',
                                                    'D:/pages/tomtop.com')
                    doc.append({
                        "filename": saved,
                        "url": product_url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
            except Exception as err:
                print(err)
        pager = soup.find("ul", attrs={"class": "lbBox pagingWarp"})
        nxt = pager.find("li", attrs={"class": "lineBlock pageN pageClick"})
        if nxt is None:
            # No clickable "next" item: end of the listing.
            entrance = None
        else:
            entrance = "https://www.tomtop.com" + nxt.find("a")['href']
        print("NO." + str(pageNum))
        pageNum += 1
    if doc:
        collection.insertMany(doc)
コード例 #16
0
def fetch_whowhatwear():
    """Crawl whowhatwear.com trend articles from a Selenium-rendered page
    and record the saved pages in Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBWhowhatwearCom",
                             "pages")
    entrance = "https://www.whowhatwear.com/channel/trends"
    driver = webdriver.Chrome(
        'C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
    driver.get(entrance)
    print("waiting for u")
    doclist = []  # set a breakpoint here (let the page finish rendering)
    total = 1
    rendered = driver.page_source.encode('utf-8')
    soup = BeautifulSoup(rendered, "lxml")
    river = soup.find(
        "div",
        attrs={
            "class":
            "card__group card__group--river card__group--river-channel"
        })
    cards = river.find_all(
        "div",
        attrs={
            "class": "card__item card__item--river card__item--river-channel"
        })
    for card in cards:
        try:
            link = card.find("a")
            page_url = "https://www.whowhatwear.com" + link['href']
            saved = HttpHelper.fetchAndSave(page_url, "utf-8",
                                            "D:/pages/whowhatwear.com")
            doclist.append({
                "filename": saved,
                "url": page_url,
                "state": "fetched",
                "domain": "www.whowhatwear.com"
            })
            print(total)
            total += 1
        except Exception as err:
            print(err)
    collection.insertMany(doclist)
コード例 #17
0
def fetchAllBlog():
    """Download every blog in state 'CLOSED' and flip it to 'FETCHED'
    (or leave it 'CLOSED' when the download fails)."""
    try:
        # NOTE(review): catCollection is never used below; kept in case
        # constructing it has a needed side effect — confirm before removing.
        catCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                    'category')
        collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                 'blog')
        total = 0
        while True:
            blogList = collection.nextPage(100)
            # FIX: also guard a None page, as sibling loops do.
            if blogList is None or len(blogList) == 0:
                break
            for blog in blogList:
                if blog['state'] != 'CLOSED':
                    continue
                fileName = HttpHelper.fetchAndSave(blog['url'], "utf-8",
                                                   HTML_ROOT_PATH)
                # FIX: idiomatic truthiness replaces `!= None and len() > 0`.
                if fileName:
                    blog['fileName'] = fileName
                    blog['state'] = "FETCHED"
                else:
                    blog['state'] = "CLOSED"
                collection.updateOne(blog)
                total += 1
                print("url=" + blog['url'])
                print("total=" + str(total))

    except Exception as err:
        print(err)
    finally:
        print("exit")
コード例 #18
0
def amazonfetch_detail():
    """Scrape title, brand link and description list from each 'fetched'
    Amazon product page and mark the record 'pass' in Mongo."""
    doclist = []
    total = 1
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "bloodglucosemeter")

    while True:
        slist = collection.nextPage(100)
        # FIX: compare to None with `is` (PEP 8).
        if slist is None or len(slist) == 0:
            break
        doclist.extend(slist)

    for x in doclist:
        if x['state'] != "fetched":
            continue

        try:
            status, html = HttpHelper.fetch(x['url'])
            soup = BeautifulSoup(html, "lxml")

            # FIX: title/href previously leaked across iterations (unbound
            # NameError on the first page missing them, stale values after);
            # initialize per record.
            title = ""
            for node in soup.find_all("span", attrs={"id": "productTitle"}):
                title = node.text.strip()

            href = ""
            for node in soup.find_all("a", attrs={"id": "bylineInfo"}):  # bylineInfo brand
                href = node['href']
                # FIX: startswith replaces the regex match for a leading '/'.
                if href.startswith("/"):
                    href = "http://www.amazon.com" + href

            # FIX: doc is rebuilt per record. Previously, when a page had no
            # description list, the stale (or cleared, empty) doc from an
            # earlier iteration was written back — skip the update instead.
            doc = None
            description = soup.find_all("ul", attrs={"class": "a-unordered-list a-vertical a-spacing-none"})
            for i in description:
                doc = {"_id": x['_id'], "brand": x['brand'], "url": x['url'],
                       "state": "pass", "price": x['price'], "title": title,
                       "brand_a": href, "inner_des": str(i)}
            if doc is not None:
                collection.updateOne(doc)
        except Exception as err:
            print(err)
            continue
        print(total)
        total += 1
コード例 #19
0
def fetch_dx():
    """Crawl up to 10 listing pages of dx.com networking products, saving
    each product page to disk and bulk-inserting the records into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBDxCom", "pages")
    entrance = "http://www.dx.com/c/computer-office-399/networking-314"
    pageNum = 1
    total = 1
    doc = []
    while pageNum < 11 and entrance is not None:
        html = HttpHelper.fetch(entrance)[1]
        soup = BeautifulSoup(html, "lxml")
        proUl = soup.find_all("ul", attrs={"class": "productList subList"})
        for proList in proUl:
            li = proList.find_all("li", attrs={"class": "c_cates"})
            for i in li:
                try:
                    photo = i.find("div", attrs={"class": "photo"})
                    url = "https://www.dx.com" + photo.find("a")['href']
                    filename = HttpHelper.fetchAndSave(url, "utf-8",
                                                       "D:/pages/dx.com")
                    doc.append({
                        "filename": filename,
                        "url": url,
                        "state": "fetched"
                    })
                    print(total)
                    total += 1
                except Exception as err:
                    print(err)
        a = soup.find_all("a", attrs={"class": "next"})
        # BUG FIX: find_all() returns a (possibly empty) list, never None,
        # so the old `a is None` check could not fire and a[-1] raised
        # IndexError on the last page. Test for emptiness instead.
        if not a:
            entrance = None
            print("NO." + str(pageNum))
            pageNum += 1
            continue
        entrance = "https://www.dx.com" + a[-1]['href']
        print(entrance)
        print(pageNum)
        pageNum += 1
    if doc:
        collection.insertMany(doc)
コード例 #20
0
ファイル: article.py プロジェクト: yang11037/Pycharm_Project
def updateAllArticle(MONGO_HOST, MONGO_DATABASE_NAME, IMPORT_URL):
    """Re-fetch every article's URL (up to 2 attempts each) and record its
    reachability in 'status' (0 = ok, -1 = failed) plus an 'updateTime'.

    Args:
        MONGO_HOST: Mongo server host.
        MONGO_DATABASE_NAME: database holding the 'article' collection.
        IMPORT_URL: unused here — presumably kept for signature parity with
            importAllArticle; confirm.
    """
    try:
        articleCollection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                                        'article')
        total = 0
        while True:
            articleList = articleCollection.nextPage(10)
            # FIX: also guard a None page, as sibling loops do.
            if articleList is None or len(articleList) == 0:
                break

            # FIX: removed the dead `newArticleList = []` — never used.
            for article in articleList:
                total += 1
                print("total=" + str(total))

                url = article['url']
                retry = 0
                while True:
                    retry += 1
                    if retry > 2:
                        break
                    statusCode, html = HttpHelper.fetch(url)
                    # FIX: idiomatic truthiness replaces `!= None and len() > 0`.
                    if html:
                        article['status'] = 0
                        # Check title, TODO
                        print("update article ok, retry=" + str(retry) +
                              ", url=" + url)
                        break
                    else:
                        article['status'] = -1
                        print("update article error, retry=" + str(retry) +
                              ", url=" + url)
                        time.sleep(1)
                article['updateTime'] = datetime.now()
                articleCollection.updateOne(article)

    except Exception as err:
        print(err)
    finally:
        print("exit")
コード例 #21
0
def fetch_fashionbeans():
    """Crawl fashionbeans.com mens-hairstyles articles (up to ~200 items),
    saving each article page and bulk-inserting the records into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBFashionbeansCom",
                             "pages")
    entrance = "http://www.fashionbeans.com/category/mens-hairstyles/"
    doclist = []
    total = 1
    num = 1
    while total < 200 and entrance is not None:
        response = HttpHelper.fetch(entrance)
        soup = BeautifulSoup(response[1], "lxml")

        body = soup.find("div", attrs={"id": "catmainBody"})
        for card in body.find_all("div", attrs={"class": "catArticles"}):
            try:
                link = card.find("a", attrs={"class": "left relative"})
                page_url = link['href']
                saved = HttpHelper.fetchAndSave(
                    page_url, "utf-8", "D:/pages/fashionbeans.com")
                doclist.append({
                    "filename": saved,
                    "url": page_url,
                    "state": "fetched",
                    "domain": "www.fashionbeans.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)

        nxt = soup.find("a", attrs={"class": "nextLink right"})
        print("页数:" + str(num))
        if nxt is None:
            entrance = None
            continue
        num += 1
        entrance = nxt['href']
    if doclist != []:
        collection.insertMany(doclist)
コード例 #22
0
def initCat():
    """Seed the 'category' collection from a fixed Chinese-to-English
    category-name mapping, deriving a URL slug from each English name."""
    catDict = {
        '糖尿病': 'diabetes',
        '肺癌': 'Lung cancer',
        '风湿': 'Rheumatism',
        '牛皮癣': 'Psoriasis',
        '肺梗阻': 'Pulmonary obstruction',
        '失禁': 'Incontinence',
        'aarp补充医疗': 'aarp supplementary medical treatment',
        '其他疾病': 'other illnesses',
        '乳腺癌': 'Breast cancer',
        '多发性硬化症': 'Multiple sclerosis',
        '哮喘': 'Asthma',
        '药瘾': 'Drug addiction',
        '酒瘾': 'Alcoholism',
        #         '小企业融资':'Small Business Financing',
        #         'business phone':'business phone',
        #         'network security':'network security',
        #         'cloud':'cloud',
        #         '商业软件':'commercial software',
        #         '小企业软件':'Small Business Software',
        #         '财务软件':'financial software',
        #         '税务软件':'Tax software',
        #         '小企业安卓财务app':'Small Business Android Finance app',
        #         '薪酬管理软件':'Compensation Management Software',
        #         'SAP':'SAP',
        #         '补丁管理软件':'Patch management software',
        #         '网络服务':'Internet service',
        #         '域名':'Domain name',
        #         '虚拟服务器':'virtual server'
    }
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME,
                             'category')
    for cname, ename in catDict.items():
        collection.insertOne({
            "ename": ename,
            "cname": cname,
            "slug": ename.lower().replace(" ", "-"),
        })
コード例 #23
0
def fetch_digitaltrends():
    """Crawl digitaltrends.com TV reviews (up to ~120 items), saving each
    review page and bulk-inserting the records into Mongo."""
    collection = MongoHelper("172.16.40.140", 27017, "ZDBDigitaltrendsCom",
                             "pages")
    entrance = "https://www.digitaltrends.com/tv-reviews/"
    doclist = []
    total = 1
    while total < 120 and entrance is not None:
        html = HttpHelper.fetch(entrance)
        soup = BeautifulSoup(html[1], "lxml")

        div = soup.find("div", attrs={"class": "m-products"})
        item = div.find_all("div", attrs={"class": "item"})
        for i in item:
            try:
                h3 = i.find("h3", attrs={"class": "title"})
                url = h3.find("a")['href']
                filename = HttpHelper.fetchAndSave(
                    url, "utf-8", "D:/pages/digitaltrends.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.digitaltrends.com"
                })
                print(total)
                total += 1
            except Exception as err:
                print(err)

        a = soup.find_all("a", attrs={"class": "next page-numbers"})
        # BUG FIX: find_all() returns a (possibly empty) list, never None,
        # so the old `a is None` check could not fire and a[-1] raised
        # IndexError on the last page. Test for emptiness instead.
        if not a:
            entrance = None
            continue
        entrance = a[-1]['href']
    if doclist:
        collection.insertMany(doclist)
コード例 #24
0
def fetch_cnet():
    """Crawl cnet.com tablet product-listing pages, save each product page
    to disk, and record the fetched pages in MongoDB.

    Stops once ~80 pages are saved or when there is no "next" pagination
    link on the current listing page.
    """
    collection = MongoHelper("172.16.40.140", 27017, "ZDBCnetCom", "pages")
    entrance = "https://www.cnet.com/topics/tablets/products/"
    doclist = []
    total = 1
    while total < 80 and entrance is not None:
        # HttpHelper.fetch returns [statusCode, html]; index 1 is the body.
        html = HttpHelper.fetch(entrance)
        soup = BeautifulSoup(html[1], "lxml")

        results = soup.find("section", attrs={"id": "dfllResults"})
        products = results.find_all(
            "section", attrs={"class": "col-3 searchItem product "})
        for product in products:
            try:
                link = product.find("a", attrs={"class": "imageWrap"})
                url = "https://www.cnet.com" + link['href']
                filename = HttpHelper.fetchAndSave(url, "utf-8",
                                                   "D:/pages/cnet.com")
                doclist.append({
                    "filename": filename,
                    "url": url,
                    "state": "fetched",
                    "domain": "www.cnet.com"
                })
                print(total)
                total += 1
            except Exception as err:
                # Best-effort crawl: skip items with unexpected markup.
                print(err)

        # BUG FIX: find_all() returns a (possibly empty) list, never None,
        # so the old `if a is None` check could never stop pagination and
        # `a[-1]` raised IndexError once the last page was reached.
        pagination = soup.find_all("a", attrs={"class": "next"})
        if not pagination:
            entrance = None
            continue
        entrance = "https://www.cnet.com" + pagination[-1]['href']
    if doclist:
        collection.insertMany(doclist)
コード例 #25
0
def createAllPost():
    """Publish every 'built' document in the supplement_copy collection via
    createPost(), then rewrite the document in MongoDB with the returned
    post ID and state 'posted'. Prints progress; errors are printed at the
    top-level boundary rather than raised.
    """
    try:
        pdCollection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg", "supplement_copy")

        total = 0
        while True:
            pdList = pdCollection.nextPage(10)
            if pdList is None or len(pdList) == 0:
                break

            for pd in pdList:
                # Only documents that finished the build stage get posted.
                if pd['state'] != 'built':
                    continue

                newID = createPost(pd)
                if newID is not None:
                    # Rewrite the full document: attach the new post ID and
                    # advance its state to 'posted'.
                    doc = {"_id": pd['_id'], "ID": newID, "cat": pd['cat'],
                           "fileName": pd['fileName'], "url": pd['url'],
                           "host": pd['host'], "state": "posted",
                           "title": pd['title'], "content": pd['content'],
                           "description": pd['description'],
                           "attrlist": pd['attrlist'],
                           "contenthtml": pd['contenthtml']}
                    print(doc['ID'])
                    pdCollection.updateOne(doc)
                    doc.clear()
                    print("create post ok")
                else:
                    print("create post error")

                total += 1
                print('total=' + str(total) + ', title=' + pd['title'])

        # Fixed typo in the original status message ("Creawte").
        print('Create all posts ok')

    except Exception as err:
        print(err)
コード例 #26
0
def test():
    """Scrape the drugs.com alphabetical index page and store one document
    per drug link — {"url": href, "title": link text} — in MongoDB.
    """
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "drugs",
                             "url")
    doclist = []
    print("dbcom")

    html = HttpHelper.fetch("https://www.drugs.com/alpha/a5.html")
    # Explicit "lxml" parser for consistency with the other fetchers.
    soup = BeautifulSoup(html[1], "lxml")
    # print(soup)
    ul_list = soup.find_all('ul', attrs={"class": "doc-type-list"})
    # ul_list = soup.find_all('ul', attrs={"class": re.compile('doc-type*')})
    # print(ul_list)
    for ul in ul_list:
        for link in ul.find_all('a'):
            title = link.text
            href = link['href']
            # print(title)
            # print(href)
            print('\n')
            # BUG FIX: the original stored the link text under "url" and the
            # href under "title" — the keys were swapped.
            doclist.append({"url": href, "title": title})
        print(doclist)
    # BUG FIX: insertMany was inside the <ul> loop, re-inserting the growing
    # doclist once per list and producing duplicates; insert once at the end.
    collection.insertMany(doclist)
コード例 #27
0
def dump():
    """Paging sanity check: iterate the 'keyword' collection twice, with a
    cursor reset between passes, printing running document counts so both
    passes can be compared by eye.
    """
    collection = MongoHelper(MONGO_HOST, 27017, MONGO_DATABASE_NAME, 'keyword')
    total = 0
    while True:
        # Renamed from `list` — don't shadow the builtin.
        page = collection.nextPage(100)
        if page is None or len(page) == 0:
            break
        total += len(page)
        print("total=" + str(total))
    print("total=" + str(total))

    # Reset the paging cursor and confirm a second pass sees the same count.
    collection.resetStartId()
    total = 0
    while True:
        page = collection.nextPage(100)
        if page is None or len(page) == 0:
            break
        total += len(page)
        print("second total=" + str(total))
    print("second total=" + str(total))
コード例 #28
0
def test_chromedriver():
    """Drive Chrome through product pages recorded in MongoDB (state ==
    'pass'), scrape gallery images, price, and description from the rendered
    page, append a product row to ./product.csv, and mark each page 'posted'.

    NOTE(review): launches a fresh Chrome instance per page via a hard-coded
    chromedriver path, and uses the legacy find_elements_by_tag_name API.
    """
    try:
        total = 1
        doclist = []
        collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "bloodglucosemeter")
        # Page the whole collection into memory first.
        while True:
            slist = collection.nextPage(100)
            if slist == None or len(slist) == 0:
                break
            for i in slist:
                doclist.append(i)

        for page in doclist:
            print(total)
            total += 1
            # Only process pages previously vetted as 'pass'.
            if page['state'] != 'pass':
                continue

            driver = webdriver.Chrome('C:/Program Files (x86)/Google/Chrome/Application/chromedriver.exe')
            driver.get(page['url'])
            print("wait for u")

            # Click every 40x40 image at x == 19 or 71 — presumably the
            # gallery thumbnails, so full-size images load. TODO confirm.
            size = driver.find_elements_by_tag_name("img")
            for i in size:
                if i.location['x'] == 19 or i.location['x'] == 71:
                    if i.size == {'height': 40, 'width': 40}:
                        ActionChains(driver).move_to_element(i).click(i).perform()

            html = driver.page_source.encode('utf-8')
            driver.close()
            soup = BeautifulSoup(html, "lxml")
            with open("./product.csv", "a+", newline='', encoding="utf-8") as c:
                writer = csv.writer(c, dialect='excel')
                list = soup.find_all("div", attrs={"class": "imgTagWrapper"})

                # Collect all gallery image URLs into one comma-separated string.
                img = ""
                for i in list:
                    imge = i.find_all("img")
                    for j in imge:
                        img = img + j['src'] + ","
                img = img[0:-1]  # drop the trailing comma

                # Price: text of the priceblock_ourprice span (last match wins).
                price = ""
                pricetxt = soup.find_all("span", attrs={"id": "priceblock_ourprice"})
                for i in pricetxt:
                    price = i.text
                    price = price.strip()

                # Description: keep the last matching productDescription div
                # (a bs4 Tag) and serialize it below.
                des = ""
                text2 = soup.find_all("div", attrs={"id": "productDescription"})
                '''div,class:aplus-v2 desktop celwidget  
                   div id: productDescription
                '''
                for i in text2:
                    des = i
                des = des.encode("utf-8").decode()
                des = des.strip()
                des_html = "<div class=\"productdescription\">" + des +"</div>"

                img = img.encode("utf-8").decode()
                img = img.strip()

                # Short-description HTML block linking back to the brand page.
                sdes = page['inner_des']
                sdes = "<div class = \"short-des\">" + "<a href = \"" + page['brand_a'] + "\">" + \
                       "<font size=1 color=blue>" + page['brand'] + "</font></a><br>About the product<br>"\
                       + sdes + "</div>"

                # Fixed-layout product-import CSV row (mostly blank columns).
                writer.writerow(['', 'simple', '', page['title'], '1', '0', 'visible', sdes, des_html, '', '',
                                 'taxable', '', '1', '', '0', '0', '', '', '', '', '1', '', '', price,
                                 'blood glucose meter', '', '', img, '', '', '', '', '', '', '', '', '0'])
                print("csv ok")

                # Persist the scraped details and advance the state to 'posted'.
                doc = {"_id": page['_id'], "brand": page['brand'], "url": page['url'], "state": "posted",
                            "price": price, "title": page['title'], "brand_a": page['brand_a'],
                            "inner_des": page['inner_des'], "product_des": des}
                collection.updateOne(doc)
                doc.clear()
                print("mongo ok")
            # Redundant: the `with` block already closed the file.
            c.close()

    except Exception as err:
        print(err)
コード例 #29
0
def test2():
    '''
    Parse fetched supplement pages (state == "FETCHED") from the
    supplement_copy collection: split each page into
    (subtitle, innerhtml, text) sections, build an NLP summary, and write
    the document back with state "completed".

    The first loop below extracts every record from the collection into
    doclist.
    :return:
    '''
    collection = MongoHelper("172.16.40.140", 27017, "ZDBMedlineplusOrg",
                             "supplement_copy", "url")
    nlp = NLPHelper()
    doclist = []
    doc = []  # accumulates the section entries for the new 'attrlist' key
    while True:
        slist = collection.nextPage(100)
        if slist == None or len(slist) == 0:
            break
        for i in slist:
            doclist.append(i)
    # print(doclist)

    total = 0
    # The loop below parses every fetched URL.
    '''
    以下过程为解析每一个url
    '''
    for i in doclist:

        if i['state'] != "FETCHED":  # only process documents whose state is FETCHED
            continue

        # Source site #1 (cat == 1): content lives in <section> blocks.
        if i['cat'] == 1:
            html = HttpHelper.fetch(i['url'])
            soup = BeautifulSoup(
                html[1])  # fetch returns [statusCode, html]; index 1 is the body
            slist = soup.find_all("section")
            content = ""  # plain-text concatenation of all section bodies
            for j in slist:  # j is a <section>
                hlist = j.find_all("h2")  # every h2 is a sub-heading
                for x in hlist:  # x is the sub-heading element
                    title = x.text

                tlist = j.find_all("div", attrs={"class": "section-body"})
                for y in tlist:  # y is the body for the sub-heading above
                    doc.append({
                        "subtitle": title,
                        "innerhtml": str(y),
                        "text": y.text
                    })
                    content += y.text  # page content is the sum of every section's text

            description = nlp.getSummary(content, wordCount=20)  # build the description
            # doc2 overwrites the current document in the collection.
            '''
            加入要覆盖当前collection的doc
            '''
            doc2 = {
                "_id": i['_id'],
                "cat": i['cat'],
                "fileName": i['fileName'],
                "url": i['url'],
                "host": i['host'],
                "state": "completed",
                "title": i['title'],
                "content": content,
                "description": description,
                "attrlist": doc
            }
            collection.updateOne(doc2)
            doc.clear()
            doc2.clear()  # clear both buffers after each update
            total += 1
            print(total)  # running count of completed documents

        # Source site #2 (cat == 2): same idea, different markup.
        elif i['cat'] == 2:
            html = HttpHelper.fetch(i['url'])
            soup = BeautifulSoup(html[1])
            slist = soup.find_all(
                "div", attrs={"class": re.compile('field field-name-body*')})
            content = ""
            for j in slist:
                hlist = j.find_all("h2")
                titlearr = []  # sub-headings in order, so they pair 1:1 with each innerhtml block
                for x in hlist:  # x is the sub-heading element
                    title = x.text
                    titlearr.append(title)
                tlist = j.find_all("ul")
                index = 0
                for y in tlist:
                    if index > len(titlearr) - 1:  # guard against index overrun
                        break
                    doc.append({
                        "subtitle": str(titlearr[index]),
                        "innerhtml": str(y),
                        "text": y.text
                    })
                    content += y.text
                    index = index + 1
                titlearr.clear()
            #print(content)
            description = nlp.getSummary(content, wordCount=20)
            doc2 = {
                "_id": i['_id'],
                "cat": i['cat'],
                "fileName": i['fileName'],
                "url": i['url'],
                "host": i['host'],
                "state": "completed",
                "title": i['title'],
                "content": content,
                "description": description,
                "attrlist": doc
            }
            collection.updateOne(doc2)
            doc.clear()
            doc2.clear()
            # print(j)

            total += 1
            print(total)
コード例 #30
0
def amazonfetch():
    """Walk Amazon search-result pages for blood-pressure monitors and store
    up to 90 products priced $20-$50 with more than 100 reviews in MongoDB.

    Each stored doc: {'brand', 'url', 'state': 'fetched', 'price'}.
    NOTE(review): relies on result_N ids and sx-price-whole spans from the
    2018-era result-page markup — presumably stale; verify before reuse.
    """
    total = 1  # products accepted so far
    goods = 1  # products examined so far
    url = "https://www.amazon.com/s/ref=sr_as_oo?rh=i%3Aaps%2Ck%3Ablood+pressure+monitor&keywords=blood+pressure+mon" \
          "itor&ie=UTF8&qid=1527130301"
    collection = MongoHelper("172.16.40.140", 27017, "ZDBTestCom", "bloodglucosemeter")

    '''excel = xlwt.Workbook()
    sheet = excel.add_sheet("Blood glucose meter")
    content = "brand"
    sheet.write(0,0,content)
    content = "url"
    sheet.write(0,1,content)
    row = 1'''
    doc = []

    while url != None:
        statuscode, html = HttpHelper.fetch(url)
        soup = BeautifulSoup(html)
        # Drop <script> tags so they don't pollute text extraction.
        for s in soup('script'):
            s.extract()
        #print(soup.prettify())
        #return

        # Result items carry ids like result_0 .. result_99.
        li_all = soup.find_all("li", attrs={"id":re.compile("^result_\d{1,2}")})
        #print(li_all[3])
        #return
        for li in li_all:
            print("正在检查第"+ str(goods) + "件商品")
            goods += 1
            # Presumably a sponsored/ad widget header — skip such items. TODO confirm
            flag = li.find_all("p",attrs={"class":"acs-mn2-midwidgetHeader"})
            if flag != []:
                print(flag)
                continue
            #print("flagok")
            a = li.find_all("a", attrs={"class": re.compile("^a-link-normal s-access-detail-page.*")})
            if a == []:
                continue
            #print("aok")
            # Product detail-page link (last match wins).
            for i in a:
                url2 = i['href']
            branddiv = li.find_all("div", attrs={"class": "a-row a-spacing-none"})
            if branddiv == []:
                continue
            #print("brandok")
            # Concatenate the secondary spans, then drop the first three
            # characters — presumably a leading "by " prefix; verify.
            brand = ""
            for i in branddiv:
                span = i.find_all("span", attrs={"class": "a-size-small a-color-secondary"})
                if span == []:
                    continue
                #print("spanok")
                for j in span:
                    brand += j.text
            brand = brand[3:]
            # Whole-dollar price text (last match wins).
            p = li.find_all("span", attrs={"class": "sx-price-whole"})
            if p == []:
                continue
            for i in p:
                price = i.text
            if price == []:
                continue
            #print("priceok")
            div = li.find_all("div", attrs={"class":"a-row a-spacing-mini"})
            if div == []:
                continue
            #print("divok")
            # Review count is the text of the review-link anchor.
            for j in div:
                comment_all = j.find_all("a", attrs={"class":"a-size-small a-link-normal a-text-normal"})
                if comment_all == []:
                    continue
                #print("comok")
                for i in comment_all:
                    comment = i.text

            print("price的类型是:")
            print(type(price))
            print(type(comment))
            # Strip thousands separators before the int() conversions below.
            price = price.replace(",", "")
            comment = comment.replace(",", "")
            print(price)
            print(comment)

            try:
                if isinstance(price, str):
                    price1 = int(price)
                if isinstance(comment, str):
                    comment1 = int(comment)
            except Exception as err:
                # Non-numeric price/review text — skip this item.
                print(err)
                continue

            # Keep mid-priced ($20-$50) products with more than 100 reviews.
            if price1 > 20 and price1 < 50 and comment1 > 100:
                print(brand)
                print("No." + str(total))
                total +=1
                url3 = url2
                # Relative hrefs need the amazon.com origin prepended.
                if re.match("^/{1}.*", url2):
                    url3 = "https://www.amazon.com" + url2
                '''sheet.write(row,0,brand)
                sheet.write(row,1,url3)
                row += 1'''
                doc.append({'brand': brand, 'url': url3, 'state': 'fetched', 'price': price + ".99"})

            # Stop early once 90 products have been collected.
            if total > 90:
                print("completed")
                #excel.save("D:/电商/test.xls")
                collection.insertMany(doc)
                return


        # Follow the "next page" link; stop when there isn't one.
        next_page = soup.find_all("a",attrs={"id":"pagnNextLink"})
        if next_page == []:
            url = None
            continue
        for i in next_page:
            if re.match("^/{1}.*", i['href']):
                url = "https://www.amazon.com"+ i['href']
            else:
                url = i['href']

    # Fell out of the loop before reaching 90 products — store what we have.
    print("not enough 90")
    # excel.save("D:/电商/test.xls")
    collection.insertMany(doc)