Code example #1
def insertdb(data):
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('Insert finished ' + downloadTime)
    except Exception:
        # A re-inserted document violates the unique index and lands here.
        print('Duplicate insert ' + downloadTime)
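All of these insertdb helpers assume a module-level PyMongo collection and a list of write operations such as InsertOne. A minimal setup sketch for trying them out (the connection string, database/collection names, and the unique index on only_id are assumptions for illustration, not from the original code):

import datetime

from pymongo import InsertOne, MongoClient

# Hypothetical connection details; adjust to your environment.
client = MongoClient('mongodb://localhost:27017')
collection = client['news']['articles']
# A unique index is what makes a re-insert fail as a "duplicate" above.
collection.create_index('only_id', unique=True)

downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
insertdb([InsertOne({'only_id': 'demo-1', 'title': 'demo',
                     'download_time': downloadTime})])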
Code example #2
def insertdb(urls, title, aa, pub_time, downloadTime, aid, onlyIds):
    site = "雪球网"
    siteId = 1048420
    push_state = 0
    data = []
    data.append(
        InsertOne({
            "url": urls,
            "title": title,
            "aid": aid,
            "content": aa,
            "site": site,
            "pub_time": pub_time,
            "push_state": push_state,
            "site_id": siteId,
            "download_Time": downloadTime,
            "only_id": onlyIds,
        }))
    # Note: this overwrites the downloadTime argument stored in the document above.
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('Insert finished ' + downloadTime)
    except Exception:
        print('Duplicate insert ' + downloadTime)
Code example #3
def insertdb(data):
    try:
        collection.bulk_write(data)
        print('Insert finished')
    except Exception:
        # Duplicate documents are expected; just report and move on.
        print('Duplicate insert')
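The broad except Exception above also swallows real write failures. If only duplicates should be tolerated, PyMongo's BulkWriteError can be inspected for duplicate-key errors (code 11000), and ordered=False lets the rest of a batch succeed past a duplicate. A sketch of that variant (the name insertdb_unordered is ours, not from the original):

from pymongo.errors import BulkWriteError

def insertdb_unordered(data):
    try:
        # ordered=False keeps writing the remaining operations after a duplicate fails.
        collection.bulk_write(data, ordered=False)
        print('Insert finished')
    except BulkWriteError as err:
        write_errors = err.details.get('writeErrors', [])
        dupes = [e for e in write_errors if e.get('code') == 11000]
        if len(dupes) == len(write_errors):
            print('Duplicate insert, skipped ' + str(len(dupes)))
        else:
            raise  # something other than a duplicate went wrong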
Code example #4
def insertdb(datass):
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(datass)
        print('Insert finished ' + downloadTime)
    except Exception:
        print('Duplicate insert')
Code example #5
def insertdb(data):
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('Insert finished' + downloadTime)
    except Exception:
        print('Duplicate insert' + downloadTime)
Code example #6
def article():
    idd = [
        '{"page":1,"rows":10,"id":"59"}', '{}',
        '{"page":1,"rows":10,"id":"51"}', '{"page":1,"rows":10,"id":"58"}',
        '{"page":1,"rows":10,"id":"56"}'
    ]
    for datas in idd:
        try:
            response = ss.post(
                'https://api.hunan-show.com/system/topicBase/getPageSetHome',
                headers=headers,
                data=datas)
            content = response.content.decode('utf-8')
            # Pull the article uuids out of the JSON response.
            uuids = re.compile('"uuid":"(.*?)",').findall(str(content))
            for ids in uuids:
                try:
                    url = "https://api.hunan-show.com/system/topicBase/getDocDetailByUuid?uuid=" + ids
                    res = ss.get(url)
                    article = res.content.decode('utf-8')
                    title = re.compile('"title":"(.*?)",').findall(
                        str(article))
                    pubtime = re.compile('"releaseTime":"(.*?)",').findall(
                        str(article))
                    content = re.compile('"content":"(.*?)",').findall(
                        str(article))
                    site = "中国(湖南)国际矿物宝石博览会"
                    siteId = 1049645
                    pushState = 0
                    downloadTime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data = []
                    data.append(
                        InsertOne({
                            "url": url,
                            "title": title[0],
                            "pub_time": pubtime[0],
                            "content": content[0],
                            "download_time": downloadTime,
                            "site": site,
                            "site_id": siteId,
                            "aid": ids,
                            'push_state': pushState,
                        }))
                    try:
                        collection.bulk_write(data)
                        print('Insert finished')
                        print(downloadTime)
                    except Exception:
                        print('Duplicate insert')
                except Exception:
                    # Back off briefly on network/parse errors, then continue.
                    time.sleep(10)
        except Exception:
            time.sleep(10)
Code example #7
def insertdb(data):
    downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    try:
        collection.bulk_write(data)
        print('Insert finished ' + downloadTime)
    except Exception:
        print('Duplicate insert ' + downloadTime)
Code example #8
def safe_bulk_delete(collection: pymongo.collection.Collection, ids, id_key='_id'):
    """Sometimes, when you want to delete a bunch of documents using an identifier, the 'delete document' itself
    exceeds the 16MB Mongo limit.  This function catches such cases and breaks the command into suitably sized
    batches."""
    ids = list(set(ids))  # No need to repeat ourselves
    try:
        collection.delete_many({id_key: q.in_(*ids)})
    except pymongo.errors.DocumentTooLarge:
        # Use a bulk operation instead.
        # Note: this could be sped up further by batching the deletes, but for now it's not worth it.
        bulk_ops = [pymongo.DeleteOne({id_key: entry_id}) for entry_id in ids]
        collection.bulk_write(bulk_ops)
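The comment above notes that the fallback could be sped up by batching. A sketch of that idea in plain PyMongo (the batch size is an arbitrary assumption, and it uses a standard $in filter rather than the q.in_ helper from the original):

def batched_delete(collection, ids, id_key='_id', batch_size=10000):
    # Delete in fixed-size chunks so no single filter document grows too large.
    ids = list(set(ids))
    for start in range(0, len(ids), batch_size):
        batch = ids[start:start + batch_size]
        collection.delete_many({id_key: {'$in': batch}})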
Code example #9
def article():
    response = requests.get(
        'https://www.0735cs.com/article/list_20_1_0_0_1_1.html',
        headers=headers,
        cookies=cookies)
    content = response.content.decode('utf-8')
    # Article ids are embedded in links like article_12345.html.
    aids = re.compile('article_(.*?).html').findall(str(content))
    for ids in aids:
        try:
            url = 'https://www.0735cs.com/article/article_' + ids + '.html'
            res = requests.get(url)
            article = res.content.decode('utf-8')
            title = re.compile('<h3 .*?>(.*?)</h3>').findall(str(article))
            pubtime = re.compile('<font id="createtime">(.*?)</font>').findall(
                str(article))
            content = re.compile(
                '<div class="txt" id="resizeIMG">([\s\S]*?.)<div class="contentPadding" .*?>'
            ).findall(str(article))
            site = "郴州城事"
            siteId = 1049649
            pushState = 0
            downloadTime = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            data = []
            data.append(
                InsertOne({
                    "url": url,
                    "title": title[0],
                    "pub_time": pubtime[0],
                    "content": content[0],
                    "download_time": downloadTime,
                    "site": site,
                    "site_id": siteId,
                    "aid": ids,
                    'push_state': pushState,
                }))
            try:
                collection.bulk_write(data)
                print('Insert finished')
                print(downloadTime)
            except Exception:
                print('Duplicate insert')
        except Exception:
            # Skip articles that fail to download or parse.
            pass
Code example #10
            siteId = 1050145
            pushState = 0
            downloadTime = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            data = []
            data.append(
                InsertOne({
                    "url": urll,
                    "title": title,
                    "aid": urll,
                    "content": strs,
                    "site": site,
                    "pub_time": pubTime,
                    "push_state": pushState,
                    "site_id": siteId,
                    "download_Time": downloadTime
                }))
            try:
                collection.bulk_write(data)
                print('Insert finished')
                print('Download time ' + downloadTime)
                print('Publish time ' + pubTime)
            except Exception:
                print('Duplicate insert')
                print('Download time ' + downloadTime)
                print('Publish time ' + pubTime)
        except Exception:
            # Skip entries that fail to parse.
            pass

    time.sleep(300)
Code example #11
def article():
    try:
        list = guanjianci.key_list
        for ids in list:
            url = "https://www.instagram.com"
            urlss = "https://www.instagram.com/explore/tags/" + ids + "/"

            # chrome_options = webdriver.ChromeOptions()
            # chrome_options.add_argument('--no-sandbox')  # avoids the "DevToolsActivePort file doesn't exist" error
            # chrome_options.add_argument('window-size=1920x3000')  # set the browser resolution
            # chrome_options.add_argument('--disable-gpu')  # Google's docs suggest this to work around a bug
            # chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars for some special pages
            # chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip images for speed
            # chrome_options.add_argument('--headless')  # no UI; required on Linux without a display server
            # driver = webdriver.Chrome("/usr/bin/chromedriver", chrome_options=chrome_options)

            option = webdriver.ChromeOptions()
            option.binary_location = 'C:\\Users\\86139\\AppData\\Local\\googles\\Chrome-bin\\chrome.exe'
            driver = webdriver.Chrome(
                'C:\\Users\\86139\\AppData\\Local\\googles\\Chrome-bin\\chromedriver.exe',
                options=option)

            driver.get(url)
            time.sleep(10)
            print('login page loaded')
            username_xpath = '//input[@name="username"]'
            login_xpath = '//button[@class="sqdOP  L3NKy   y3zKF     "]'
            password_xpath = '//input[@name="password"]'

            driver.find_element_by_xpath(username_xpath).send_keys(
                '+8615313137407')
            driver.find_element_by_xpath(password_xpath).send_keys('wqs159888')
            driver.find_element_by_xpath(login_xpath).click()

            time.sleep(10)
            driver.get(urlss)
            print('tag page loaded')
            content = driver.page_source
            # response = requests.get('https://www.instagram.com/explore/grid/', headers=headers, params=params)
            # response = requests.get(url, headers=headers)
            # if response.status_code == 429:
            #     downloadTime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            #     print(downloadTime)
            #     time.sleep(86400)
            # content = response.content.decode('unicode-escape')
            # content = response.content
            # Post short-codes embedded in the page JSON.
            post_ids = re.compile('"code":"(.*?)"').findall(str(content))
            for articleid in set(post_ids):
                # arr = [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]
                # arrs = choice(arr)
                # time.sleep(int(arrs))
                try:
                    imgs = ''
                    videourl = ''
                    articleurl = "https://www.instagram.com/p/" + articleid + "/"
                    driver.get(articleurl)
                    content = driver.page_source
                    ac = re.compile('"text":"(.*?)"}').findall(str(content))
                    ac = ac[0]
                    # Strip lone \ud... surrogate escapes (emoji halves) that
                    # break unicode-escape decoding below.
                    ab = re.compile('(\\\\ud...)').findall(str(ac))
                    for te in ab:
                        ac = ac.replace(te, '')
                    if ac:
                        ac = ac.encode('utf-8',
                                       'replace').decode('unicode-escape')
                    pubTime = re.compile(
                        '"taken_at_timestamp":(.*?),"').findall(str(content))
                    timeArray = time.localtime(int(pubTime[0]))
                    pubTime = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)
                    replyCount = re.compile(
                        '"edge_media_to_parent_comment":{"count":(.*?),"'
                    ).findall(str(content))
                    likeCount = re.compile(
                        '"edge_media_preview_like":{"count":(.*?),"').findall(
                            str(content))
                    imgCount = re.compile('"display_url":"(.*?)"').findall(
                        str(content))
                    for im in imgCount:
                        im = im.encode('utf-8',
                                       'replace').decode('unicode-escape')
                        imgs += "<br><img src=\'" + im + "\'></img>"
                    videoCount = re.compile('"video_url":"(.*?)"').findall(
                        str(content))
                    if videoCount:
                        videourl = ("<br><video src='" + videoCount[0] +
                                    "' controls></video>")
                        videourl = videourl.encode(
                            'utf-8', 'replace').decode('unicode-escape')
                    title = re.compile('<title>([\s\S]*?)</title>').findall(
                        str(content))
                    # Fall back to the post text when the title is missing or generic.
                    title = title[0] if title else ac
                    if title == '\nInstagram\n':
                        title = ac
                    articleContent = ac + '<br>' + imgs + '<br>' + videourl
                    site = "instagram"
                    siteId = 1049117
                    pushState = 0
                    downloadTime = datetime.datetime.now().strftime(
                        '%Y-%m-%d %H:%M:%S')
                    data = []
                    data.append(
                        InsertOne({
                            "url": articleurl,
                            "title": title,
                            "pub_time": pubTime,
                            "content": articleContent,
                            "download_time": downloadTime,
                            "site": site,
                            "site_id": siteId,
                            "aid": articleid,
                            "only_id": articleid,
                            'push_state': pushState,
                            'like_num': int(likeCount[0]),
                            'cmt_num': int(replyCount[0]),
                        }))
                    # insertdb(data)

                    try:
                        collection.bulk_write(data)
                        print('Insert finished')
                        print('Download time ' + downloadTime)
                        print('Publish time ' + pubTime)
                    except Exception:
                        print('Duplicate insert')
                        print('Download time ' + downloadTime)
                        print('Publish time ' + pubTime)
                except Exception:
                    import traceback
                    traceback.print_exc()
            driver.quit()
            os.system('/root/chromes.sh')
            os.system('/root/chromess.sh')
            # video: video_url
            # img: display_url (for a video this is the cover image)
    except Exception:
        import traceback
        driver.quit()
        os.system('/root/chromes.sh')
        os.system('/root/chromess.sh')
        traceback.print_exc()
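The fragile part of the snippet above is turning the \uXXXX escape sequences from the page source back into text after dropping lone surrogate halves (emoji). The same idea as a standalone sketch, with a made-up input string:

import re

def decode_escaped(text):
    # Drop lone \ud... surrogate escapes that unicode-escape cannot decode.
    for sur in re.findall(r'(\\ud...)', text):
        text = text.replace(sur, '')
    return text.encode('utf-8', 'replace').decode('unicode-escape')

print(decode_escaped('hello \\u4e16\\u754c'))  # -> hello 世界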
Code example #12
def article(headers2):
    key_list = guanjianci.key_list
    for lis in key_list:
        try:
            from random import choice
            par = dict(params)
            par['q'] = lis
            arr = [50]  # delay pool; a single value now, but choice() keeps it easy to randomise
            arrs = choice(arr)
            time.sleep(int(arrs))
            response = ss.get('https://www.facebook.com/search/posts',
                              params=par,
                              headers=headers2)
            print(response.status_code)
            # if response.status_code != 200:
            #     headers2 = headers1
            #     pass
            content = response.content.decode('utf-8')
            id = re.compile('"id":"vm-(.*?):').findall(str(content))
            url = re.compile('"permalink":"(.*?)"').findall(str(content))
            if not url:
                print('No results; entering sleep')
                time.sleep(7200)
                break
            for ur in url:
                try:
                    urls = str(ur).replace('\\', '')
                    # urls = 'https://www.facebook.com/groups/2337886349768125/posts/2883216531901768'
                    arrs = choice(arr)
                    time.sleep(int(arrs))
                    res = ss.get(urls, headers=headers2)
                    article = res.content.decode('utf-8')
                    articles = re.compile('"wwwURL":"(.*?)"').findall(
                        str(article))
                    times = re.compile('"creation_time":(.*?),').findall(
                        str(article))
                    likeCount = re.compile(
                        '"reaction_count":{"count":(.*?),"').findall(
                            str(article))
                    title = re.compile('"message":{"text":"(.*?)"},"').findall(
                        str(article))
                    for urs, ti, like, til in zip(articles, times, likeCount,
                                                  title):
                        try:
                            # Strip lone \ud... surrogate escapes before decoding.
                            ab = re.compile('(\\\\ud...)').findall(str(til))
                            for te in ab:
                                til = til.replace(te, '')
                            # A trailing lone backslash would also break decoding.
                            if til and til[-1] == '\\':
                                til = til[:-1]
                            tils = til.encode(
                                'utf-8', 'replace').decode('unicode-escape')
                            urss = str(urs).replace('\\', '')
                            timeArray = time.localtime(int(ti))
                            pubTime = time.strftime("%Y-%m-%d %H:%M:%S",
                                                    timeArray)
                            arcontent = tils
                            site = "Facebook"
                            siteId = 1049117
                            pushState = 0
                            downloadTime = datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S')
                            data = []
                            data.append(
                                InsertOne({
                                    "url": urss,
                                    "title": tils,
                                    "pub_time": pubTime,
                                    "content": arcontent,
                                    "download_time": downloadTime,
                                    "site": site,
                                    "site_id": siteId,
                                    "aid": urss,
                                    'push_state': pushState,
                                    'like_num': int(like),
                                }))
                            try:
                                collection.bulk_write(data)
                                print('Insert finished')
                                print('Download time ' + downloadTime)
                                print('Publish time ' + pubTime)
                            except Exception:
                                print('Duplicate insert')
                                print('Download time ' + downloadTime)
                                print('Publish time ' + pubTime)
                        except Exception:
                            import traceback
                            traceback.print_exc()
                except Exception:
                    import traceback
                    traceback.print_exc()
        except Exception:
            import traceback
            traceback.print_exc()
Code example #13
def insertdb(data):
    try:
        collection.bulk_write(data)
        print('Insert finished')
    except Exception:
        print('Duplicate insert')
Code example #14
def article(browser):
    key_list = guanjianci.key_list
    for lis in key_list:
        print('searching: ' + lis)
        browser.find_element_by_xpath(
            "//input[@class='oajrlxb2 rq0escxv f1sip0of hidtqoto e70eycc3 lzcic4wl hzawbc8m ijkhr0an aaoq3grb sgqwj88q b3i9ofy5 oo9gr5id b1f16np4 hdh3q7d8 dwo3fsh8 qu0x051f esr5mh6w e9989ue4 r7d6kgcz br7hx15l h2jyy9rg n3ddgdk9 owxd89k7 ihxqhq3m jq4qci2q k4urcfbm iu8raji3 qypqp5cg l60d2q6s hv4rvrfc hwnh5xvq ez2duhqw rmlgq0sb dzqu5etb aj8hi1zk r4fl40cc kd8v7px7 m07ooulj mzan44vs']"
        ).send_keys(lis)
        browser.find_element_by_xpath("//input[@value='" + lis +
                                      "']").send_keys(Keys.ENTER)
        time.sleep(3)

        next_btn = browser.find_element_by_xpath("//*[text()='帖子']")
        browser.execute_script("arguments[0].click();", next_btn)
        time.sleep(10)
        next_btntwo = browser.find_element_by_xpath(
            "//input[@aria-label='近期帖子']")
        browser.execute_script("arguments[0].click();", next_btntwo)
        time.sleep(10)
        print("444")
        for a in range(10):
            browser.execute_script(
                'window.scrollTo(0,document.body.scrollHeight)')
            time.sleep(3)
        length = len(key_list)
        if length < 1:
            return ''
        if length == 1:
            return str(key_list[0])
        try:
            from random import choice
            # par = dict(params)
            # par['q'] = lis
            # arr = [30,10,20,40,50]
            # arrs = choice(arr)
            # time.sleep(int(arrs))
            # browser.get('https://www.facebook.com/search/posts?filters=eyJyZWNlbnRfcG9zdHM6MCI6IntcIm5hbWVcIjpcInJlY2VudF9wb3N0c1wiLFwiYXJnc1wiOlwiXCJ9In0=&q='+str(list[randomNumber]))
            content = browser.page_source
            url = re.compile(
                '<a class="oajrlxb2 g5ia77u1 qu0x051f esr5mh6w e9989ue4 r7d6kgcz rq0escxv nhd2j8a9 a8c37x1j p7hjln8o kvgmc6g5 cxmmr5t8 oygrvhab hcukyx3x jb3vyjys rz4wbd8a qt6c0cv9 a8nywdso i1ao9s8h esuyzwwr f1sip0of lzcic4wl gmql0nx0 p8dawk7l" href="https://www.facebook.com/groups/(.*?)/posts/(.*?)/" role="link" tabindex="0">'
            ).findall(str(content))
            if not url:
                login()
            for ur in url:
                print("555")
                try:
                    # urls = str(ur).replace('\\', '')
                    # arrs = choice(arr)
                    # time.sleep(int(arrs))https://www.facebook.com/groups/334036650079422/posts/1955843984565339/
                    urls = "https://www.facebook.com/groups/" + ur[
                        0] + "/posts/" + ur[1] + "/"
                    next_btno = browser.find_element_by_xpath("//a[@href='" +
                                                              urls + "']")
                    browser.execute_script("arguments[0].click();", next_btno)
                    time.sleep(10)
                    print("123")
                    article = browser.page_source
                    times = re.compile('"item_logging_id":"(.*?):').findall(
                        str(article))
                    title = re.compile('<title>(.*?)</title>').findall(
                        str(article))
                    con = re.compile(
                        '<div dir="auto".*?>(.*?)<div.*?data-visualcompletion="ignore-dynamic">'
                    ).findall(str(article))
                    print(title)
                    try:
                        # ab = re.compile('(\\\\ud...)').findall(str(til))
                        # for te in ab:
                        #     til = til.replace(te, '')
                        # if til[-1] == '\\':
                        #     til = til[:-1]
                        # tils = til.encode('utf-8', 'replace').decode('unicode-escape')
                        # urss = str(urs).replace('\\', '')
                        # timeArray = time.localtime(int(times[0]))
                        pubTime = times[0] + " 00:00:00"
                        arcontent = con
                        site = "Facebook"
                        siteId = 1049117
                        pushState = 0
                        downloadTime = datetime.datetime.now().strftime(
                            '%Y-%m-%d %H:%M:%S')
                        data = []
                        data.append(
                            InsertOne({
                                "url": urls,
                                "title": title[0],
                                "pub_time": pubTime,
                                "content": arcontent,
                                "download_time": downloadTime,
                                "site": site,
                                "site_id": siteId,
                                "aid": urls,
                                'push_state': pushState,
                            }))
                        print("333")
                        try:
                            collection.bulk_write(data)
                            print('添加完成')
                            print('下载时间' + downloadTime)
                            print('发布时间' + pubTime)
                        except Exception as err:
                            print("添加重复")
                            print('下载时间' + downloadTime)
                            print('发布时间' + pubTime)
                    except Exception:
                        browser.quit()
                        login()
                except Exception:
                    browser.quit()
                    login()
        except Exception:
            browser.quit()
            login()