Example #1
0
def proxy_weixin(account):
    """Look up an official account's info through a randomly chosen proxy.

    :param account: account keyword passed to get_gzh_info
    :return: whatever get_gzh_info returns (account-info dict, or None)
    """
    proxies = random_proxy()
    # The two branches only differed in auth/timeout; the old
    # "{ip}".format(...) wrapper around the proxy string was a no-op.
    http_proxy = {"http": "{ip}".format(ip=proxies.get("http"))}
    if proxies.get("random_int") == 1:
        # authenticated path gets a longer timeout
        ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3, auth=auth,
                                            proxies=http_proxy, timeout=10)
    else:
        ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3,
                                            proxies=http_proxy, timeout=5)
    return ws_api.get_gzh_info(account)
Example #2
0
def GetgzhList(keyword):
    """Search Sogou-WeChat for official accounts matching *keyword*.

    Rotates through proxy IPs until one request yields results or the retry
    budget is exhausted.

    :param keyword: search keyword
    :return: [itemList, IP, keyword] on success, otherwise None (implicit)
    """
    isSucess = False
    mostTryCounts = 20  # maximum number of retry rounds
    count = 0
    itemList = []
    IP = {}
    while not isSucess and count < mostTryCounts:
        count = count + 1
        iplist = read_Proxies()  # fetch the proxy-IP list
        itemList = []
        IP = {}
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=20)
                # fetch and normalize the search results
                itemList = get_data(ws_api.search_gzh(keyword), 1)
                print("返回后列表长度:" + str(len(itemList)))
                if len(itemList) != 0:
                    IP = ip  # remember the proxy that worked
                    isSucess = True
                    break
            except Exception:  # narrowed from a bare except; still best-effort per proxy
                print("访问出错")
                continue
    if not isSucess:
        print("ERROR" + " 可能关键字不存在")

    else:
        print("SUCESS")
        return [itemList, IP, keyword]
Example #3
0
def get_news(gzh):
    """Collect titles and links from an account's recent history.

    Appends to the module-level article_title_list / article_content_list
    and calls send_news() once per article, as the original did.
    """
    api = wechatsogou.WechatSogouAPI()
    history = api.get_gzh_article_by_history(gzh)
    for entry in history['article']:
        article_title_list.append(entry['title'])
        article_content_list.append(entry['content_url'])
        send_news()
def getInfo(gzh):
    """Print title, abstract and link for each article in the account's history."""
    api = wechatsogou.WechatSogouAPI()
    history = api.get_gzh_article_by_history(gzh)
    for entry in history['article']:
        print(entry['title'], entry['abstract'], entry['content_url'])
Example #5
0
 def __init__(self):
     """Set up logging, DB tables, the proxy source, the Sogou API client
     and the Ruokuai captcha client for this spider."""
     # quiet the noisy third-party loggers
     logging.getLogger("wechatsogou").setLevel(logging.WARNING)
     logging.getLogger("peewee").setLevel(logging.WARNING)
     logging.getLogger("requests").setLevel(logging.WARNING)
     self.logger = logging.getLogger()
     handler = logging.StreamHandler()
     formatter = logging.Formatter(
         '%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
     handler.setFormatter(formatter)
     self.logger.addHandler(handler)
     self.logger.setLevel(logging.DEBUG)
     # preload state: blacklisted proxies, DB connection, crawl targets,
     # already-saved data
     self.get_bad_proxies()
     self.WxTable = WechatInfo()
     self.get_conn()
     self.create_target()
     self.get_saved_data()
     # proxy pool filter — parameter semantics belong to
     # NewGenerationProxy; TODO confirm against its docs
     self.proxies_list = NewGenerationProxy({
         'anony': 'L4',
         'post': 'false',
         'speed': 3000
     })
     proxyLine = self.proxies_list.getProxy()
     # the same proxy line serves both schemes
     self.wx_api = wechatsogou.WechatSogouAPI(timeout=8,
                                              proxies={
                                                  'http': proxyLine,
                                                  'https': proxyLine
                                              })
     SpiderConfig = Config.SpiderConfig
     self.headers = SpiderConfig.headers.json()
     self.weChat_table = WechatInfo()
     self.proxies_table = UnableProxies()
     self.crawled_table = CrawledData()
     # Ruokuai captcha-solving client (NOTE(review): credentials hard-coded)
     self.rk = RClient('ghost2017b', 'Ghost2017b', '107539',
                       'a8bd936aa1574ddb96d14564c1a0d022')
Example #6
0
 def startWeChatSpider(self, wechat_id_path, base_path, log_path):
     """Crawl the article history of every account id read from an Excel sheet.

     For each id, article rows append to <id>_.csv and finished markers to
     <id>_finished_id.csv under base_path; a completion line goes to log_path.
     """
     ws_api = wechatsogou.WechatSogouAPI()
     ids = self.readAllLinesFromExcel(wechat_id_path, 'gongzhonghao')
     for id in ids:
         finishedIdPath = base_path + '/' + id[0] + '_finished_id.csv'
         saveFilePath = base_path + '/' + id[0] + '_.csv'
         isFinishedIdFileExits = os.path.exists(finishedIdPath)
         isSaveFilePath = os.path.exists(saveFilePath)
         # create both CSVs with header rows on first use
         if isSaveFilePath is False:
             self.writeToCSV(saveFilePath, [
                 'title', 'abstract', 'author', 'content_url',
                 'copyright_stat', 'cover', 'datetime', 'fileid', 'main',
                 'send_id', 'source_url', 'type'
             ])
         if isFinishedIdFileExits is False:
             self.writeToCSV(finishedIdPath, ['finished_id'])
         history_list = ws_api.get_gzh_article_by_history(id[0])
         if len(history_list) > 0:
             self.parseData(id[0], history_list, finishedIdPath,
                            saveFilePath)
         time.sleep(1)  # throttle between accounts
     wechatSpider.writeToTxt(
         log_path,
         str(wechatSpider.getCurrntTime() +
             ": finished get gongzhonghao..."))
Example #7
0
def main():
    """Fetch basic info for every account in gzh_name and dump it to an .xls file."""
    print("抓取公众号的基础信息...")

    ws_api = wechatsogou.WechatSogouAPI()
    gzh_info = []
    for name in gzh_name:
        print("正在抓取公众号:", name)
        gzh_info.append(ws_api.get_gzh_info(name))

    if not os.path.isdir(dir_name):
        os.mkdir(dir_name)

    file_name = os.path.join(dir_name, "gzh_info.xls")
    print("正在写入文件:", file_name)
    workbook = Workbook()
    sheet = workbook.add_sheet('sheet1')
    columns = list(gzh_info[0].keys())
    # header row, then one row per account
    for col_idx, col in enumerate(columns):
        sheet.write(0, col_idx, col)
    for row_idx, record in enumerate(gzh_info):
        for col_idx, col in enumerate(columns):
            sheet.write(row_idx + 1, col_idx, record[col])
    workbook.save(file_name)
    print("写入文件完成:", file_name)
def get_article(gzh, titleList):
    """Scrape up to 10 search-result pages of articles for *gzh*, pushing
    unseen "title/time" items to Kafka.

    Retries each page with fresh proxies, up to maxConut failed rounds total.

    :param gzh: official-account keyword (also used as the search keyword)
    :param titleList: "title/time" keys already seen (for incremental pushes)
    :return: list of "title/time" keys crawled, or False once blocked
    """
    articleList = []
    deltaList = []
    maxConut = 3  # retry budget for failed proxy rounds
    keyword = gzh
    count = 0
    page = 1
    # BUGFIX: the old outer `while (1)` never terminated once page > 10;
    # the page counter is now the loop condition.
    while page <= 10:
        isSuccess = False
        iplist = read_Proxies()
        print('read ip============================================')
        for ip in iplist:
            try:
                # captcha_break_time: how many times a wrong captcha is retried
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip,
                                                    timeout=10,
                                                    captcha_break_time=2)
                itemList = []
                print('scrapy====%s====article==========page %d' %
                      (gzh, page))
                time.sleep(10)
                itemList = get_data(
                    ws_api.search_article(keyword, page=page),
                    gzh)  # fetch and normalize the data
                if itemList == False:
                    continue
                page = page + 1
                print("\nreturn article list length:" + str(len(itemList)))
                for art in itemList:
                    print(art['title'])
                    unique = art['title'] + '/' + art['time']
                    articleList.append(unique)
                    if unique not in titleList:
                        # incremental: push unseen articles to the queue
                        print('kafka')
                        Kafka_fun(art)
                        deltaList.append(art['title'])
                print("next article list")
                isSuccess = True  # BUGFIX: was `isSuccess == True`, a no-op comparison
                break
            except Exception as e:
                print("read article error,check ip is validable?")
                print(e)
                check_ip(ip)
                continue
        if not isSuccess:
            count = count + 1
            if count > maxConut:
                print("OK,article locked!")  # blocked: give up for good
                return False
            get_ip()  # refresh the proxy-IP list and retry this page
    print("Finish")
    return articleList
Example #9
0
def dynamic_config():
    """Demonstrate the configurable options of the WechatSogouAPI constructor."""
    # plain direct connection
    api = wechatsogou.WechatSogouAPI()

    # retry count for mistyped captchas (defaults to 1)
    api = wechatsogou.WechatSogouAPI(captcha_break_time=3)

    # every requests-library kwarg is accepted here, e.g. proxies;
    # the proxy set must contain at least one working HTTPS proxy
    api = wechatsogou.WechatSogouAPI(proxies={
        "http": "127.0.0.1:8888",
        "https": "127.0.0.1:8888",
    })

    # e.g. request timeout
    api = wechatsogou.WechatSogouAPI(timeout=0.1)
Example #10
0
 def __init__(self, name):
     """Remember the target account name and prepare the Sogou search client."""
     self.name = name
     self.get_gzh = ''
     browser_ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/68.0.3440.106 Safari/537.36')
     self.headers = {'User-Agent': browser_ua}
     # client for the Sogou WeChat search API
     self.wechat_gzh = wechatsogou.WechatSogouAPI(timeout=1, )
Example #11
0
 def wechat_info_list(self, nickname):
     """
     Fetch an official account's article history, solving captchas through
     the Ruokuai service.

     :param nickname: official-account name
     :return: result of get_gzh_article_by_history
     """
     ocr_config = {
         'type': 'ruokuai',
         'dama_name': config.ruokuai_name,
         'dama_pswd': config.ruokuai_pswd,
         'dama_soft_id': config.ruokuai_soft_id,
         'dama_soft_key': config.ruokuai_soft_key
     }
     # BUGFIX: a second bare constructor call used to overwrite this one,
     # silently discarding ocr_config; pass both options to one instance.
     ws_api = wechatsogou.WechatSogouAPI(ocr_config=ocr_config,
                                         captcha_break_time=3)
     return ws_api.get_gzh_article_by_history(
         keyword=nickname,
         identify_image_callback_sogou=identify_image_callback_ruokuai_sogou,
         identify_image_callback_weixin=
         identify_image_callback_ruokuai_weixin)
Example #12
0
def get_article(gzhList, ip):
    """Crawl the article history for each account in *gzhList* via rotating
    proxies, writing unseen titles to a delta JSON file.

    :param gzhList: list of official-account keywords
    :param ip: unused — immediately shadowed by the proxy loop variable
    :return: list of all article dicts crawled so far
    """
    articleList = []
    iplist = read_Proxies()
    deltaList = []
    maxConut = 1  # retry budget per account
    titleList = read_file(
        "./baiduspiderProject_new/baiduspider/jsonfile/sougou.json")
    for gzh in gzhList:
        keyword = gzh
        count = 0
        isSuccess = False
        while (1):
            for ip in iplist:
                try:
                    ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=10)
                    itemList = get_data(
                        ws_api.get_gzh_article_by_history(keyword),
                        2)  # fetch and normalize the data
                    print("返回后文章列表长度:" + str(len(itemList)))
                    for art in itemList:
                        print(art)
                        articleList.append(art)  # keep every crawled article
                        if art['title'] not in titleList:
                            # incremental: queue only unseen titles
                            deltaList.append(art['title'])
                    print("下一组文章")
                    isSuccess = True
                    break
                except Exception:  # narrowed from a bare except; try the next proxy
                    print("文章访问出错")
                    continue
            if (isSuccess == False):
                count = count + 1
                if (count > maxConut):
                    print("尽力了,文章被封锁了!")  # blocked: flush and return what we have
                    write_file(
                        "./baiduspiderProject_new/baiduspider/jsonfile/sougou_delta.json",
                        deltaList)
                    return articleList
                else:
                    get_ip()  # refresh the proxy-IP list
                    continue
            else:
                break
    write_file(
        "./baiduspiderProject_new/baiduspider/jsonfile/sougou_delta.json",
        deltaList)
    print("Finish")
    return articleList
def get_article_content(gzh,
                        latest_time_path='/Users/mac/Desktop/Python-Scrapy/weixinarticle/latest_time.txt'):
    """Return an account's newest articles plus the latest timestamp seen.

    On the first run (empty timestamp file) all articles from the history
    endpoint are returned; afterwards only articles newer than the stored
    timestamp. Each kept article gets an 'article_content' key filled by
    request_article.

    :param gzh: official-account keyword
    :param latest_time_path: file holding the last-seen 10-digit timestamp
        (parameterized; the default preserves the original hard-coded path)
    :return: {'latest_time': int, 'articles': [article dicts]} where each
        article carries the wechatsogou history fields (datetime, title,
        abstract, content_url, cover, author, copyright_stat, ...)
    """
    ws_api = wechatsogou.WechatSogouAPI()
    result = ws_api.get_gzh_article_by_history(gzh)
    articles = result.get('article')
    new_articles = []
    latest_time = 0
    with open(latest_time_path, 'r') as f:
        latest_time_in_file = f.read()
    for article in articles:
        # subsequent runs: keep only articles newer than the stored timestamp
        if len(latest_time_in_file) > 0 and article['datetime'] > int(
                latest_time_in_file):
            article['article_content'] = request_article(article)
            new_articles.append(article)
        # first run: keep everything returned
        elif len(latest_time_in_file) == 0:
            article['article_content'] = request_article(article)
            new_articles.append(article)
        # track the newest timestamp seen
        if article['datetime'] > latest_time:
            latest_time = article['datetime']
    return {'latest_time': latest_time, 'articles': new_articles}
Example #14
0
def content():
    """Web view (Python 2): search WeChat articles for ?key=..., persist
    URL and keyword md5 hashes, and render content.html."""
    keyword=request.args.get('key')
    vx_obj = wechatsogou.WechatSogouAPI()
    lists = []
    sugg_keywords = []
    md5_string = ''
    keywords = ''
    title = ''
    des = ''

    #try:
    if keyword.strip() != '':
        lists = vx_obj.search_article(keyword)
        for list in lists:
            # store an md5 of each article URL
            wx_url = list['article']['url']
            hash = hashlib.md5()
            hash.update(bytes(wx_url))
            md5_str = hash.hexdigest()
            #list['article'].append('wx_url_md5')
            list['article']['wx_url_md5']=md5_str
            wx_urls = WXUrls(md5_str = md5_str,wx_url=wx_url)
            wx_urls.save()
        sugg_keywords = vx_obj.get_sugg(keyword)
    #except:
    #    print('value errot')

    key_count = len(sugg_keywords)


    # build the page title and a comma-separated keywords string
    if  key_count == 1:
        title = keywords= sugg_keywords[0]
    elif  key_count > 1:
        title = keyword+'_'+sugg_keywords[0]
        for sugg_key in sugg_keywords:
            keywords = keywords+ ','+sugg_key
        keywords = keywords[1:]
    else:
        title =keywords= keyword

    if title.strip() != '':
        hash = hashlib.md5()  # md5 is one-way but deterministic, so hashes can be precomputed/collided
        hash.update(bytes(title))  # feed the string to be hashed
        md5_string = hash.hexdigest()  # hex digest of the title
        keywrods_id = Keywords(md5_string = md5_string,title=keyword)
        keywrods_id.save()
    else:
        print '404.html'

    return render_template('content.html',content_list = lists,title=title,keywords=keywords,des=des,sugg_keywords=sugg_keywords)
Example #15
0
 def parse_url(self, url):
     """Recursively scrape a listing page: for each entry look the account
     up on Sogou-WeChat, store the result in MySQL, then follow the
     page's "next" link."""
     print(url)
     try:
         response = requests.get(url)
         r = etree.HTML(response.text, etree.HTMLParser(encoding='utf-8'))
         for item in r.xpath(
                 '/html/body/div[4]/div[1]/div[5]/ul/div[@class="newpicsmall_list"]'
         ):
             url = item.xpath('a/@href')[0]
             name = item.xpath('a/li[@class="xiaobiaotizi"]/text()')
             if len(name) == 0:
                 continue
             name = name[0]
             print(name, url)
             ws_api = wechatsogou.WechatSogouAPI()
             try:
                 wechat_info = ws_api.get_gzh_info(
                     name,
                     identify_image_callback=self.identify_image_callback)
             except wechatsogou.exceptions.WechatSogouVcodeOcrException as e:
                 # captcha OCR failed: report the error back to the service
                 result = rc.rk_report_error(self.result_id)
                 print("验证码错误,报错上传:", result["Result"])
                 continue
             if wechat_info is not None:
                 item = []
                 item.append(wechat_info['wechat_id'])  # account id
                 item.append(wechat_info['wechat_name'])  # account name
                 item.append(wechat_info['introduction'])  # introduction
                 item.append(wechat_info['authentication'])  # verification status
                 item.append(wechat_info['headimage'])  # avatar URL
                 item.append(wechat_info['open_id'])
                 item.append(wechat_info['qrcode'])  # QR code
                 item.append(int(wechat_info['post_perm']))  # posts in the last month
                 item.append(int(wechat_info['view_perm']))  # reads in the last month
                 item.append(wechat_info['profile_url'])  # link to the latest-10-posts page
                 item.append(url)  # source URL
                 item.append(
                     time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                 self.wechat_infos.append(item)
                 self.insert_to_mysql()
         nextpage = r.xpath(
             '//*[@id="content-pagenation"]/div/div/div/a[@class="next"]/@href'
         )[0]
         if len(nextpage) > 0:
             self.parse_url(nextpage)
     except Exception as e:
         print(traceback.format_exc())
Example #16
0
def get_articles(original=True, timedel=1):
    """Collect recent articles from a fixed pair of accounts.

    :param original: keep only original articles (copyright_stat == 100)
    :param timedel: look-back window in days
    :return: list of article dicts
    """
    ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
    accounts = ['infoQ', '成都美食']
    articles = []
    # only keep articles newer than this (hoisted: same value each iteration)
    timestamp = int((datetime.now() - timedelta(days=timedel)).timestamp())
    for account in accounts:
        articles.extend(reformat(ws_api.get_gzh_article_by_history(account)))
    articles = [
        article for article in articles if article['datetime'] > timestamp
    ]
    # original-article filter
    if original:
        # BUGFIX: the old code removed items from the list while iterating
        # it, which skips elements; build a filtered list instead.
        articles = [
            article for article in articles
            if article['copyright_stat'] == 100
        ]
    return articles
Example #17
0
def get_gzh_article_info(keyword):
    """Return [{name, gzh_wechat_name, gzh_wechat_id}] for one official account."""
    api = wechatsogou.WechatSogouAPI()
    info = api.get_gzh_info(keyword)
    record = {
        'name': keyword,
        'gzh_wechat_name': info['wechat_name'],
        'gzh_wechat_id': info['wechat_id'],
    }
    return [record]
Example #18
0
def weixin_spider():
    """Python 2 code: walk randomly chosen WeChat accounts, crawl each
    account's article history, flatten the HTML to text and store it."""
    cout = 1
    wx_accounts = get_weixin_auth().get("data")
    for i in xrange(len(wx_accounts)):
        wx_account = random.choice(wx_accounts)
        if wx_account.has_key("account"):
            account = wx_account.get("account")
            author_none = proxy_weixin(account)
            if author_none == None:
                # lookup failed: remember the account for later inspection
                with open("author_none.txt", "a+") as w:
                    w.write(account + "\r\n")
            else:
                ws_api = wechatsogou.WechatSogouAPI()
                doc = ws_api.get_gzh_article_by_history(keyword=account)
                name = doc['gzh']['wechat_name']
                for d in doc['article']:
                    try:
                        html = ws_api.get_article_content(d['content_url'])['content_html']
                    except:
                        continue
                    # flatten the article HTML into plain text
                    dom = etree.HTML(html)
                    d['content'] = "".join(dom.xpath("//text()")).replace("\n", "").replace(" ", "")
                    d['url'] = d['content_url']
                    d['pubtime'] = parse_date(d['datetime'])
                    d['site_name'] = u"微信公众号"
                    d['author'] = account
                    d['keyword'] = name
                    # dedupe key: md5 of the article URL
                    md5 = hashlib.md5()
                    md5.update(d['content_url'])
                    url_md5 = md5.hexdigest()
                    d['url_md5'] = url_md5
                    # drop raw fields the storage schema does not keep
                    d.pop("send_id")
                    d.pop("datetime")
                    d.pop("type")
                    d.pop("main")
                    d.pop("abstract")
                    d.pop("fileid")
                    d.pop("content_url")
                    d.pop("source_url")
                    d.pop("cover")
                    d.pop("copyright_stat")
                    item_fileds(d, "data_wemedia", False)
                    time.sleep(1)  # throttle per article
            time.sleep(10)  # throttle per account
            cout += 1
Example #19
0
def gzh_history(name):
    """Dump one account's recent article history into gzh_hist_<name>.xls."""
    api = wechatsogou.WechatSogouAPI()
    print("正在抓取公众号:", name)
    history = api.get_gzh_article_by_history(name)
    out_path = "gzh_hist_" + name + ".xls"
    out_path = os.path.join(dir_name, out_path)
    print("正在写入文件:", out_path)
    rows = history['article']
    book = Workbook()
    sheet = book.add_sheet('sheet1')
    headers = list(rows[0].keys())
    # header row, then one row per article
    for col_idx, header in enumerate(headers):
        sheet.write(0, col_idx, header)
    for row_idx, row in enumerate(rows):
        for col_idx, header in enumerate(headers):
            sheet.write(row_idx + 1, col_idx, row[header])
    book.save(out_path)
    print("写入文件完成:", out_path)
def GetgzhList(keyword, page):
    """Fetch one page of official-account search results for *keyword*.

    Tries every proxy in the list each round, refreshing proxies between
    rounds when failures dominate, for at most three rounds.

    :return: [itemList, isFinaly] on success, otherwise None (implicit)
    """
    isSucess = False
    mostTryCounts = 3  # maximum retry rounds
    count = 0
    while not isSucess and count < mostTryCounts:
        count = count + 1
        iplist = read_Proxies()  # proxy-IP list
        itemList = []
        IP = {}
        ss = 0  # successful requests this round
        ff = 0  # failed requests this round
        isFinaly = False
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=5)
                tempList = ws_api.search_gzh(keyword, page=page)
                itemList = get_data(tempList, 1)  # normalized result list
                if tempList != []:
                    label = tempList[0]  # carries the last-page marker
                    isFinaly = label['isFinaly']
                    print('!!!!!!!!!!!!!' + str(isFinaly))
                print("返回后公众号列表长度:" + str(len(itemList)))
                ss = ss + 1
                if len(itemList) != 0 and isFinaly == True:
                    print("已经爬到最后一页")
                    isSucess = True
                    break
                if len(itemList) != 0:
                    IP = ip
                    isSucess = True
                    break
            except Exception as e:
                print("公众号访问出错,检测ip是否失效")
                ff = ff + 1
                print(e)
                check_ip(ip)
                continue
        # mostly failures this round: refresh the proxy pool
        if not isSucess and ss <= ff:
            get_ip()
    if not isSucess:
        print("ERROR" + " 可能关键字不存在或者已经爬到最后一页")
    else:
        print("SUCESS")
        return [itemList, isFinaly]
def get_article(gzh, titleList):
    """Crawl one account's article history, pushing unseen items to Kafka.

    Retries with fresh proxies for up to three failed rounds; returns False
    once blocked.

    :param gzh: official-account keyword
    :param titleList: "title/time" keys already seen (for incremental pushes)
    :return: list of crawled article dicts, or False when blocked
    """
    articleList = []
    deltaList = []
    maxConut = 3
    keyword = gzh
    count = 0
    isSuccess = False
    while True:
        iplist = read_Proxies()
        print('读取ip============================================')
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=10)
                # fetch and normalize the history data
                itemList = get_data(ws_api.get_gzh_article_by_history(keyword), 2)
                print("\n返回后文章列表长度:" + str(len(itemList)))
                for art in itemList:
                    print(art['title'])
                    articleList.append(art)  # keep every crawled article
                    if art['title'] + "/" + art['time'] not in titleList:
                        # incremental: push unseen articles to the queue
                        Kafka_fun(art)
                        deltaList.append(art['title'])
                print("下一组文章")
                isSuccess = True
                break
            except Exception as e:
                print("文章访问出错,检测ip是否失效")
                print(e)
                check_ip(ip)
                continue
        if isSuccess:
            break
        count = count + 1
        if count > maxConut:
            print("尽力了,文章被封锁了!")  # blocked: give up
            return False
        get_ip()  # refresh the proxy-IP list and retry
    print("Finish")
    return articleList
Example #22
0
    def spiderSogoWeixinSearch(self, identifyCodeTime, keyWord, pageSize):
        """Search Sogou-WeChat articles for *keyWord* and collect their URLs.

        :param identifyCodeTime: captcha retry count (coerced up to >= 1)
        :param keyWord: search keyword; empty string aborts
        :param pageSize: number of result pages to walk
        :return: list of article URLs, or False on invalid arguments
        """
        if keyWord == '' or pageSize < 0:
            return False

        retry_count = max(identifyCodeTime, 1)

        ws_api = wechatsogou.WechatSogouAPI(retry_count)
        result = []
        for page in range(1, pageSize + 1):
            try:
                page_info = ws_api.search_article(keyWord, page)
                for entry in page_info:
                    result.append(entry['article']['url'])
                time.sleep(3)  # throttle between pages
            except Exception as err:
                print(err)

        return result
def getAllPageWeixin():
    """Walk up to 9 pages of 'flyme' article-search results, drop the
    namesake guitar band, and run the project's article filter."""
    collected = []
    api = wechatsogou.WechatSogouAPI()
    for page in range(1, 10):
        page_data = api.search_article(
            'flyme',
            page=page,
            timesn=2,
        )
        if page_data:
            collected.extend(page_data)
        if len(page_data) < 10:
            break  # short page means we reached the end
        time.sleep(1)
    # a guitar band shares the name "flyme"; filter it out
    kept = [item for item in collected
            if '吉他' not in item['gzh']['wechat_name']]
    return arctical_filter(kept)
Example #24
0
def test(inProxy=None):
    """Smoke-test search_article('Java') through a proxy.

    :param inProxy: optional 'host:port' proxy string; overrides the
        hard-coded samples below.
    """
    api = wechatsogou.WechatSogouAPI(timeout=8)
    # hand-tested sample proxies; only the last assignment takes effect
    proxyLine = '54.223.188.100:6666'
    proxyLine = '140.227.80.50:3128'  # antispider
    proxyLine = '114.110.21.146:53281'
    proxyLine = '45.6.216.79:80'
    proxyLine = '133.18.55.242:80'  # antispider
    proxyLine = '85.114.25.202:8080'  # OK
    proxyLine = '159.89.163.248:53281'  # antispider
    proxyLine = '110.164.181.164:8080'
    proxyLine = '200.87.134.30:53281'
    proxyLine = '115.203.219.81:33885'  # OK
    proxyLine = '180.122.147.226:24636'  # antispider
    if inProxy != None:
        proxyLine = inProxy

    if proxyLine != None:
        # BUGFIX: this used to assign inProxy — None unless the caller
        # supplied one — so the selected proxyLine was silently dropped.
        api.requests_kwargs['proxies'] = proxyLine
    print(proxyLine)
    result = api.search_article('Java')
    pprint(result)
Example #25
0
def getArticleSummary(name):
    """Fetch an account's article history and insert unseen rows into dgzx.news.

    Rows are deduplicated on (title, author, date); for each inserted row a
    best-effort detail fetch (getArticleDescription) follows.

    :param name: official-account keyword; also stored in the `ifrom` column
    """
    ws_api = wechatsogou.WechatSogouAPI()
    gzh_article = ws_api.get_gzh_article_by_history(name)
    data_list = []
    j = gzh_article['article']
    print(len(gzh_article['article']))

    cur = conn.cursor()
    cur.execute("SELECT VERSION()")
    data = cur.fetchone()
    print("Database version : %s " % data)

    if j:
        for i in j:
            title = i['title']
            author = i['author']
            dt = timestamp_to_date(i['datetime'])

            # parameterized query — the old %-formatted SQL was injectable
            # and broke on titles containing quotes; now consistent with
            # the INSERT below
            selectsql = ("SELECT COUNT(`title`) AS count FROM `dgzx`.`news` "
                         "WHERE `title` = %s AND `author` = %s AND `date` = %s")
            cur.execute(selectsql, (title, author, dt))
            results = cur.fetchall()
            if results[0][0] < 1:
                sql = "INSERT INTO `dgzx`.`news` (`title`, `author`, `ifrom`, `date`) \
                VALUES (%s,%s,%s,%s)"
                cur.execute(sql, (title, author, name, dt))
                conn.commit()
                print('SQL Success: ' + title)
                try:
                    getArticleDescription(i['content_url'], i['cover'], title, author, dt)
                except Exception:  # best-effort detail fetch; log and move on
                    print('SQL Update Failed for '+ title)
                    print('Failed URL:' + i['content_url'])
            else:
                print("Existed: " + title + " " + author + " " + dt)
    cur.close()
def get_articles(headline=True, original=True, timedel=1, add_account=None):
    """Collect recent articles for every account listed in gzh.txt.

    :param headline: keep only headline posts (main == 1)
    :param original: keep only original posts (copyright_stat == 100)
    :param timedel: look-back window in days
    :param add_account: optional list of extra accounts, persisted to gzh.txt
    :return: filtered list of article dicts
    """
    with open('gzh.txt', 'r') as f:
        accounts = [account.strip() for account in f.readlines()]
    # add_account must be a list or None
    if add_account is not None:
        # BUGFIX: arguments were swapped — isinstance(list, add_account)
        # raised TypeError instead of validating the parameter
        if isinstance(add_account, list):
            accounts.extend(add_account)
            with open('gzh.txt', 'w') as f:
                for account in accounts:
                    # BUGFIX: write one account per line so the per-line
                    # read above can round-trip the file
                    f.write(account + '\n')
        else:
            print('add_account should be a list')

    ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
    articles = []
    for account in accounts:
        articles.extend(reformat(ws_api.get_gzh_article_by_history(account)))

    # time filter: only articles within the look-back window
    timestamp = int((datetime.now() - timedelta(days=timedel)).timestamp())
    articles = [
        article for article in articles if article['datetime'] > timestamp
    ]

    # headline filter (on by default)
    if headline:
        articles = [article for article in articles if article['main'] == 1]

    # original-article filter (on by default)
    if original:
        articles = [
            article for article in articles if article['copyright_stat'] == 100
        ]

    return articles
# -*- coding: utf-8 -*-
import wechatsogou

# shared Sogou-WeChat API client for this script
ws_api = wechatsogou.WechatSogouAPI()

# the specific set of official accounts to search
wx_list = ["医美圈", "医美视界", "皮秒"]


# Print the recent article history for each account in wx_list.
def search_article():
    """Dump title, abstract and link for every recent article of each account."""
    for account in wx_list:
        history = ws_api.get_gzh_article_by_history(account)
        for item in history['article']:
            print('公众号:' + account)
            print('标题:' + item['title'])
            print('摘要:' + item['abstract'] + "...")
            print('文章链接:' + item['content_url'])
            if item['source_url'] == "":
                print('阅读原文链接:无')
            print('\n')


search_article()  # run the demo when the script is executed
def get_article(gzh, titleList):
    """Crawl one account's article history via rotating proxies, pushing
    unseen "title/time" items to Kafka and appending progress markers to
    wechatatricles_zhima.txt.

    :param gzh: official-account keyword
    :param titleList: "title/time" keys already seen (for incremental pushes)
    :return: list of crawled article dicts, or False once blocked
    """
    articleList = []
    deltaList = []
    maxConut = 3
    keyword = gzh
    count = 0
    isSuccess = False
    with open('wechatatricles_zhima.txt', 'a+') as fp:
        fp.write('公众号:%s===============\n' % gzh)
        log_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        fp.write('time:%s\n' % log_time)
    while (1):
        iplist = read_Proxies()
        print('读取ip============================================')
        for ip in iplist:
            try:
                ws_api = wechatsogou.WechatSogouAPI(proxies=ip, timeout=10)
                itemList = get_data(ws_api.get_gzh_article_by_history(keyword),
                                    2)  # fetch and normalize the data
                print("\n返回后文章列表长度:" + str(len(itemList)))
                with open('wechatatricles_zhima.txt', 'a+') as fp:
                    fp.write('文章*************************%d\n' % len(itemList))
                    log_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                    fp.write('time:%s\n' % log_time)
                for art in itemList:
                    print(art['title'])
                    articleList.append(art)  # keep every crawled article
                    if art['title'] + "/" + art['time'] not in titleList:
                        #
                        # incremental: push unseen articles to the message queue
                        Kafka_fun(art)
                        #
                        deltaList.append(art['title'])
                print("下一组文章")
                isSuccess = True
                break
            except Exception as e:
                # log the failure, validate the proxy, and try the next one
                with open('wechatatricles_zhima.txt', 'a+') as fp:
                    fp.write('00000000000000000000000000000000000000\n')
                    fp.write('文章访问出错\n')
                    fp.write('%s\n' % str(e))
                    log_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                    fp.write('time:%s\n' % log_time)
                print("文章访问出错,检测ip是否失效")
                print(e)
                check_ip(ip)
                continue
        if (isSuccess == False):
            count = count + 1
            if (count > maxConut):
                # blocked: log and give up
                with open('wechatatricles_zhima.txt', 'a+') as fp:
                    fp.write('11111111111111111111111111111111111111111\n')
                    fp.write('尽力了,文章被封锁了!\n')
                    log_time = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
                    fp.write('time:%s\n' % log_time)
                print("尽力了,文章被封锁了!")  # return directly once blocked
                return False
            else:
                get_ip()  # refresh the proxy-IP list
                continue
        else:
            break
    print("Finish")
    with open('wechatatricles_zhima.txt', 'a+') as fp:
        fp.write('*************************\n')
        fp.write('Finish\n')
        fp.write('\n\n')
    return articleList
 def __init__(self):
     """Initialize the cleaner with placeholder base-class args and an API client."""
     super(Clean_Wechat, self).__init__('no', 'no', 'no', 'no')
     self.ws_api = wechatsogou.WechatSogouAPI()
Example #30
0
# coding=utf-8
# -*- coding utf-8 -*-
# python 2.7

import wechatsogou
import json

# direct connection; captchas are retried up to 3 times
ws_api = wechatsogou.WechatSogouAPI(captcha_break_time=3)
info = ws_api.search_gzh('商城')
# pretty-print the search result as readable (non-ASCII-escaped) JSON
data = json.dumps(info, indent=4, ensure_ascii=False)
print data