Example #1
def __init__(self):
    # Silence noisy third-party loggers; keep our own logger at DEBUG.
    logging.getLogger("wechatsogou").setLevel(logging.WARNING)
    logging.getLogger("peewee").setLevel(logging.WARNING)
    logging.getLogger("requests").setLevel(logging.WARNING)
    self.logger = logging.getLogger()
    handler = logging.StreamHandler()
    formatter = logging.Formatter(
        '%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
    handler.setFormatter(formatter)
    self.logger.addHandler(handler)
    self.logger.setLevel(logging.DEBUG)
    # Load state from MySQL: known-bad proxies, crawl targets, saved URLs.
    self.get_bad_proxies()
    self.WxTable = WechatInfo()
    self.get_conn()
    self.create_target()
    self.get_saved_data()
    # Rotating proxy pool; seed the Sogou WeChat API with an initial proxy.
    self.proxies_list = NewGenerationProxy({
        'anony': 'L4',
        'post': 'false',
        'speed': 3000
    })
    proxyLine = self.proxies_list.getProxy()
    self.wx_api = wechatsogou.WechatSogouAPI(timeout=8,
                                             proxies={
                                                 'http': proxyLine,
                                                 'https': proxyLine
                                             })
    SpiderConfig = Config.SpiderConfig
    self.headers = SpiderConfig.headers.json()
    self.weChat_table = WechatInfo()
    self.proxies_table = UnableProxies()
    self.crawled_table = CrawledData()
    # Ruokuai captcha-recognition client.
    self.rk = RClient('ghost2017b', 'Ghost2017b', '107539',
                      'a8bd936aa1574ddb96d14564c1a0d022')
Example #2
import time

import requests

from rk import RClient

# connect_mongodb() and yonghuming() are helpers from the original project
# (MongoDB connection factory and username generator); they are not shown here.


def register(phone):
    coll_insert = connect_mongodb('192.168.0.235', 27017, 'fgg', 'user_info')
    s = requests.session()
    rc = RClient('ruokuaimiyao', 'goojia123456', '95632',
                 '6b8205cf61944329a5841c30e5ed0d5d')
    proxies = {'http': 'http://192.168.0.93:4234/'}
    c = int(time.time() * 1000)
    s.get('http://www.fungugu.com/ShenQingShiYong/fillInformation',
          proxies=proxies)
    jrbqiantai = s.cookies.get_dict()['jrbqiantai']
    cookie = 'Hm_lpvt_203904e114edfe3e6ab6bc0bc04207cd=' + str(
        c) + ';Hm_lvt_203904e114edfe3e6ab6bc0bc04207cd=' + str(
            c) + ';jrbqiantai=' + jrbqiantai
    headers = {
        'Cookie': cookie,
        'Referer': 'http://www.fungugu.com/ShenQingShiYong/fillInformation',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
    }
    print(headers)
    while True:
        url = 'http://www.fungugu.com/yzmControlller/refreshPicCode?id=pwdCode&' + str(
            c)
        response = s.get(url=url, proxies=proxies)
        code = rc.rk_create(response.content, 3040)['Result']
        print('code:', code)
        params = {
            'code': code,
            'type': 'pwdCode',
        }
        code_url = 'http://www.fungugu.com/yzmControlller/verificationPicCode'
        result = s.post(url=code_url, params=params, proxies=proxies)
        print(result.content)
        if 'true' in result.text:
            break
    params = {'phone': phone, 'picCode': code}
    # Send the SMS verification code to the phone number
    send_url = 'http://www.fungugu.com/yzmControlller/sendCode'
    result = s.post(url=send_url, params=params, proxies=proxies)
    print(result.content.decode())
    print(phone)
    # phone_verification = get_msg(token, phone)
    phone_verification = input('SMS verification code: ')
    params = {'securityCode': phone_verification, 'phone': phone}
    phone_url = 'http://www.fungugu.com/yzmControlller/verificationSMSCode'
    result = s.post(url=phone_url, params=params, proxies=proxies)
    print('aaa:', result.content)
    user_name = yonghuming()
    params = {
        'keyCode': '',
        'keHuShouJi': phone,  # customer phone number
        'yongHuMing': user_name,  # username
        'keHuMiMa': '4ac9fa21a775e4239e4c72317cdca870',  # password (MD5 hash)
        'quanXianChengShi': '上海',  # permitted city: Shanghai
        'jiGouMingCheng': user_name,  # organization name
    }
    register_url = 'http://www.fungugu.com/ShenQingShiYong/completionUserInfo'
    result = s.post(url=register_url, params=params, proxies=proxies)
    print(result.text)
    if 'true' in result.text:
        data = {
            'user_name': user_name,
        }
        coll_insert.insert_one(data)
        print('inserted successfully')
    else:
        print('error')
Example #3
# -*- coding: utf-8 -*-

import requests
import re
import time
import getpass
from rk import RClient
'''
Global cookies: the cookies returned by every network request are merged here.
'''
all_cookies = {}
# Create the Ruokuai (若快) captcha-recognition client
rc = RClient('ljc1998', 'ljc19980217.', '117226',
             'abf23a6f920644d9b8db7908b773f16a')
'''
Define the URLs that will be used
'''
url = {
    'yiban_login': '******',  # GET, login page
    'yiban_do_login': '******',  # POST, submit the login request
    'yiban_index': 'https://www.yiban.cn/',  # GET, Yiban home page
    'yiban_app_base': 'https://q.yiban.cn/app/index/appid/',  # GET, Yiban app page; append the app id
    'yiban_signup_get': 'https://q.yiban.cn/signup/getSignupAjax/',  # POST, query the lecture's signup status
    'yiban_signup_insert': 'https://q.yiban.cn/signup/insertBoxAjax/',  # POST, entry point for grabbing a lecture slot
    'yiban_captcha_get': 'https://www.yiban.cn/captcha/index/'  # GET, fetch the login captcha (see sketch below)
}
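
# A minimal sketch of how the captcha endpoint and the Ruokuai client above
# might be combined. solve_login_captcha() is a hypothetical helper, not part
# of the original snippet; the type id 3040 is borrowed from the rk_create()
# calls this client is used with elsewhere in these examples.
def solve_login_captcha(session):
    # Fetch the captcha image bound to the current login session.
    resp = session.get(url['yiban_captcha_get'])
    # Send the raw image bytes to Ruokuai; 'Result' holds the recognized text.
    result = rc.rk_create(resp.content, 3040)
    return result.get('Result', '')
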
'''
Utility function: merge the cookies returned by a network request into the
global cookies and return the merged result.
'''
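# The function body itself was cut off in the source; what follows is an
# assumed reconstruction of the behavior the docstring describes, under a
# hypothetical name.
def merge_cookies(response):
    # Fold this response's cookies into the global cookie store.
    all_cookies.update(response.cookies.get_dict())
    return all_cookies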
Example #4
# Stdlib and third-party imports used by this class. WechatInfo,
# UnableProxies, CrawledData, NewGenerationProxy, Config, DBinfo, RClient
# and the q_proxies queue are project-local names from the original repo.
import logging
import random
import re
import time
import traceback
from urllib import parse

import pymysql
import requests
import wechatsogou
from wechatsogou.structuring import WechatSogouStructuring


class Crawl(object):
    def __init__(self):
        logging.getLogger("wechatsogou").setLevel(logging.WARNING)
        logging.getLogger("peewee").setLevel(logging.WARNING)
        logging.getLogger("requests").setLevel(logging.WARNING)
        self.logger = logging.getLogger()
        handler = logging.StreamHandler()
        formatter = logging.Formatter(
            '%(asctime)s [%(threadName)s][%(levelname)s] %(message)s')
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)
        self.get_bad_proxies()
        self.WxTable = WechatInfo()
        self.get_conn()
        self.create_target()
        self.get_saved_data()
        self.proxies_list = NewGenerationProxy({
            'anony': 'L4',
            'post': 'false',
            'speed': 3000
        })
        proxyLine = self.proxies_list.getProxy()
        self.wx_api = wechatsogou.WechatSogouAPI(timeout=8,
                                                 proxies={
                                                     'http': proxyLine,
                                                     'https': proxyLine
                                                 })
        SpiderConfig = Config.SpiderConfig
        self.headers = SpiderConfig.headers.json()
        self.weChat_table = WechatInfo()
        self.proxies_table = UnableProxies()
        self.crawled_table = CrawledData()
        self.rk = RClient('ghost2017b', 'Ghost2017b', '107539',
                          'a8bd936aa1574ddb96d14564c1a0d022')

    def get_conn(self):
        conn = pymysql.connect(host=DBinfo.MYSQL_HOST,
                               port=DBinfo.MYSQL_PORT,
                               user=DBinfo.MYSQL_USER,
                               passwd=str(DBinfo.MYSQL_PASSWD),
                               db=DBinfo.MYSQL_DANAME,
                               charset='utf8')
        return conn

    def get_saved_data(self):
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select main_url from crawleddata"""
        cursor.execute(sql)
        self.saved_data_list = [x for x, in cursor.fetchall()]

    def get_bad_proxies(self):
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select object from unableproxies """
        cursor.execute(sql)
        self.bad_proxies_list = [x for x, in cursor.fetchall()]

    def create_target(self):
        self.target_list = []
        conn = self.get_conn()
        cursor = conn.cursor()
        sql = """select jobTypeId , jobTypeName from data_job_type where level = '3'"""
        cursor.execute(sql)
        data = cursor.fetchall()
        for num, item in enumerate(data):
            jobTypeId = item[0]
            jobTypeName = item[1]
            target = {
                'jobTypeId': jobTypeId,
                'jobTypeName': jobTypeName,
                'items': []
            }
            self.target_list.append(target)
        cursor.close()

    # def get_proxy(self, scene='default'):
    #     while True:
    #         # url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=c6ffc08035cc49f49ed15c834ba2c8ee&orderno=YZ20185305368PAYAqN&returnType=1&count=1'
    #         # url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId=c6ffc08035cc49f49ed15c834ba2c8ee&orderno=YZ2018634506zLQTDj&returnType=2&count=1'
    #         # url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=1&pro=&city=0&yys=0&port=1&time=1&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions='
    #         url = 'http://dev.kdlapi.com/api/getproxy/?orderid=953059683695998&num=1&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=2&method=1&an_an=1&an_ha=1&sep=1'
    #         html = requests.get(url).text
    #         time.sleep(random.randint(5, 10))
    #         if re.findall('普通订单5秒钟内只允许提取1次', html):
    #             self.logger.debug('need to get proxy ' + scene + ' again!')
    #             continue
    #
    #         try:
    #             proxyLine = html
    #             proxies = {
    #                 'https': proxyLine
    #             }
    #         except:
    #             pprint(html)
    #             continue
    #
    #         if str(proxies) in self.bad_proxies_list:
    #             self.logger.debug('ip is bad')
    #             continue
    #         else:
    #             self.logger.debug(u'Update %s proxy to %s' % (scene, proxyLine))
    #             return proxies  # proxyLine   proxies
    '''
        for run.py
    '''

    def get_target_list(self, target):
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []
        for i in range(page):
            url = u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation=' % (
                i + 1, parse.quote(str(keyword)))
            if url in self.saved_data_list:
                self.logger.debug('url already crawled %s/%s' %
                                  (keyword, i + 1))
                continue
            while True:

                def identify_image_callback(image):
                    result = self.rk.rk_create(image, 3060)
                    if 'Result' in result:
                        self.logger.debug(u'Captcha: %s, ID: %s' %
                                          (result['Result'], result['Id']))
                        return result['Result']
                    self.logger.debug(result)
                    return ''

                try:
                    target_little_list = self.wx_api.search_article(
                        target['jobTypeName'],
                        page=i + 1,
                        identify_image_callback=identify_image_callback,
                    )
                    target['main_url'].append(url)
                    self.logger.debug(
                        u'Name: %s, Page: %s, Count: %s' %
                        (target['jobTypeName'], i, len(target_little_list)))
                    break
                except Exception as e:
                    self.logger.debug(traceback.format_exc())
                    self.logger.debug(u'Name: %s, Page: %s, Error: %s' %
                                      (target['jobTypeName'], i, repr(e)))
                    proxyLine = self.proxies_list.getProxy()
                    self.logger.debug(u'Update proxy to %s' % (proxyLine))
                    self.wx_api.requests_kwargs['proxies'] = {
                        "https": proxyLine,
                    }
            if len(target_little_list) == 0:
                break
            target['items'].append(target_little_list)
        return target

    def get_target_list_v2(self, target):
        page = 10
        keyword = target['jobTypeName']
        target['items'] = []
        target['main_url'] = []
        first_data = 'init_proxy'
        for i in range(page):
            while True:
                url = u'http://weixin.sogou.com/weixin?type=2&page=%s&ie=utf8&query=%s&interation=' % (
                    i + 1, parse.quote(keyword))
                self.headers['Referer'] = url
                self.headers['Cookie'] = 'SUV="";SNUID="";'
                if url not in self.saved_data_list:
                    try:
                        self.logger.debug("queue get before size:%s" %
                                          q_proxies.qsize())
                        proxies = q_proxies.get()
                        self.logger.debug("queue get after size :%s" %
                                          q_proxies.qsize())
                        resp = requests.get(
                            url,
                            proxies=proxies,
                            headers=self.headers,
                            timeout=8,
                        )
                        if resp.ok:
                            if u'antispider' in resp.url:
                                # TODO: log proxies that were flagged as
                                # crawlers to the database #
                                self.proxies_table.create(
                                    object=proxies['http'], )
                                self.logger.debug(
                                    u'Name: %s, Page: %s, DetachAntiSpider: %s'
                                    % (keyword, str(i + 1), proxies['http']))
                                first_data = 'detach_spider'
                                continue
                            else:
                                time.sleep(random.randint(2, 5))
                                target_little_list = WechatSogouStructuring.get_article_by_search(
                                    resp.text)
                                if len(target_little_list) == 0:
                                    break  # break: this page has no data
                                target['items'].append(target_little_list)
                                target['main_url'].append(url)
                                self.logger.debug(
                                    'got items for %s page %d, total %d' %
                                    (keyword, i + 1,
                                     len(target_little_list)))
                                break  # break for success
                        else:
                            self.logger.debug(
                                u'Name: %s, Page: %s, HttpError: %s' %
                                (keyword, str(i + 1), str(resp.status_code)))
                            first_data = 'http_error'
                    except Exception as e:
                        self.logger.debug(u'Name: %s, Page: %s, Error: %s' %
                                          (keyword, i + 1, type(e)))
                        first_data = 'catch_exception'
                else:
                    self.logger.debug('url already crawled')
                    break  # break: url already exists
        self.logger.debug(u'Name: %s, Total: %s' %
                          (keyword, len(target['items'])))
        return target

    def get_data(self, target):
        try:
            for num, topItem in enumerate(target['items']):
                url = target['main_url'][num]
                self.crawled_table.create(
                    main_url=url,
                    target=target['jobTypeName'],
                )
                for item in topItem:
                    detail_url = item['article']['url']
                    html = requests.get(url=detail_url,
                                        headers=self.headers).text
                    content = re.findall(
                        '<div class="rich_media_content " lang="en" id="js_content">([\s\S]*)</div>',
                        html)
                    biz = re.findall('var biz = ""\|\|"(.*?)"', html)
                    # Matches the notice text "this content has been deleted
                    # by its author" on the article page.
                    is_delete = re.findall('该内容已被发布者删除', html)
                    if len(is_delete) == 0:
                        wechatId = re.findall(
                            '"profile_meta_value">(.*?)</span>', html)
                        wechatId = wechatId[0] if wechatId else ''
                        biz = biz[0] if biz else ''
                        obj_wechat = self.weChat_table.create(
                            url=detail_url,
                            jobTypeId=target['jobTypeId'],
                            title=item['article']['title'],
                            content=''.join(content),
                            desc=item['article']['abstract'],
                            createAt=time.strftime(
                                '%Y-%m-%dT%H:%M:%S',
                                time.localtime(item['article']['time'])),
                            wechatId=wechatId,
                            wechatName=item['gzh']['wechat_name'],
                            biz=biz,
                        )
                        self.logger.debug(
                            'item: %s save id: %s' %
                            (target['jobTypeName'], str(obj_wechat.id)))
                    else:
                        self.logger.debug('skipped: item deleted by its author')
        except Exception as e:
            self.logger.warning('something went wrong: ' + repr(e))
Example #5
    def insert_mysql(self, result):
        self.cursor.execute(
            """INSERT INTO {table} (username, password, mobile, api, cookie, id)
            VALUES (%s, %s, %s, %s, %s, %s)""".format(table=TABLE),
            (result['username'], str(result['password']),
             str(result['mobile']), result['api'], result['cookie'],
             result['id']))
        self.connect.commit()


if __name__ == '__main__':
    while True:
        try:
            suma = SuMa()
            rc = RClient('tosshl1985', 'sa123456', '106174',
                         '4a5d4d1e0d334a5b8f59e1ad94ada2cc')
            zhima = ZhiMa()
            mobile = suma.mobile
            session = zhima.getPic()
            vcode = rc.run()
            code = zhima.getVocde(mobile, vcode, session)
            if code == '1':
                print('ok')
                phone_code = suma.getVcodeAndHoldMobilenum()
                if phone_code:
                    zhima.login(mobile, vcode, phone_code, session)
            else:
                suma.addIgnoreList(mobile)
        except Exception as e:
            print(e)