Example #1
 def __init__(self):
     self.crawl = Crawl()
     self.analysis = Analysis()
     self.pipe = Pipeline()
     self._use_log()
     try:
         # sys.argv[1] is expected to hold a dict literal; evaluating the full
         # sys.argv[1:] list can never succeed, so only the first argument is parsed
         self.args_dict = eval(sys.argv[1])
         if not isinstance(self.args_dict, dict):
             raise ValueError('args must be a key-value dict')
     except Exception as e:
         self.args_dict = {}
         logging.warning('get args failed:{}'.format(e))
     self.proxies = self.args_dict.get('proxies')  # proxy configuration
     self.hdfs = self.args_dict.get('hdfs', {})  # HDFS configuration
     # raise immediately and do not run if either parameter is missing
     if not self.hdfs or not self.proxies:
         raise ValueError('args not have hdfs or proxies')
     self.sleep_time = self.args_dict.get('sleep_time', 0.2)  # sleep interval
     self.service_args = self.args_dict.get('service_args',
                                            {})  # PhantomJS proxy/service settings
     self.aliyun_log = self.args_dict.get('aliyun_log', {})
     self.alilog = AliyunLog(
         '{}_{}'.format(setting.OTA_NAME, setting.CATEGORY_NAME),
         endp=self.aliyun_log.get('endpoint', endpoint),
         accid=self.aliyun_log.get('accessKeyId', accessKeyId),
         acckey=self.aliyun_log.get('accessKey', accessKey),
         proj=self.aliyun_log.get('project', project),
         logst=self.aliyun_log.get('logstore',
                                   logstore))  # Aliyun log configuration; verify whether a missing parameter raises
     try:
         self.HDFS = HDFileSystem(host=self.hdfs.get(
             'ip', '192.168.100.178'),
                                  port=self.hdfs.get('port', 8020))
     except Exception:
         # swallow HDFS connection errors so construction can continue without a client
         pass
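Not part of the original listing: a minimal sketch of the kind of command-line argument the constructor above appears to expect, using ast.literal_eval instead of eval for safety; the script name and every value are placeholders.
# Hypothetical sketch only: parse a dict literal passed as the first CLI argument,
# mirroring the argument handling of the constructor above.
import ast
import sys

if __name__ == '__main__':
    # e.g. python run_engine.py "{'proxies': {'http': 'http://127.0.0.1:8888'}, 'hdfs': {'ip': '192.168.100.178', 'port': 8020}}"
    args_dict = ast.literal_eval(sys.argv[1]) if len(sys.argv) > 1 else {}
    if not isinstance(args_dict, dict):
        raise ValueError('args must be a key-value dict')
    print(args_dict.get('proxies'), args_dict.get('hdfs', {}))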
Example #2
def cmd_crawl(args, options):
    if len(args) != 1:
        logging.error("Missing build URL")
        return 1
    if options.to_file and not os.path.exists(options.to_file):
        os.mkdir(options.to_file)
    if options.from_file and not os.path.exists(options.from_file):
        os.mkdir(options.from_file)
    db = open_db(options)
    crawl = Crawl(db, options)
    if options.reverse:
        roots = crawl.reverse_crawl(args[0])
    else:
        roots = crawl.crawl(args[0])
    close_db(db)
    stat = roots[0].extra
    logging.info("Started: %s\n\tend: %s\n\telapsed: %s\n\tduration: %ss\n\tNb builds: %s\n\ttrhoughput: %s\n" % (
            stat['start'], stat['stop'], stat['elapsed'], stat['duration'], stat['count'], stat['throughput']))
    if not options.output:
        svg_file = roots[0].getId() + ".svg"
    else:
        svg_file = options.output
    graphviz(roots, svg_file)
    logging.info("%s generated." % svg_file)
    return 0
Example #3
def thread_func(filename,cur):
    c = Crawl()
    
    # read the uploaded file line by line
    f = open('uploads/'+filename,'r')
    i = 1
    while True:
        print(cur,i)
       
        line = f.readline().strip('\n')
        if i<=cur:
            i = i+1
            continue
        rs = Setting.query.filter_by(name='is_crawl').first()
        
        if rs.value == '0':
            break
        if not line:
            break
        time.sleep(1)
        flag = c.crawl(line)
        
        if flag:
            db.session.add(PhoneList(filename=filename,phonenumber=str(line),status="2",opt_time=int(time.time())))
            db.session.commit()
        else:
            db.session.add(PhoneList(filename=filename,phonenumber=str(line),status="1",opt_time=int(time.time())))
            db.session.commit()
        pass # do something
    f.close()
Example #4
    def __init__(self, goal, time):
        '''goal = today's subscription target (increment)
            time = refresh interval (minutes)'''
        self.goal = goal
        self.time_in_seconds = time * 60
        self.c = Crawl(goal)  # initialize the Crawler
        # set up the GUI
        self.root = Tk()

        ###########################     set the initial window position ##################
        self.root.geometry('220x45+40+560')  # width x height + shift right + shift down
        #####################################################################

        self.root.title('就是要莽')
        top_frame = Frame(self.root)  # top frame displays the counters
        top_frame.pack(fill=BOTH)
        self.label_text1 = StringVar()
        self.label_text1.set('今日订阅:')
        text_label = Label(top_frame, textvariable=self.label_text1, font="32")
        text_label.grid(row=0, sticky='w')
        self.cur_num = StringVar()  # current subscriber count
        num_label = Label(top_frame,
                          textvariable=self.cur_num,
                          fg="red",
                          font="28")
        num_label.grid(row=0, column=1, sticky='e')
        self.label_text2 = StringVar()
        self.label_text2.set('/' + str(self.goal))
        objective_label = Label(top_frame,
                                textvariable=self.label_text2,
                                font="28")
        objective_label.grid(row=0, column=2, sticky='w')
        top_frame.columnconfigure(0, weight=4)  # adjust widget layout
        top_frame.columnconfigure(1, weight=2)
        top_frame.columnconfigure(2, weight=2)

        bottom_frame = Frame(self.root)  # bottom frame for manually refreshing the count
        bottom_frame.pack(fill=BOTH, side=BOTTOM)
        refresh_button = Button(bottom_frame, text='手动刷新', font="25")
        refresh_button.bind('<Button-1>', self.refresh)
        refresh_button.grid(row=0, column=0, sticky=("N", "S", "E", "W"))
        fans_button = Button(bottom_frame, text='当前订阅', font="25")
        fans_button.bind('<Button-1>', self.refresh_total_fans)
        fans_button.grid(row=0, column=1, sticky=("N", "S", "E", "W"))
        bottom_frame.columnconfigure(0, weight=1)
        bottom_frame.columnconfigure(1, weight=1)
        self.root.rowconfigure(0, weight=3)  # adjust widget layout
        self.root.rowconfigure(1, weight=1)

        t = threading.Thread(target=self.start_crawl)  # start crawling in the background
        t.daemon = True
        t.start()
        self.root.mainloop()
Example #5
def wrap_crawl(url, threads, user_agent, proxy, timeout, obey_robots, max_urls,
               data_format):
    freeze_support()
    seo = Crawl(url,
                threads=threads,
                user_agent=user_agent,
                proxy=proxy,
                timeout=timeout,
                obey_robots=obey_robots,
                max_urls=max_urls,
                data_format=data_format)
    seo.run_crawler()
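Not from the original project: a hypothetical call to the wrap_crawl() wrapper above, shown only to illustrate the shape of the arguments; every value here is a placeholder.
# Hypothetical usage of wrap_crawl() as defined above; URL and settings are placeholders.
if __name__ == '__main__':
    wrap_crawl('https://example.com',
               threads=4,
               user_agent='Mozilla/5.0 (compatible; example-bot)',
               proxy=None,
               timeout=10,
               obey_robots=True,
               max_urls=200,
               data_format='csv')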
Example #6
 def crawl(self):
     crawl = Crawl()
     proxies = []
     self.logger.info('crawl beginning -------')
     for parser in PARSER_LIST:
         for url in parser['urls']:
             self.logger.info('crawling {0}'.format(url))
             result = crawl.run(url, parser)
             proxies.extend(result)
     self.logger.info('crawl end -------\n'
                      'crawl {0} ips'.format(len(proxies)))
     return proxies
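The loop above only shows that each PARSER_LIST entry carries a 'urls' list and is passed whole to crawl.run(); a configuration entry might therefore look like the sketch below, where every key other than 'urls' is an illustrative assumption.
# Hypothetical PARSER_LIST entry; only the 'urls' key is visible in the method above,
# the 'name' and 'pattern' keys are assumptions added for illustration.
PARSER_LIST = [
    {
        'name': 'example-proxy-source',
        'urls': ['http://www.example.com/free-proxy/1',
                 'http://www.example.com/free-proxy/2'],
        'pattern': '//table//tr',
    },
]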
Example #7
class Environment(Base):

	def __init__(self,root="."):
		self.search_path = Crawl(root)
		self.version = ''
		self.cache = None

		self.engines = copy.deepcopy(engine_registry)
		self.mimetypes = copy.deepcopy(mimetype_registry)
		self.processors = copy.deepcopy(processor_registry)

		class ctx(Context):
			pass

		self.context_class = ctx

		for path in path_registry.paths:
			self.search_path.append_path(path)

		for extension in self.mimetypes.mimetypes.keys():
			self.search_path.append_extension(extension)

		for ext,engine in self.engines.engines.iteritems():
			self.add_engine_to_search_path(ext,engine)

	@property
	def index(self):
		return Index(self)

	def find_asset(self,path,**options):

		if not options:
			options = {}

		if not options.has_key('bundle'):
			options['bundle'] = True

		key = self.cache_key_for(path,**options)
		asset = self.assets[key] if self.assets.has_key(key) else None
		if asset and asset.is_fresh(self):
			return asset
		else:
			asset = self.index.find_asset(path,**options)
			if asset:
				return asset

		return None

	def expire_index(self):
		self._digest = None
		self.assets = {}
Example #8
class Environment(Base):
    def __init__(self, root="."):
        self.search_path = Crawl(root)
        self.version = ''
        self.cache = None

        self.engines = copy.deepcopy(engine_registry)
        self.mimetypes = copy.deepcopy(mimetype_registry)
        self.processors = copy.deepcopy(processor_registry)

        class ctx(Context):
            pass

        self.context_class = ctx

        for path in path_registry.paths:
            self.search_path.append_path(path)

        for extension in self.mimetypes.mimetypes.keys():
            self.search_path.append_extension(extension)

        for ext, engine in self.engines.engines.iteritems():
            self.add_engine_to_search_path(ext, engine)

    @property
    def index(self):
        return Index(self)

    def find_asset(self, path, **options):

        if not options:
            options = {}

        if not options.has_key('bundle'):
            options['bundle'] = True

        key = self.cache_key_for(path, **options)
        asset = self.assets[key] if self.assets.has_key(key) else None
        if asset and asset.is_fresh(self):
            return asset
        else:
            asset = self.index.find_asset(path, **options)
            if asset:
                return asset

        return None

    def expire_index(self):
        self._digest = None
        self.assets = {}
Example #9
 def __init__(self, name):
     self.crawl = Crawl()
     self.analysis = Analysis()
     self.pipe = Pipeline()
     self.options = webdriver.ChromeOptions()
     # set the default download directory
     prefs = {
         'profile.default_content_settings.popups': 0,
         'download.default_directory': os.path.abspath('DATA')
     }
     self.options.add_experimental_option('prefs', prefs)
     self.driver = webdriver.Chrome(chrome_options=self.options)
     self.realname = name
     self.name = str(name.encode('gbk'))[2:-1].replace('\\x', '%').upper()
Example #10
	def __init__(self):
		init_params = {
			'site_name' : SITE_NAME,
			'init_url'  : INIT_URL,
			'skip_url'  : SKIP_URL,
			'redis_crawling_urls' : REDIS_CRAWLING_URLS,
			'redis_crawled_urls' : REDIS_CRAWLED_URLS,
			'redis_product_urls' : REDIS_PRODUCT_URLS,
			'product_pattern' : PRODUCT_PATTERN,
			'process_num' : PROCESS_NUM,
			'use_tor' : USE_TOR
		}
		Crawl.__init__(self, **init_params)
		#select collection
		self.mongo_collection = self.mongo_conn['nguyenkim_product']
Example #11
    def __init__(self):
        self.count = {
            'count': 0,  # total crawled
            'failed_count': 0,  # total failed
            'sucess_count': 0,  # total succeeded
            'start_time': time.asctime(),  # start time
            'end_time': 0,  # end time
        }
        self.endtime = time.localtime().tm_min + 1
        self.proxy = next(proxies)
        self.Crawl = Crawl()
        self.Crawl.proxy = self.proxy

        self.Taskqueue = Queue()
        self.Urlqueue = Queue()
Example #12
    def start_crawl(self):
        # Validation for the inputs. (if i got time)

        # Start the crawl
        Crawl(self.website_url_input.get(), self.crawl_depth_input.get(),
              self.user_defined_regex_input.get())
        print("Crawl finished")
Example #13
	def __init__(self):
		init_params = {
			'site_name' : SITE_NAME,
			'init_url'  : INIT_URL,
			'skip_url'  : SKIP_URL,
			'redis_crawling_urls' : REDIS_CRAWLING_URLS,
			'redis_crawled_urls' : REDIS_CRAWLED_URLS,
			'redis_product_urls' : REDIS_PRODUCT_URLS,
			'product_pattern' : PRODUCT_PATTERN,
			'process_num' : PROCESS_NUM,
			'use_tor' : USE_TOR
		}
		Crawl.__init__(self, **init_params)
		#select collection
		self.mongo_collection = self.mongo_conn['tiki_product']

		self.page_link_format = re.compile(r'(.*)\?.*(p=\d+).*', re.MULTILINE|re.DOTALL)
Example #14
    def __init__(self):
        init_params = {
            'site_name': SITE_NAME,
            'init_url': INIT_URL,
            'skip_url': SKIP_URL,
            'redis_crawling_urls': REDIS_CRAWLING_URLS,
            'redis_crawled_urls': REDIS_CRAWLED_URLS,
            'redis_product_urls': REDIS_PRODUCT_URLS,
            'product_pattern': PRODUCT_PATTERN,
            'process_num': PROCESS_NUM,
            'use_tor': USE_TOR
        }
        Crawl.__init__(self, **init_params)
        #select collection
        self.mongo_collection = self.mongo_conn['lazada_product']

        self.page_link_format = re.compile(r"(.*)\?.*(page=\d+).*",
                                           re.MULTILINE | re.DOTALL)
Example #15
 def crawl_price(self, item_id_inner, proxy_inner, mall_id_inner):
     if mall_id_inner == '1':
         crawl = Crawl()
         item_price_inner = crawl.get_price_jd(item_id_inner, proxy_inner)
         return item_price_inner
     elif mall_id_inner == '2':
         #crawl = Crawl()
         #item_price_inner = crawl.get_price_tm(item_id_inner, proxy_inner)
         #return item_price_inner
         temp_item_price = '-1'
         return temp_item_price
     elif mall_id_inner == '3':
         #crawl = Crawl()
         #item_price_inner = crawl.get_price_tb(item_id_inner, proxy_inner)
         #return item_price_inner
         temp_item_price = '-1'
         return temp_item_price
     else:
         return '-1'
Example #16
 def evaluation_chart(self):
     # table name for the sales-ranking data
     sales_volume_rankings_table_name = 'sales_volume_rankings'
     # table name for the hot-review ranking data
     heat_rankings_table_name = 'heat_rankings'
     # create the custom database helper
     mysql = MySQL()
     # create the crawler object
     mycrawl = Crawl()
     # connect to the database
     sql = mysql.connection_sql()
     # create a cursor
     cur = sql.cursor()
     good_rate_list = [] # list of positive-review rates
     # query the JD ids of the followed books
     attention_message = mysql.query_attention(cur, 'jd_id,book_name',
                                               sales_volume_rankings_table_name, "attention = '1'")
     for i in range(len(attention_message)):
         # fetch the positive-review rate and the review time
         good_rate,time = mycrawl.get_evaluation(0, attention_message[i][0])
         # append the followed item's name and its positive-review rate to the list
         good_rate_list.append((attention_message[i][1], good_rate))
         # first followed item
         if i == 0:
             plt1 = PlotCanvas() # create the chart canvas object
             # show the review analysis pie chart
             plt1.pie_chart(good_rate_list[0][1],
                            (100 - good_rate_list[0][1]), good_rate_list[0][0])
             # add the chart to the layout
             self.horizontalLayout_0.addWidget(plt1)
         # second followed item
         if i == 1:
             plt2 = PlotCanvas()
             plt2.pie_chart(good_rate_list[1][1],
                            (100 - good_rate_list[1][1]), good_rate_list[1][0])
             self.horizontalLayout_1.addWidget(plt2)
         # third followed item
         if i == 2:
             plt3 = PlotCanvas()
             plt3.pie_chart(good_rate_list[2][1],
                            (100 - good_rate_list[2][1]), good_rate_list[2][0])
             self.horizontalLayout_2.addWidget(plt3)
     mysql.close_sql() # close the database connection
Example #17
 def crawl_name(self, item_id_inner, proxy_inner, mall_id_inner):
     if mall_id_inner == '1':  # jd
         crawl = Crawl()
         item_name_inner = crawl.get_name_jd(item_id_inner, proxy_inner)
         return item_name_inner
     elif mall_id_inner == '2':  # tm
         #crawl = Crawl()
         #item_name_inner = crawl.get_name_tm(item_id_inner, proxy_inner)
         #return item_name_inner
         temp_item_name = '天猫价格抓取正在攻克中,名称暂不显示'
         return temp_item_name
     elif mall_id_inner == '3':  # tb
         #crawl = Crawl()
         #item_name_inner = crawl.get_name_tb(item_id_inner, proxy_inner)
         #return item_name_inner
         temp_item_name = '淘宝价格抓取正在攻克中,名称暂不显示'
         return temp_item_name
     else:
         return '该商品未设定商城名'
Example #18
def main():
    try:
        name = prompt()
        # create authenticated twitter api object
        auth = authenticate.Authenticate(creds_file='twitter_creds.BU')
        # crawl the given twitter profile for reciprocal friends
        crawl = Crawl(twitter_api=auth.twitter_api,
                      screen_name=name,
                      node_max=100)
        # crawl = Crawl(twitter_api=auth.twitter_api, screen_name='smerconish', node_max=100)
        crawl.crawl_followers()
        crawl.file_output.close()  # close file

        #create a graph object using networkx and visualize it using graphviz
        g = Graph(use_name=True,
                  twitter_api=auth.twitter_api,
                  screen_name=name)

    except Exception as e:
        print(traceback.format_exc())
Example #19
    def __init__(self, root="."):
        self.search_path = Crawl(root)
        self.version = ''
        self.cache = None

        self.engines = copy.deepcopy(engine_registry)
        self.mimetypes = copy.deepcopy(mimetype_registry)
        self.processors = copy.deepcopy(processor_registry)

        class ctx(Context):
            pass

        self.context_class = ctx

        for path in path_registry.paths:
            self.search_path.append_path(path)

        for extension in self.mimetypes.mimetypes.keys():
            self.search_path.append_extension(extension)

        for ext, engine in self.engines.engines.iteritems():
            self.add_engine_to_search_path(ext, engine)
Example #20
    def __init__(self, email='', rate=60, note=60 * 60):
        config='config.cfg'
        cfg = configparser.ConfigParser()
        parentDirPath=os.path.dirname(os.path.abspath(__file__))
        path=parentDirPath+'/config/'+config 
        cfg.read(path)
        self.option = cfg.get('select', 'option')
        self.scheduler = sched.scheduler(time.time, time.sleep)
        self.goods_dict = {}
        self.db = DB()
        self.crawl = Crawl()
        self.mail = Mail()
        self.ding = Dingding()
        self.email = [email]  # e-mail address
        self.rate = rate  # refresh interval (seconds)
        self.note = note  # notification interval (seconds)

        # load saved data
        result = self.db.query()
        print('----------加载数据----------')
        for id, item in result.items():
            self.goods_dict[id] = Goods(item['id'], item['want'], item['status'], item['dname'])
            print(self.goods_dict[id].__dict__)
        print('----------加载完成----------')
Example #21
    def __init__(self,goal,time):
        '''goal = today's subscription target (increment)
            time = refresh interval (minutes)'''
        self.goal = goal
        self.time_in_seconds = time*60
        self.c = Crawl(goal)      # initialize the Crawler
        # set up the GUI
        self.root = Tk()

        ###########################     set the initial window position ##################
        self.root.geometry('220x45+40+560')         # width x height + shift right + shift down
        #####################################################################

        self.root.title('就是要莽')
        top_frame = Frame(self.root)  # top frame displays the counters
        top_frame.pack(fill=BOTH)
        self.label_text1 = StringVar()
        self.label_text1.set('今日订阅:')
        text_label = Label(top_frame, textvariable=self.label_text1,font="32")
        text_label.grid(row=0,sticky='w')
        self.cur_num = StringVar()   # current subscriber count
        num_label = Label(top_frame, textvariable=self.cur_num,fg="red",font="28")
        num_label.grid(row=0, column=1,sticky='e')
        self.label_text2 = StringVar()
        self.label_text2.set('/'+str(self.goal))
        objective_label = Label(top_frame,textvariable=self.label_text2,font="28")
        objective_label.grid(row=0,column=2,sticky='w')
        top_frame.columnconfigure(0,weight=4)     # adjust widget layout
        top_frame.columnconfigure(1,weight=2)
        top_frame.columnconfigure(2,weight=2)

        bottom_frame = Frame(self.root)  # bottom frame for manually refreshing the count
        bottom_frame.pack(fill=BOTH, side=BOTTOM)
        refresh_button = Button(bottom_frame, text='手动刷新',font="25")
        refresh_button.bind('<Button-1>', self.refresh)
        refresh_button.grid(row=0,column=0,sticky=("N", "S", "E", "W"))
        fans_button=Button(bottom_frame,text='当前订阅',font="25")
        fans_button.bind('<Button-1>', self.refresh_total_fans)
        fans_button.grid(row=0,column=1,sticky=("N", "S", "E", "W"))
        bottom_frame.columnconfigure(0,weight=1)
        bottom_frame.columnconfigure(1,weight=1)
        self.root.rowconfigure(0,weight=3)   # adjust widget layout
        self.root.rowconfigure(1,weight=1)

        t = threading.Thread(target=self.start_crawl)   # start crawling in the background
        t.daemon = True
        t.start()
        self.root.mainloop()
Example #22
	def __init__(self,root="."):
		self.search_path = Crawl(root)
		self.version = ''
		self.cache = None

		self.engines = copy.deepcopy(engine_registry)
		self.mimetypes = copy.deepcopy(mimetype_registry)
		self.processors = copy.deepcopy(processor_registry)

		class ctx(Context):
			pass

		self.context_class = ctx

		for path in path_registry.paths:
			self.search_path.append_path(path)

		for extension in self.mimetypes.mimetypes.keys():
			self.search_path.append_extension(extension)

		for ext,engine in self.engines.engines.iteritems():
			self.add_engine_to_search_path(ext,engine)
Example #23
def get_data(usr_id, token):
    sys.path.append('../')
    from crawl import Crawl
    import time
    c = Crawl()
    print 'Start web crawl.'
    c.update([usr_id], token_list=[token])
    c.update_img([usr_id], token_list=[token])
    c.update_voice([usr_id], token_list=[token])
    print 'Crawl is finished.'

    print 'Start analysis.'
    #os.system('java -Djava.ext.dirs=../../predict/lib -jar ../../predict/predictor.jar ../../analysis/data_json/'+usr_id)
    os.system(
        'java -Djava.ext.dirs=./lib -jar predictor.jar ../../analysis/data_json/'
        + usr_id)
    print 'Analysis is finished.'

    global five_result
    #with open('../../predict/predict_result/'+usr_id+'.txt') as ifile:
    with open('predict_result/' + usr_id + '.txt') as ifile:
        five_result = eval(ifile.read())
    global finished
    finished = True
Example #24
class Work:
    def __init__(self):
        self.c=Crawl()
        self.e=Excel()

    def thread_it(self,func):
        # create a worker thread
        t = threading.Thread(target=func)
        # run it as a daemon thread
        t.daemon = True
        # start it
        t.start()

    def setUp(self):
        #pb.start()
        self.c.setUp()
        #pb.stop()

    def crawl(self):
        var.set('')
        start_row=int(start.get())
        end_row=int(end.get())
        list=self.e.get_title_list(start_row,end_row)#title_list
        print(list,flush=True)
        self.c.crawl(list)
        time.sleep(2)
        start.delete(0,tk.END)
        end.delete(0,tk.END)
        time.sleep(1)
        start.insert(0,end_row+1)
        end.insert(0,end_row+4)
        num=end_row-start_row+1
        var.set('请输入'+str(num)+'个结果 ')
        #num_list=c.insert() 
        #self.e.write_num(num_list)

    def insert(self):
        num=e.get()
        num_list=[int(i) for i in re.split('[,,]',num)]
        print(num_list,flush=True)
        self.e.write_num(num_list)
        e.delete(0,tk.END)
        var.set('数据已导入 ')

    def tearDown(self):
       self.c.tearDown()
Example #25
def get_data(usr_id, token):
    sys.path.append('../')
    from crawl import Crawl
    import time
    c = Crawl()
    print 'Start web crawl.'
    c.update([usr_id], token_list=[token])
    c.update_img([usr_id], token_list=[token])
    c.update_voice([usr_id], token_list=[token])
    print 'Crawl is finished.'
    
    print 'Start analysis.'
    #os.system('java -Djava.ext.dirs=../../predict/lib -jar ../../predict/predictor.jar ../../analysis/data_json/'+usr_id)
    os.system('java -Djava.ext.dirs=./lib -jar predictor.jar ../../analysis/data_json/'+usr_id)
    print 'Analysis is finished.'
    
    global five_result
    #with open('../../predict/predict_result/'+usr_id+'.txt') as ifile:
    with open('predict_result/'+usr_id+'.txt') as ifile:
        five_result = eval(ifile.read())
    global finished
    finished = True
Example #26
class Monitor:
    def __init__(self, email='', rate=60, note=60 * 60):
        config='config.cfg'
        cfg = configparser.ConfigParser()
        parentDirPath=os.path.dirname(os.path.abspath(__file__))
        path=parentDirPath+'/config/'+config 
        cfg.read(path)
        self.option = cfg.get('select', 'option')
        self.scheduler = sched.scheduler(time.time, time.sleep)
        self.goods_dict = {}
        self.db = DB()
        self.crawl = Crawl()
        self.mail = Mail()
        self.ding = Dingding()
        self.email = [email]  # e-mail address
        self.rate = rate  # refresh interval (seconds)
        self.note = note  # notification interval (seconds)

        # load saved data
        result = self.db.query()
        print('----------加载数据----------')
        for id, item in result.items():
            self.goods_dict[id] = Goods(item['id'], item['want'], item['status'], item['dname'])
            print(self.goods_dict[id].__dict__)
        print('----------加载完成----------')

    # add an item to watch
    def add(self, id, want, status=True, dname=''):
        if id not in self.goods_dict.keys():
            self.db.add(id, want, status, dname)
            goods = Goods(id, want, status, dname)
            name, price, date = self.crawl.get(id)
            goods.update(name, price, date)
            self.goods_dict[id] = goods
            print(self.goods_dict[id].__dict__)
            return True
        else:
            return False

    # remove an item
    def remove(self, id):
        if id in self.goods_dict.keys():
            self.goods_dict.pop(id)
            self.db.delete(id)
            return True
        else:
            return False

    # update the target price
    def update_want(self, id, want):
        if id in self.goods_dict.keys():
            self.goods_dict[id].update_want(want)
            self.goods_dict[id].update_note(0)  # reset the notification time
            self.db.update_want(id, want)
            return True
        else:
            return False

    # update the running status
    def update_status(self, id, status):
        if id in self.goods_dict.keys():
            self.goods_dict[id].update_status(status)
            self.goods_dict[id].update_note(0)  # reset the notification time
            self.db.update_status(id, status)
            return True
        else:
            return False

    # get the price history
    def history(self, id):
        if id in self.goods_dict.keys():
            return self.crawl.get_history(id)
        else:
            return ''

    # scheduled task: refresh prices and send notifications
    def task(self):
        ids = list(self.goods_dict.keys())
        for id in ids:
            goods = self.goods_dict[id]
            if goods.status:
                name, price, date = self.crawl.get(id)
                if id not in self.goods_dict.keys(): continue  # skip items removed in the meantime
                goods.update(name, price, date)

                ########## check whether a notification should be sent ##########
                # the notification interval has elapsed and the current price is at or below the target
                if (date - goods.note >= self.note) and (price <= goods.want):
                    if (self.option == 'mail'):
                        print('邮件发送')
                        self.mail.send(self.email, name, price, goods.want, goods.url)
                    else:
                        print('钉钉发送')
                        self.ding.send(name, price, goods.want, goods.url)
                    goods.update_note(date)
        print('----------刷新数据----------')
        for goods in self.goods_dict.values():
            print(goods.__dict__)
        print('----------刷新完成----------')

    # timer: reschedules itself every `rate` seconds
    def _run(self):
        self.scheduler.enter(self.rate, 0, self._run, ())  # delay, priority, action, argument=()
        self.task()

    # timer: start the scheduler
    def run(self):
        self.scheduler.enter(0, 0, self._run, ())  # delay, priority, action, argument=()
        self.scheduler.run()
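A minimal usage sketch for the Monitor class above, assuming its DB, Crawl, Mail and Dingding helpers are importable; the item id and target price below are placeholders.
# Hypothetical driver for the Monitor above; the id and target price are placeholders.
if __name__ == '__main__':
    monitor = Monitor(email='user@example.com', rate=60, note=60 * 60)
    monitor.add('100012043978', want=199.0, dname='example item')  # watch one item
    monitor.run()  # blocks, re-checking prices every `rate` seconds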
Example #27
from crawl import Crawl
from crawley import Crawley

crawley = Crawley()
crawley.welcome() 
url, levels, user_defined_regex = crawley.user_input()
crawl = Crawl(url, levels, user_defined_regex)

crawl.perform_crawl()
#crawl.test_variables()
crawl.save_report()
crawley.report()

while True:
    if crawley.crawl_another():
        url, levels, user_defined_regex = crawley.user_input()
        crawl = Crawl(url, levels, user_defined_regex)

        crawl.perform_crawl()
        #crawl.test_variables()
        crawl.save_report()
        crawley.report()
    else:
        crawley.goodbye()
        break
Example #28
 def __init__(self):
     self.crawl = Crawl()
     self.analysis = Analysis()
     self.pipe = Pipeline()
Example #29
class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        获取所有城市的名称和url链接,结果输出到file_city_list.txt文本中
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_amuse_link(self):
        """
        获取每个城市中所有的娱乐场所的链接
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        for each_city in city_list:
            try:
                url = each_city.strip().split('\u0001')[1] + '-wanle'
                name = each_city.strip().split('\u0001')[0]
                params_city = {'page': 0}
                maxpage = 200  # default maximum page count
                while True:
                    save_list = []
                    params_city['page'] += 1
                    content = self.crawl.crawl_by_get(
                        url,
                        headers=setting.HEADERS,
                        params=params_city,
                        proxies=self._engine_use_proxy(),
                        retry=2,
                        timeout=15)
                    if not content:
                        break
                    # get the total number of pages
                    if params_city['page'] == 1:
                        # find the maximum page number using map
                        pagecount = map(
                            lambda x: int(x) if x != '下一页' else -1,
                            self.analysis.analysis_by_xpath(
                                content, xpahter=setting.XPATH_NEXTPAGE))
                        try:
                            maxpage = max(pagecount)
                        except:
                            break
                    element_li = self.analysis.analysis_by_xpath(
                        content, xpahter=setting.XPATH_LI)
                    if not element_li:
                        break

                    for each_ele in element_li:
                        amuse_name = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_AMUSE_NAME)
                        amuse_type = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_AMUSE_TYPE)
                        amuse_url = self.analysis.analysis_by_xpath(
                            each_ele, xpahter=setting.XPATH_AMUSE_URL)
                        try:
                            save_info = '{}\u0001{}\u0001{}\u0001{}'.format(
                                name, ''.join(amuse_name), ''.join(amuse_type),
                                ''.join(amuse_url))
                        except:
                            continue
                        save_list.append(save_info)
                    self.pipe.pipe_txt_save(save_list,
                                            filename=setting.FILE_AMUSE_LIST,
                                            savetype='a')
                    if params_city['page'] >= maxpage:
                        break
                    time.sleep(0.2)
            except:
                continue

    def _engine_amuse_info(self):
        """
        获取所有娱乐场所详细数据
        :return:
        """
        amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST)
        for each_amuse in amuse_list:
            try:
                # venue record fields
                amuse_info = each_amuse.strip().split('\u0001')
                city_name = amuse_info[0]
                amuse_name = amuse_info[1]
                amuse_type = amuse_info[2]
                amuse_url = amuse_info[3]
                find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url)
                if find_id:
                    amuse_id = find_id.group(1)
                else:
                    amuse_id = 0
                # fetch the venue detail page
                content = self.crawl.crawl_by_get(
                    amuse_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy(),
                    retry=5,
                    timeout=10)
                detail = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_AMUSE_DETAIL)
                detail['city_name'] = city_name
                detail['amuse_name'] = amuse_name
                detail['amuse_type'] = amuse_type
                detail['amuse_url'] = amuse_url
                detail['amuse_id'] = amuse_id
                detail['get_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                # save the data
                # field order:
                # city_name, amuse_name, amuse_type, amuse_id,
                # score, ranking, describe, address, tel, open_time, arrive, intro, web, get_time, amuse_url
                save_data = '{0[city_name]}\u0001{0[amuse_name]}\u0001{0[amuse_type]}\u0001' \
                            '{0[amuse_id]}\u0001{0[score]}\u0001{0[ranking]}\u0001' \
                            '{0[describe]}\u0001{0[address]}\u0001{0[tel]}\u0001' \
                            '{0[open_time]}\u0001{0[arrive]}\u0001{0[intro]}\u0001' \
                            '{0[web]}\u0001{0[get_time]}\u0001{0[amuse_url]}\u0001'.format(detail)
                self.pipe.pipe_txt_save(save_data,
                                        filename=setting.FILE_AMUSE_INFO,
                                        savetype='a')
                # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info')
                time.sleep(0.1)
            except Exception as e:
                print('crawl error', e)
                continue

    def _engine_amuse_comments(self):
        """
        获取所有购物店评论数据
        :return:
        """
        amuse_list = self.pipe.pipe_txt_load(filename=setting.FILE_AMUSE_LIST)
        # latest review timestamp per shop
        check_dict = self.pipe.pipe_pickle_load(
            filename=setting.FILE_COMMENTS_CHECK)
        if not check_dict:
            check_dict = {}
        for each_amuse in amuse_list:
            try:
                # shop record fields
                city = each_amuse.strip().split('\u0001')[0]
                amuse = each_amuse.strip().split('\u0001')[1]
                type = each_amuse.strip().split('\u0001')[2]
                amuse_url = each_amuse.strip().split('\u0001')[3]
                find_id = re.search(re.compile(r'p-oi(\d+)-'), amuse_url)
                if not find_id:
                    break
                amuse_id = find_id.group(1)
                api = setting.COMMENTS_API.format(amuse_id)
                setting.HEADERS_COMMENTS['Referer'] = amuse_url
                params = {
                    'page': 0,
                    'pageSize': '10',
                    'poiList': 'true',
                    'rank': 0,  # all reviews
                    'sortField': 0  # sort by time
                }
                comments_time = set([])
                current_time = check_dict.get(amuse_id, '0')
                max_page = 1
                while True:
                    params['page'] += 1
                    content = self.crawl.crawl_by_get(
                        api,
                        headers=setting.HEADERS_COMMENTS,
                        proxies=self._engine_use_proxy(),
                        params=params,
                        retry=2,
                        timeout=15)
                    try:
                        content_dict = json.loads(content)
                    except:
                        break
                    if not content_dict.get('data'):
                        break
                    content_comments = content_dict.get('data')
                    # determine the number of review pages on the first pass
                    if params['page'] == 1:
                        page = self.analysis.analysis_by_xpath(
                            content_comments,
                            xpahter=setting.XPATH_COMMENTS_PAGE)
                        if page:
                            max_page = int(''.join(page))
                    elements_com = self.analysis.analysis_by_xpath(
                        content_comments, xpahter=setting.XPATH_COMMENTS_LI)
                    if not elements_com:
                        break
                    for each_element in elements_com:
                        title = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_TITLE)
                        start = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_START)
                        nick = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_NICK)
                        more = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_MORE)
                        if more:
                            content_more = self.crawl.crawl_by_get(
                                more[0],
                                headers=setting.HEADERS,
                                proxies=self._engine_use_proxy())
                            content = self.analysis.analysis_by_xpath(
                                content_more,
                                xpahter=setting.XPATH_COMMENTS_DETAIL)
                        else:
                            content = self.analysis.analysis_by_xpath(
                                each_element,
                                xpahter=setting.XPATH_COMMENTS_CONTENT)
                        date = self.analysis.analysis_by_xpath(
                            each_element, xpahter=setting.XPATH_COMMENTS_DATE)
                        deal_content = ''.join(
                            list(
                                map(
                                    lambda x: x.replace('\n', '').replace(
                                        '\r', '').replace('\t', '').replace(
                                            ' ', ''), content)))
                        if ''.join(date) > current_time:
                            commetents_info = {
                                'city':
                                city,
                                'amuse':
                                amuse,
                                'amuse_id':
                                amuse_id,
                                'type':
                                type,
                                'title':
                                ''.join(title),
                                'nick':
                                ''.join(nick),
                                'start':
                                ''.join(start),
                                'content':
                                deal_content,
                                'date':
                                ''.join(date),
                                'get_time':
                                datetime.datetime.now().strftime(
                                    '%Y-%m-%d %H:%M:%S'),
                                'url':
                                amuse_url
                            }
                            for eachkey in commetents_info.keys():
                                commetents_info[eachkey] = commetents_info[
                                    eachkey].replace('\n',
                                                     '').replace('\r', '')
                            # save the data
                            # field order:
                            # city, amuse, amuse_id, type, title, nick, start, content, date, get_time, url
                            save_data = '{0[city]}\u0001{0[amuse]}\u0001{0[amuse_id]}\u0001' \
                                        '{0[type]}\u0001{0[title]}\u0001{0[nick]}\u0001' \
                                        '{0[start]}\u0001{0[content]}\u0001{0[date]}\u0001' \
                                        '{0[get_time]}\u0001{0[url]}'.format(commetents_info)
                            self.pipe.pipe_txt_save(
                                save_data,
                                filename=setting.FILE_AMUSE_COMMENTS,
                                savetype='a')
                            # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments')
                            comments_time.add(''.join(date))
                    # past the last review page, move on
                    if params['page'] >= max_page:
                        break
                    # no new reviews on this page either, move to the next shop
                    if not len(comments_time):
                        break
                # latest review time seen for this shop
                if comments_time:
                    check_dict[amuse_id] = max(comments_time)
                # persist the latest-review-time checkpoint
                self.pipe.pipe_pickle_save(
                    check_dict, filename=setting.FILE_COMMENTS_CHECK)
            except:
                continue

    def _temp_city_info(self, cityname):
        """
        做22项数据处理时临时用
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break

        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        使用代理ip
        :return: 代理ip
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def start_engine(self):
        # self._engine_city_link()
        # self._engine_amuse_link()
        # shop info and shop reviews could be crawled in parallel with multiprocessing; add later as needed. A looping crawl is not implemented yet.
        # self._engine_amuse_info()
        self._engine_amuse_comments()
Example #30
# "About" window class
class About_Window(QMainWindow, About_MainWindow):
    def __init__(self):
        super(About_Window, self).__init__()
        self.setupUi(self)

    # open the window
    def open(self):
        self.show()


if __name__ == "__main__":
    # create the custom database helper
    mysql = MySQL()
    # create the crawler object
    mycrawl = Crawl()
    # connect to the database
    sql = mysql.connection_sql()
    # create a cursor
    cur = sql.cursor()

    app = QApplication(sys.argv)
    # main window object
    main = Main()
    # show the main window
    main.show()
    # sales-ranking window object
    sales = Sales()
    # hot-review ranking window object
    heat = Heat()
Example #31
"""
This script just for test and learn how to crawl web pages using python

"""

from parser import Parser
from crawl import Crawl

c = Crawl()
c.fetch('http://www.blogfa.com/')

p = Parser()
p.set_html(c.content)

p.get_title()
p.get_links()

print "count of links: %s" % len(p.links)
print "title of current url: %s" % p.title
Example #32
class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.pipe = Pipeline()
        self.analysis = Analysis()

    # def _engine_residential_area_by_json(self):
    #     """
    #     Fetch residential-area data with json output,
    #     but the json AMap returns does not match areas to their locations correctly, so this module is unused in favour of the xml version below
    #     """
    #     citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID)
    #     types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID)
    #     current_params = deepcopy(setting.PARAMS)
    #     current_params['key'] = setting.KEY
    #     # 每种类型
    #     for each_type in types:
    #         typeinfo = each_type.strip().split('\u0001')
    #         type_id = typeinfo[0]  # 类型id
    #         type_large = typeinfo[1]  # 类型大分类
    #         type_middle = typeinfo[2]  # 类型中分类
    #         type_small = typeinfo[3]  # 类型小分类
    #         current_params['types'] = type_id
    #         save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small)
    #         # 每个城市
    #         for each_city in citys:
    #             cityinfo = each_city.strip().split('\u0001')
    #             province = cityinfo[0]  # 省名
    #             city_name = cityinfo[1]  # 城市名
    #             city_id = cityinfo[2]  # 城市id
    #             current_params['city'] = city_id
    #             current_params['page'] = 0
    #             save_data = []
    #             while True:
    #                 current_params['page'] += 1
    #                 content_json = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params,
    #                                                        retry=2, timeout=30)
    #                 try:
    #                     data_json = json.loads(content_json)
    #                 except:
    #                     continue
    #                 pois_list = data_json.get('pois')
    #                 if not pois_list:
    #                     break
    #                 for each_poi in pois_list:
    #                     """
    #                     字段说明:
    #                     id: 唯一ID, name: 名称, pcode: poi所在省份编码,  pname: poi所在省份名称,citycode: 城市编码,
    #                     cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址,  alias: 别名,
    #                     biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目,
    #                     distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度,
    #                     exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据,
    #                     indoor_map: 是否有室内地图标志, location: 经纬度, navi_poiid: 地图编号, photos: 照片相关信息,
    #                     postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码,
    #                     website: 该POI的网址
    #                     """
    #                     save_dict = {}
    #                     save_dict['id'] = each_poi.get('id', '')  # id: 唯一ID
    #                     save_dict['name'] = each_poi.get('name', '')  # name: 名称
    #                     save_dict['pcode'] = each_poi.get('pcode', '')  # pcode: poi所在省份编码
    #                     save_dict['pname'] = each_poi.get('pname', '')  # pname: poi所在省份名称
    #                     save_dict['citycode'] = each_poi.get('citycode', '')  # citycode: 城市编码
    #                     save_dict['cityname'] = each_poi.get('cityname', '')  # cityname: 城市名
    #                     save_dict['adcode'] = each_poi.get('adcode', '')  # adcode: 区域编码
    #                     save_dict['adname'] = each_poi.get('adname', '')  # adname: 区域名称
    #                     save_dict['address'] = each_poi.get('address', '')  # address: 地址
    #                     save_dict['alias'] = each_poi.get('alias', '')  # alias: 别名
    #                     save_dict['biz_ext'] = each_poi.get('biz_ext', '')  # biz_ext: 深度信息
    #                     save_dict['biz_type'] = each_poi.get('biz_type', '')  # biz_type: 行业类型
    #                     save_dict['business_area'] = each_poi.get('business_area', '')  # business_area: 所在商圈
    #                     save_dict['discount_num'] = each_poi.get('discount_num', '')  # discount_num: 优惠信息数目
    #                     save_dict['email'] = each_poi.get('email', '')  # email: 该POI的电子邮箱
    #                     save_dict['entr_location'] = each_poi.get('entr_location', '')  # entr_location: 入口经纬度
    #                     save_dict['exit_location'] = each_poi.get('exit_location', '')  # exit_location: 出口经纬度
    #                     save_dict['gridcode'] = each_poi.get('gridcode', '')  # gridcode: 地理格ID
    #                     save_dict['groupbuy_num'] = each_poi.get('groupbuy_num', '')  # groupbuy_num: 团购数据
    #                     save_dict['indoor_data'] = each_poi.get('indoor_data', '')  # indoor_data: 室内地图相关数据
    #                     save_dict['indoor_map'] = each_poi.get('indoor_map', '')  # indoor_map: 是否有室内地图标志
    #                     save_dict['location'] = each_poi.get('location', '')  # location: 经纬度
    #                     save_dict['navi_poiid'] = each_poi.get('navi_poiid', '')  # navi_poiid: 地图编号
    #                     photos = each_poi.get('photos', [])  # photos: 照片相关信息
    #                     save_dict['photo_info'] = ''
    #                     for each_photo in photos:
    #                         if isinstance(each_photo.get('title', {}), dict):
    #                             each_photo['title'] = 'notitle'
    #                         save_dict['photo_info'] += '{0[title]}-{0[url]},'.format(each_photo)
    #                     save_dict['postcode'] = each_poi.get('postcode', '')  # postcode: 邮编
    #                     save_dict['tag'] = each_poi.get('tag', '')  # tag: 该POI的特色内容
    #                     save_dict['tel'] = each_poi.get('tel', '')  # tel: 该POI的电话
    #                     save_dict['type'] = each_poi.get('type', '')  # type: 兴趣点类型
    #                     save_dict['typecode'] = each_poi.get('typecode', '')  # typecode: 兴趣点类型编码
    #                     save_dict['website'] = each_poi.get('website', '')  # website: 该POI的网址
    #                     for each_key in save_dict.keys():
    #                         save_dict[each_key] = \
    #                             save_dict[each_key] if not isinstance(save_dict[each_key], dict) else ''
    #                     # 存储字段类型
    #                     # id, name, pcode, pname, citycode, cityname, adcode, adname,
    #                     # address, alias, biz_type, business_area, discount_num, email,
    #                     # entr_location, exit_location, gridcode, groupbuy_num, indoor_data,
    #                     # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website,
    #                     save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \
    #                                 '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \
    #                                 '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \
    #                                 '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \
    #                                 '{0[exit_location]}\u0001' \
    #                                 '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \
    #                                 '{0[indoor_map]}\u0001' \
    #                                 '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \
    #                                 '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \
    #                                 '{0[website]}'.format(save_dict)
    #                     save_data.append(save_info)
    #                     time.sleep(0.1)
    #             self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a')

    def _engine_residential_area(self):
        """获取小区数据"""
        citys = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_ID)
        types = self.pipe.pipe_txt_load(filename=setting.FILE_TYPE_ID)
        current_params = deepcopy(setting.PARAMS)
        current_params['key'] = setting.KEY
        # for each POI type
        for each_type in types:
            typeinfo = each_type.strip().split('\u0001')
            type_id = typeinfo[0]  # type id
            type_large = typeinfo[1]  # top-level category
            type_middle = typeinfo[2]  # mid-level category
            type_small = typeinfo[3]  # fine-grained category
            current_params['types'] = type_id
            save_filename = '{}_{}_{}_{}.txt'.format(type_id, type_large, type_middle, type_small)
            # for each city
            for each_city in citys:
                cityinfo = each_city.strip().split('\u0001')
                province = cityinfo[0]  # province name
                city_name = cityinfo[1]  # city name
                city_id = cityinfo[2]  # city id
                current_params['city'] = city_id
                current_params['page'] = 0
                save_data = []
                while True:
                    current_params['page'] += 1
                    content = self.crawl.crawl_by_get(setting.SEARCH_API, params=current_params,
                                                      retry=2, timeout=30)
                    try:
                        con = re.search(re.compile(r'<response>(.*?)</response>', re.S), content).group(1)
                        pois_list = self.analysis.analysis_by_xpath(con, xpahter=setting.XPATH_POIS)
                    except:
                        continue
                    if not pois_list:
                        break
                    for each_poi in pois_list:
                        """
                        字段说明:
                        id: 唯一ID, name: 名称, pcode: poi所在省份编码,  pname: poi所在省份名称,citycode: 城市编码, 
                        cityname: 城市名,adcode: 区域编码, adname: 区域名称,address: 地址,  alias: 别名, 
                        biz_ext: 深度信息, biz_type: 行业类型, business_area: 所在商圈, discount_num: 优惠信息数目,
                        distance: 离中心点距离(此结果仅在周边搜索的时候有值), email: 该POI的电子邮箱, entr_location: 入口经纬度,
                        exit_location: 出口经纬度, gridcode: 地理格ID, groupbuy_num: 团购数据, indoor_data: 室内地图相关数据, 
                        indoor_map: 是否有室内地图标志, location: 经纬度, navi_poiid: 地图编号, photos: 照片相关信息, 
                        postcode: 邮编, tag: 该POI的特色内容, tel: 该POI的电话, type: 兴趣点类型, typecode: 兴趣点类型编码, 
                        website: 该POI的网址
                        """
                        save_dict = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_DETAIL)
                        photos = self.analysis.analysis_by_xpath(each_poi, xpahter=setting.XPATH_PHOTOS)
                        photo_info = ''
                        for each_photo in photos:
                            photo_dict = self.analysis.analysis_by_xpath(each_photo, xpahter=setting.XPATH_PHOTO_DETAIL)
                            photo_dict['title'] = photo_dict['title'] if photo_dict['title'] else 'no_title'
                            photo_info += '{0[title]}-{0[url]},'.format(photo_dict)
                        save_dict['photo_info'] = photo_info
                        # Stored field order
                        # id, name, pcode, pname, citycode, cityname, adcode, adname,
                        # address, alias, biz_type, business_area, discount_num, email,
                        # entr_location, exit_location, gridcode, groupbuy_num, indoor_data,
                        # indoor_map, location, navi_poiid, photo_info, postcode, tag, tel, type, typecode, website,
                        save_info = '{0[id]}\u0001{0[name]}\u0001{0[pcode]}\u0001{0[pname]}\u0001' \
                                    '{0[citycode]}\u0001{0[cityname]}\u0001{0[adcode]}\u0001{0[adname]}\u0001' \
                                    '{0[address]}\u0001{0[alias]}\u0001{0[biz_type]}\u0001{0[business_area]}\u0001' \
                                    '{0[discount_num]}\u0001{0[email]}\u0001{0[entr_location]}\u0001' \
                                    '{0[exit_location]}\u0001' \
                                    '{0[gridcode]}\u0001{0[groupbuy_num]}\u0001{0[indoor_data]}\u0001' \
                                    '{0[indoor_map]}\u0001' \
                                    '{0[location]}\u0001{0[navi_poiid]}\u0001{0[photo_info]}\u0001{0[postcode]}\u0001' \
                                    '{0[tag]}\u0001{0[tel]}\u0001{0[type]}\u0001{0[typecode]}\u0001' \
                                    '{0[website]}'.format(save_dict)
                        save_data.append(save_info)
                        time.sleep(5)
                self.pipe.pipe_txt_save(save_data, filename=save_filename, savetype='a')
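
    # The records written above are '\u0001'-delimited in the field order listed in the
    # comments of _engine_residential_area. The helper below is an illustrative sketch
    # (not part of the original code) showing how such a record can be loaded back into
    # a dict; the field list is copied from those comments.
    @staticmethod
    def _engine_parse_poi_record(line):
        fields = ['id', 'name', 'pcode', 'pname', 'citycode', 'cityname', 'adcode', 'adname',
                  'address', 'alias', 'biz_type', 'business_area', 'discount_num', 'email',
                  'entr_location', 'exit_location', 'gridcode', 'groupbuy_num', 'indoor_data',
                  'indoor_map', 'location', 'navi_poiid', 'photo_info', 'postcode', 'tag',
                  'tel', 'type', 'typecode', 'website']
        return dict(zip(fields, line.strip().split('\u0001')))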

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration.
        :return: proxies dict usable for HTTP/HTTPS requests
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {"host": proxy_host,
                                                                     "port": proxy_port,
                                                                     "user": proxy_user,
                                                                     "pass": proxy_pass}
        proxies = {"http": proxy_meta,
                   "https": proxy_meta}

        return proxies

    def run_engine(self):
        self._engine_residential_area()
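
# A standalone sketch (not part of the original class) of the paging pattern implemented in
# _engine_residential_area above: keep incrementing `page` until the API response yields no
# more POI entries. It assumes, as the class does, that setting.SEARCH_API is a GET endpoint
# taking key/types/city/page parameters and returning XML wrapped in <response>...</response>;
# `requests` stands in for the project's Crawl helper.
import re
import requests
from copy import deepcopy


def iter_poi_pages(city_id, type_id):
    params = deepcopy(setting.PARAMS)
    params.update({'key': setting.KEY, 'city': city_id, 'types': type_id, 'page': 0})
    while True:
        params['page'] += 1
        resp = requests.get(setting.SEARCH_API, params=params, timeout=30)
        match = re.search(r'<response>(.*?)</response>', resp.text, re.S)
        if not match:
            break
        yield match.group(1)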
Exemple #33
0
#encoding=utf-8
from crawl import Crawl


useridFile = open("userid.txt", 'r')
userid = useridFile.read().strip()
useridFile.close()

open("result.txt", 'w').close()

c = Crawl()

print "Job Started ...\n"
page = 1
url = c.host + '/'+userid+'/myfans?t=4&page=' + str(page)
while ( c.run(url) ):
    print "fans in page "+str(page)+"\n"
    page += 1 
    url = c.host + '/'+userid+'/myfans?t=4&page=' + str(page)

print "Done!\n"
Exemple #34
0
class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        Fetch the name and url link of every city; results are written to file_city_list.txt.
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_tactic_link(self):
        """
        Fetch the links of every guide (tactic) in each city.
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        tactic_check = self.pipe.pipe_pickle_load(
            filename=setting.FILE_TACTIC_CHECK)
        if not tactic_check:
            tactic_check = set([])
        for each_city in city_list:
            """
            http://travel.qunar.com/travelbook/list/22-城市拼音-城市id/
            hot(hot为热门游记,elite为精华游记,start为行程计划)_ctime(ctime为按最新发表排序,heat为热度排序)/页码.htm 
            """
            try:
                url = each_city.strip().split('\u0001')[1]
                name = each_city.strip().split('\u0001')[0]
                pattern = re.compile(r'p-cs(\d+)-(\w+)')
                city_pname = re.search(pattern, url).group(2)
                city_id = re.search(pattern, url).group(1)
                # Build the guide-list url (1. city pinyin name: city_pname, 2. city id: city_id, 3. category)
                tactic_type = ['hot', 'elite', 'start']  # guide categories; the script currently crawls only 'hot'
                tactic_url = setting.TACTIC_URL.format(city_pname, city_id,
                                                       tactic_type[0])
                current_page = 0
                maxpage = 200  # default maximum page count
                while True:
                    save_list = []
                    current_page += 1
                    content = self.crawl.crawl_by_get(
                        tactic_url + '{}.htm'.format(current_page),
                        headers=setting.HEADERS,
                        retry=2,
                        timeout=15,
                        proxies=self._engine_use_proxy())
                    if not content:
                        break
                    # Get the total number of pages
                    if current_page == 1:
                        # find the maximum page number via map()
                        pagecount = map(
                            lambda x: int(x) if x != '下一页>' else -1,
                            self.analysis.analysis_by_xpath(
                                content, xpahter=setting.XPATH_NEXTPAGE))
                        try:
                            maxpage = max(pagecount)
                        except:
                            break
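                        # e.g. (hypothetical) pager texts ['2', '3', '4', '下一页>'] map to
                        # [2, 3, 4, -1], so maxpage becomes 4; an empty pager raises and breaks.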
                    tactic_ids = self.analysis.analysis_by_xpath(
                        content, xpahter=setting.XPATH_ID)
                    for each_id in tactic_ids:
                        each_url = 'http://travel.qunar.com/youji/{}'.format(
                            each_id)
                        save_info = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            name, city_pname, city_id, each_id, each_url)
                        if each_id not in tactic_check:
                            save_list.append(save_info)
                            tactic_check.add(each_id)
                    if save_list:
                        self.pipe.pipe_txt_save(
                            save_list,
                            filename=setting.FILE_TACTIC_LIST,
                            savetype='a')
                    if current_page >= maxpage:
                        break
                    time.sleep(0.2)
            except:
                continue

    def _engine_tactic_info(self):
        """
        Fetch the detail data of every guide (tactic).
        :return:
        """
        tactic_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_TACTIC_LIST)
        for each_tactic in tactic_list:
            try:
                # guide record fields
                tactic_info = each_tactic.strip().split('\u0001')
                city_name = tactic_info[0]
                city_pname = tactic_info[1]
                city_id = tactic_info[2]
                tactic_id = tactic_info[3]
                tactic_url = tactic_info[4]
                # Fetch the guide detail page
                content = self.crawl.crawl_by_get(
                    tactic_url,
                    headers=setting.HEADERS,
                    proxies=self._engine_use_proxy(),
                    retry=3,
                    timeout=15)
                detail = self.analysis.analysis_by_xpath(
                    content, xpahter=setting.XPATH_TACTIC_DETAIL)
                detail['city_name'] = city_name
                detail['city_pname'] = city_pname
                detail['city_id'] = city_id
                detail['tactic_id'] = tactic_id
                detail['tactic_url'] = tactic_url
                detail['get_time'] = datetime.datetime.now().strftime(
                    '%Y-%m-%d %H:%M:%S')
                # Save the data
                # Field order:
                # city_name, city_pname, city_id,
                # tactic_id,title,author,
                # create_date,start_date,days,
                # avgs_price,person,play_type,
                # content,get_time, tactic_url
                save_data = '{0[city_name]}\u0001{0[city_pname]}\u0001{0[city_id]}\u0001' \
                            '{0[tactic_id]}\u0001{0[title]}\u0001{0[author]}\u0001' \
                            '{0[create_date]}\u0001{0[start_date]}\u0001{0[days]}\u0001' \
                            '{0[avgs_price]}\u0001{0[person]}\u0001{0[play_type]}\u0001' \
                            '{0[content]}\u0001{0[get_time]}\u0001{0[tactic_url]}\u0001'.format(detail)
                self.pipe.pipe_txt_save(save_data,
                                        filename=setting.FILE_TACTIC_INFO,
                                        savetype='a')
                # self.pipe.pipe_mongo_save(detail, dbname='db_qunaer', colname='col_shop_info')
                time.sleep(0.1)
            except Exception as e:
                print('crawl error', e)
                continue

    def _engine_tactic_comments(self):
        """
        Fetch the comment data of every guide (tactic).
        :return:
        """
        tactic_list = self.pipe.pipe_txt_load(
            filename=setting.FILE_TACTIC_LIST)
        # latest comment time of each shop

        for each_tactic in tactic_list:
            try:
                # guide record fields
                each_info = each_tactic.strip().split('\u0001')
                city_name = each_info[0]
                city_pname = each_info[1]
                city_id = each_info[2]
                tactic_id = each_info[3]
                tactic_url = each_info[4]
                setting.HEADERS_COMMENTS['Referer'] = tactic_url
                params = {
                    'bookId': tactic_id,  # guide id
                    'csrfToken':
                    'o7mGNaK63wbEaYFJTnDue14WX7sPlyXB',  # token fixed for now
                    'page': 0,  # page number
                    'pageSize': 30,  # items per page
                }
                while True:
                    params['page'] += 1
                    content = self.crawl.crawl_by_get(
                        setting.COMMENTS_API,
                        headers=setting.HEADERS_COMMENTS,
                        proxies=self._engine_use_proxy(),
                        params=params,
                        retry=2,
                        timeout=15)
                    try:
                        content_dict = json.loads(content)
                    except:
                        break
                    if not content_dict.get('data', {}).get('html'):
                        break
                    content_comments = content_dict.get('data', {}).get('html')
                    # the first pass determines the number of comment pages
                    elements_com = self.analysis.analysis_by_xpath(
                        content_comments, xpahter=setting.XPATH_COMMENTS_LI)
                    if not elements_com:
                        break
                    for each_element in elements_com:
                        ask_content = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ASK_CONTENT)
                        answer_content = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ANSWER_CONTENT)
                        ask_date = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ASK_DATE)
                        answer_date = self.analysis.analysis_by_xpath(
                            each_element,
                            xpahter=setting.XPATH_COMMENTS_ANSWER_DATE)

                        commetents_info = {
                            'city_name':
                            city_name,
                            'city_id':
                            city_id,
                            'tactic_id':
                            tactic_id,
                            'ask_content':
                            ask_content,
                            'answer_content':
                            answer_content,
                            'ask_date':
                            ask_date,
                            'answer_date':
                            answer_date,
                            'get_time':
                            datetime.datetime.now().strftime(
                                '%Y-%m-%d %H:%M:%S'),
                            'tactic_url':
                            tactic_url
                        }
                        for eachkey in commetents_info.keys():

                            if isinstance(commetents_info[eachkey], str):
                                commetents_info[eachkey] = commetents_info[eachkey]\
                                    .replace('\n', '').replace('\r', '').replace('\xa0', '')
                            elif isinstance(commetents_info[eachkey], list):
                                commetents_info[eachkey] = ''.join(commetents_info[eachkey])\
                                    .replace('\n', '').replace('\r', '')
                            # Save the data
                            # Field order:
                            # city_name, city_id, tactic_id,
                            # ask_content, answer_content, ask_date,
                            # answer_date, get_time, tactic_url,
                        save_data = '{0[city_name]}\u0001{0[city_id]}\u0001{0[tactic_id]}\u0001' \
                                    '{0[ask_content]}\u0001{0[answer_content]}\u0001{0[ask_date]}\u0001' \
                                    '{0[answer_date]}\u0001{0[get_time]}\u0001' \
                                    '{0[tactic_url]}\u0001'.format(commetents_info)
                        self.pipe.pipe_txt_save(
                            save_data,
                            filename=setting.FILE_TACTIC_COMMENTS,
                            savetype='a')
                        # self.pipe.pipe_mongo_save(commetents_info, dbname='db_qunaer', colname='col_shopping_comments')
            except:
                continue

    def _temp_city_info(self, cityname):
        """
        Temporary helper used during the 22-field data cleanup.
        :return:
        """
        citylist = self.pipe.pipe_txt_load(filename='city_list_total.txt')
        city_params = {
            '国别': '&',
            '省自治区全称': '&',
            '省自治区简称': '&',
            '市州全称': '&',
            '市州简称': '&',
            '区县全称': '&',
            '区县简称': '&',
            '地区编码': '&',
            '等级': '&'
        }
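        # Key reference (keys are kept in Chinese because they are emitted as output field
        # names): 国别 = country, 省自治区全称/简称 = province full/short name,
        # 市州全称/简称 = prefecture-level city full/short name, 区县全称/简称 = county/district
        # full/short name, 地区编码 = area code, 等级 = administrative level.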
        spec_city = {
            '北京': '110000',
            '天津': '120000',
            '上海': '310000',
            '重庆': '500000'
        }
        for each in citylist:
            cityinfo = each.split('\u0001')
            if cityname in cityinfo:
                site = cityinfo.index(cityname)
                if site == 4 or site == 5:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['区县全称'] = cityinfo[4].strip()
                    city_params['区县简称'] = cityinfo[5].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()
                    city_params['等级'] = '区县级'
                elif site == 2 or site == 3:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityinfo[0].strip()
                    city_params['省自治区简称'] = cityinfo[1].strip()
                    city_params['市州全称'] = cityinfo[2].strip()
                    city_params['市州简称'] = cityinfo[3].strip()
                    city_params['地区编码'] = cityinfo[-1].strip()[:-2] + '00'
                    city_params['等级'] = '地市级'
                elif cityname in ['北京', '重庆', '上海', '天津']:
                    city_params['国别'] = 'CN'
                    city_params['省自治区全称'] = cityname + '市'
                    city_params['省自治区简称'] = cityname
                    city_params['市州全称'] = cityname + '市'
                    city_params['市州简称'] = cityname
                    city_params['地区编码'] = spec_city[cityname]
                    city_params['等级'] = '直辖'
                break

        return city_params

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration.
        :return: proxies dict usable for HTTP/HTTPS requests
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def start_engine(self):
        self._engine_city_link()
        # Crawling strategy for this module: loop over the guides first; after each pass over the
        # guide list, crawl all comments of all guides and append them into a new text file
        self._engine_tactic_link()
        self._engine_tactic_info()
        self._engine_tactic_comments()
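
# Minimal usage sketch (assumed entry point; the original snippet does not include one):
# if __name__ == '__main__':
#     engine = Engine()
#     engine.start_engine()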
Exemple #35
0
class EngineSelenium:
    def __init__(self, name):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()
        self.options = webdriver.ChromeOptions()
        # set the default download directory
        prefs = {
            'profile.default_content_settings.popups': 0,
            'download.default_directory': os.path.abspath('DATA')
        }
        self.options.add_experimental_option('prefs', prefs)
        self.driver = webdriver.Chrome(chrome_options=self.options)
        self.realname = name
        self.name = str(name.encode('gbk'))[2:-1].replace('\\x', '%').upper()

    def _engine_get_trend(self):
        """
        Trend-research panel data.
        :return:
        """
        # Get the titles that the data corresponds to
        url = 'http://index.baidu.com/?tpl=trend&word={}'.format(self.name)
        self.driver.get(url)
        # wait for the page redirect
        time.sleep(5)
        content_page = self.driver.page_source
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=
            "substring-after(.//*[text()='搜索指数概况']/parent::div//*[@class='compInfo'][2], '至')"
        )
        end_date = page_str_date.strip()
        element_div_all = self.analysis.analysis_by_xpath(
            content_page, xpahter=setting.XPATH_DIV)
        element_dict = {
            "近7天": element_div_all[0:6],
            "近30天": element_div_all[6::]
        }
        for element_name, element_div in element_dict.items():
            ele_title = element_div[0:3]  # the first 3 elements are the titles
            ele_content = element_div[3:6]  # the last 3 elements hold the image data
            for i in range(3):
                if i == 0:
                    value_pic = {}
                    title = self.analysis.analysis_by_xpath(
                        ele_title[i], xpahter=setting.XPATH_TITLE)
                    element_pic = self.analysis.analysis_by_xpath(
                        ele_content[i],
                        xpahter=".//span[starts-with(@class, 'ftlwhf')]")
                    # =========== image handling
                    pic_url = self.analysis.analysis_by_xpath(
                        ele_content[i], xpahter=setting.XPATH_PIC)
                    if pic_url:
                        downurl = ''.join(pic_url)
                        try:
                            url = 'http://index.baidu.com' + re.search(
                                re.compile(r'url\("(.*?)"\)', re.S),
                                downurl).group(1)
                        except:
                            url = ''
                    # Visit the image download link
                    url_real = url.replace('amp;', '')
                    self.driver.get(url_real)
                    time.sleep(1)
                    # Read the downloaded image and recognize its content via the Java-side API
                    pic_code = self.pipe.pipe_pic_load(filename='下载')
                    # Delete the image
                    self.pipe.pipe_pic_del(filename='下载')
                    # =========== image handling
                    n = 1
                    titles = list(
                        map(lambda x: x.replace(' ', ''), title.split('|')))
                    for each in element_pic:
                        pic_info = self.analysis.analysis_by_xpath(
                            each, xpahter=".//span[@class='imgval']")
                        res_pic = []
                        for each_info in pic_info:
                            imgval = self.analysis.analysis_by_xpath(
                                each_info, xpahter="@style")
                            imgtxt = self.analysis.analysis_by_xpath(
                                each_info,
                                xpahter=".//*[@class='imgtxt']/@style")
                            pic_px = '{},{}'.format(
                                self._engine_tool_regex(imgval),
                                self._engine_tool_regex(imgtxt))
                            res_pic.append(
                                pic_px.replace('px', '').replace('-', ''))
                        value_pic[titles[n - 1]] = ';'.join(res_pic)
                        n += 1
                    # After recognition, emit the data; the binary image is base64-encoded here
                    for pic_name, pic_px in value_pic.items():
                        data = {
                            'data': base64.b64encode(pic_code),
                            'num1': pic_px,
                            'type': 'm'
                        }
                        pic_value = self.crawl.crawl_by_post(
                            url=setting.RECOGNITION_URL, data=data)
                        print(end_date, element_name, pic_name, pic_value)
                        save_data = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            self.realname, end_date, element_name, pic_name,
                            pic_value)
                        self.pipe.pipe_txt_save(
                            save_data,
                            filename=setting.FILE_TREND_ZSGK.format(
                                self.realname, end_date),
                            savetype='a')
                else:
                    title = self.analysis.analysis_by_xpath(
                        ele_title[i], xpahter=setting.XPATH_TITLE)
                    titles = title.replace(' ', '').split('|')
                    element_pic = self.analysis.analysis_by_xpath(
                        ele_content[i],
                        xpahter=".//span[starts-with(@class, 'ftlwhf')]")
                    pics = []
                    n = 1
                    for each in element_pic:
                        syboml = self.analysis.analysis_by_xpath(
                            each,
                            xpahter=".//*[starts-with(@class,'rat')]/text()")
                        pic_info = list(
                            map(
                                self._engine_tool_regex,
                                self.analysis.analysis_by_xpath(
                                    each, xpahter=".//*/i/@style")))
                        pic_px = list(
                            map(
                                lambda x: int(
                                    x.replace('-', '').replace('px', '')),
                                pic_info))
                        pic_value = ''.join(
                            list(
                                map(
                                    lambda x: '{:.0f}'.format(x / 8)
                                    if x != 80 else '%', pic_px)))
                        value = ''.join(syboml) + pic_value
                        pics.append(value)
                        n += 1
                    # data that can be emitted directly
                    current_pic = dict(zip(titles, pics))
                    for pic_name, pic_value in current_pic.items():
                        print(end_date, element_name, pic_name, pic_value)
                        save_data = '{}\u0001{}\u0001{}\u0001{}\u0001{}'.format(
                            self.realname, end_date, element_name, pic_name,
                            pic_value)
                        self.pipe.pipe_txt_save(
                            save_data,
                            filename=setting.FILE_TREND_ZSGK.format(
                                self.realname, end_date),
                            savetype='a')
        # Search-index trend
        content_page = self.driver.page_source
        # Get res and res2
        pattern_res = re.compile(r'res=(.*?)&', re.S)
        pattern_res2 = re.compile(r'res2=(.*?)&', re.S)
        res = re.search(pattern_res, content_page).group(1)
        res2 = re.search(pattern_res2, content_page).group(1)
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=
            "substring-after(.//*[text()='搜索指数趋势']/parent::div//*[@class='compInfo'][2], '至')"
        )
        page_date = datetime.datetime.strptime(page_str_date.strip(),
                                               '%Y-%m-%d')
        # Adjust the date range here
        startdate = (page_date -
                     datetime.timedelta(days=29)).strftime('%Y-%m-%d')
        enddate = page_date.strftime('%Y-%m-%d')
        # Build the url used to fetch the res3 parameter (res3 has to be requested separately)
        url_res3 = 'http://index.baidu.com/Interface/Search/getAllIndex' \
                   '/?res={}&res2={}&startdate={}&enddate={}'.format(res, res2, startdate, enddate)
        self.driver.get(url_res3)
        time.sleep(2)
        content_res3 = self.driver.page_source
        # The returned data contains the overall trend, the PC trend and the mobile trend
        pattern_res3 = re.compile(r'<body>(.*?)</body>', re.S)
        res3 = re.search(pattern_res3, content_res3).group(1)
        # Take the parameters of the 3 trends
        res3_dict = json.loads(res3)
        res3_data = res3_dict.get('data')
        if not res3_data:
            print('failed to get the res3 data of the search-index trend')
            return
        # all = overall trend, pc = PC trend, wise = mobile trend
        try:
            data_dict = {
                '整体趋势': res3_data.get('all')[0].get('userIndexes_enc'),
                'pc趋势': res3_data.get('pc')[0].get('userIndexes_enc'),
                '移动趋势': res3_data.get('wise')[0].get('userIndexes_enc')
            }
        except Exception as e:
            data_dict = {}
            print('error getting the corresponding res3 data: {}'.format(e))
        for name, current_res3 in data_dict.items():
            res3_list = current_res3.split(',')[::-1]
            timenow = int(time.time() * 1000)
            n = 0
            for each_res3 in res3_list:
                if n >= 7:
                    break
                trend_pic = {}
                # current date
                current_date = (
                    page_date -
                    datetime.timedelta(days=n)).strftime('%Y-%m-%d')
                url_trend_pic = 'http://index.baidu.com/Interface/IndexShow/show/?res={}&res2={}&classType=1&' \
                                'res3[]={}&className=view-value&{}'.format(res, res2, each_res3, timenow)
                self.driver.get(url_trend_pic)
                # switch to the page holding the image and wait briefly
                time.sleep(1)
                content_each_pic = self.driver.page_source
                # get the html that renders the image
                code = re.search(re.compile(r'"code":\[(.*?)\]', re.S),
                                 content_each_pic).group(1)
                deal_code = code.replace('\\',
                                         '').replace('&quot;', '').replace(
                                             '&lt;', '<').replace('&gt;',
                                                                  '>')[1:-1]
                # get the download link of the image
                url_current_pic = 'http://index.baidu.com' + re.search(
                    re.compile(r'url\("(.*?)"\)', re.S), deal_code).group(1)
                # visiting the url below downloads the image
                url_img = url_current_pic.replace('amp;', '')
                # download the image
                self.driver.get(url_img)
                time.sleep(0.5)
                # Read the downloaded image and recognize its content via the Java-side API
                pic_code = self.pipe.pipe_pic_load(filename='下载')
                # The download can fail; this may need adjusting later
                if not pic_code:
                    return
                # Delete the image
                self.pipe.pipe_pic_del(filename='下载')
                # ============== this image still needs to be processed
                element_span = self.analysis.analysis_by_xpath(
                    deal_code, xpahter=".//*/span")
                res_pic = []
                for each in element_span:
                    pic_info = self.analysis.analysis_by_xpath(
                        each, xpahter=".//@style")
                    pic_px = '{},{}'.format(
                        self._engine_tool_regex(pic_info[0]),
                        self._engine_tool_regex(pic_info[1]))
                    res_pic.append(pic_px.replace('px', '').replace('-', ''))
                    # attach the corresponding date? also the url
                trend_pic['date'] = current_date
                trend_pic['name'] = name
                data = {
                    'data': base64.b64encode(pic_code),
                    'num1': ';'.join(res_pic),
                    'type': 'm'
                }
                pic_value = self.crawl.crawl_by_post(
                    url=setting.RECOGNITION_URL, data=data)
                # output the data
                print(current_date, name, pic_value)
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    self.realname, current_date, name, pic_value)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_TREND_ZSQS.format(
                        self.realname, enddate),
                    savetype='a')
                n += 1

    def _engine_get_demand(self):
        """
        Demand-map panel data.
        :return:
        """
        url = 'http://index.baidu.com/?tpl=demand&word={}'.format(self.name)
        self.driver.get(url)
        time.sleep(6)
        content_page = self.driver.page_source
        # demand-map data
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=
            "substring-after(.//*[text()='需求图谱']/parent::div//*[@class='compInfo'][2], '至')"
        )
        end_date = page_str_date.strip()
        element_demand = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=".//*[@id='demand']//*[contains(@style,"
            "'text-anchor: middle')and not(contains(@fill,"
            "'#9a9a9a'))]")
        for each_demand in element_demand:
            text = self.analysis.analysis_by_xpath(
                each_demand, xpahter='.//descendant::text()')
            value_x = self.analysis.analysis_by_xpath(each_demand,
                                                      xpahter='.//@x')
            value_y = self.analysis.analysis_by_xpath(each_demand,
                                                      xpahter='.//@y')
            value_dy = self.analysis.analysis_by_xpath(each_demand,
                                                       xpahter='.//tspan/@dy')
            if text:
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    ''.join(text), ''.join(value_x), ''.join(value_y),
                    ''.join(value_dy))
                print('{}_{}_{}_{}'.format(''.join(text), ''.join(value_x),
                                           ''.join(value_y),
                                           ''.join(value_dy)))
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_DEMAND_XQTP.format(
                        self.realname, end_date),
                    savetype='a')

        # related-word classification data
        element_tab = self.analysis.analysis_by_xpath(
            content_page, xpahter=".//*[@id='tablelist']//*[@class='listN1']")
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=
            "substring-after(.//*[text()='相关词分类']/parent::div//*[@class='compInfo'][2], '至')"
        )
        enddate = page_str_date.strip()
        # left panel content (source related words)
        for i in range(0, 2):
            th = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//descendant::th/text()")
            title = ''.join(th)
            trs = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//*[@class='rank']/parent::tr")
            for each_tr in trs:
                rank = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='rank']/text()")
                words = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='hotWord']/text()")
                style = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='psBar']/@style")
                width = re.search(re.compile(r'width:(.*?);', re.S),
                                  ''.join(style)).group(1)
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    title, ''.join(rank), ''.join(words), width.strip())
                print(save_data)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_DEMAND_XGC.format(
                        self.realname, enddate))
        # right panel content (search index / fastest rising)
        for i in range(2, 4):
            th = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//descendant::th/text()")
            title = ''.join(th)
            trs = self.analysis.analysis_by_xpath(
                element_tab[i], xpahter=".//*[@class='rank']/parent::tr")
            for each_tr in trs:
                rank = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='rank']/text()")
                words = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='hotWord']/text()")
                num = self.analysis.analysis_by_xpath(
                    each_tr, xpahter="string(.//td[last()])")
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    title, ''.join(rank), ''.join(words), num.strip())
                print(save_data)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_DEMAND_XGC.format(
                        self.realname, enddate))

    def _engine_get_sentiment(self):
        """
        Sentiment (news) panel data.
        :return:
        """
        url = 'http://index.baidu.com/?tpl=trend&word={}'.format(self.name)
        self.driver.get(url)
        # news monitoring data
        time.sleep(2)
        content_page = self.driver.page_source
        # get res and res2
        pattern_res = re.compile(r'res=(.*?)&', re.S)
        pattern_res2 = re.compile(r'res2=(.*?)&', re.S)
        res = re.search(pattern_res, content_page).group(1)
        res2 = re.search(pattern_res2, content_page).group(1)
        # Use res/res2 to call the getPcFeedIndex API; res/res2 here come from the demand-map
        # panel, but that does not affect the results
        # feed (information) index API
        api_info = 'http://index.baidu.com/Interface/search/getPcFeedIndex/?res={}&res2={}&type=feed'.format(
            res, res2)
        # news (media) index API
        api_news = 'http://index.baidu.com/Interface/search/getNews/?res={}&res2={}&type=search'.format(
            res, res2)
        api_dict = {'资讯指数': api_info, '媒体指数': api_news}
        for api_name, api_url in api_dict.items():
            self.driver.get(api_url)
            content_data = self.driver.page_source
            # Request the unique id needed for the decryption key; the follow-up request must be
            # completed within a short window (roughly 10-20s), otherwise the returned key expires
            uniqid = re.search(re.compile(r'"uniqid":"(.*?)"', re.S),
                               content_data).group(1)
            # All the data lives here; it is the payload for "all data" selected on the page.
            # The amount can be narrowed later as needed; this payload has to be sliced
            userindexs = re.search(re.compile(r'"userIndexes":"(.*?)"', re.S),
                                   content_data).group(1)
            # period covered by the current data
            data_date = re.search(re.compile(r'"period":"\d+\|(\d+)"', re.S),
                                  content_data).group(1)
            # current search keyword
            name = re.search(re.compile(r'"key":"(.*?)",', re.S),
                             content_data).group(1)
            # The uniqid is needed to request the decryption key; the API below does that
            url_ptbk = 'http://index.baidu.com/Interface/api/ptbk?res={}&res2={}&uniqid={}'.format(
                res, res2, uniqid)
            self.driver.get(url_ptbk)
            content_pasw = self.driver.page_source
            # Get the returned decryption key
            pasw = re.search(re.compile(r'"data":"(.*?)"', re.S),
                             content_pasw).group(1)
            # Build the key into a dict; the entry whose value is ',' marks the record separator
            pasw_key = pasw[0:int(len(pasw) / 2)]
            pasw_value = pasw[int(len(pasw) / 2)::]
            pasw_dict = dict(zip(pasw_key, pasw_value))
            # Split the data
            for k, v in pasw_dict.items():
                if v == ',':
                    data_list = userindexs.split(k)
                    break
            # Process the data
            n = 1
            print(api_name)
            for each_data in data_list:
                current_time = (
                    datetime.datetime.strptime(data_date, '%Y%m%d') -
                    datetime.timedelta(days=len(data_list) -
                                       n)).strftime('%Y-%m-%d')
                each_value = ''
                for i in each_data:
                    each_value += pasw_dict[i]
                # current_time is the date, each_value the corresponding search volume
                save_data = '{}\u0001{}\u0001{}'.format(
                    api_name, current_time, each_value)
                print(save_data)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_SENTIMENT_XWZS.format(
                        self.realname, data_date),
                    savetype='a')
                n += 1
            time.sleep(2)

        # The news data at the bottom of the page
        url_news = 'http://index.baidu.com/?tpl=sentiment&word={}'.format(
            self.name)
        self.driver.get(url_news)
        time.sleep(6)
        content_news = self.driver.page_source
        # Take it straight from the page
        element_a = self.analysis.analysis_by_xpath(
            content_news,
            xpahter=".//*[starts-with(@class,'stmNews')]"
            "//*[@class='listN1']//*[starts-with(@class,"
            "'mhref')]/a")
        # news links and titles shown on the current page
        for each_ele in element_a:
            title = self.analysis.analysis_by_xpath(each_ele,
                                                    xpahter=".//@title")
            href = self.analysis.analysis_by_xpath(each_ele,
                                                   xpahter=".//@href")
            save_data = '{}\u0001{}'.format(''.join(title), ''.join(href))
            print(save_data)
            self.pipe.pipe_txt_save(
                save_data,
                filename=setting.FILE_SENTIMENT_NEWS.format(
                    self.realname, data_date),
                savetype='a')

    def _engine_get_crowd(self):
        """
        Audience-profile panel data.
        :return:
        """
        url = 'http://index.baidu.com/?tpl=crowd&word={}'.format(self.name)
        self.driver.get(url)
        time.sleep(6)
        content_page = self.driver.page_source
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=
            "substring-after(.//*[text()='地域分布']/parent::div//*[@class='compInfo'][2], '至')"
        )
        end_date = page_str_date.strip()
        # geographic distribution
        for name in ['省份', '城市', '区域']:
            element = self.driver.find_element_by_xpath(
                ".//*[text()='{}']".format(name))
            element.click()
            time.sleep(2)
            content_page = self.driver.page_source
            ele_trs = self.analysis.analysis_by_xpath(
                content_page,
                xpahter=
                ".//*[@class='tang-scrollpanel-content']//*[starts-with(@class,'items')]/descendant::tr"
            )
            # there are only 7 regions; the remaining rows belong to the cities
            if name == '区域':
                ele_trs = ele_trs[0:7]
            for each_tr in ele_trs:
                rank = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='scRank']/text()")
                cityname = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='scName']/text()")
                style = self.analysis.analysis_by_xpath(
                    each_tr, xpahter=".//*[@class='zbar'][1]/@style")
                width = re.search(re.compile(r'width:(.*?);', re.S),
                                  ''.join(style)).group(1)
                save_data = '{}\u0001{}\u0001{}\u0001{}'.format(
                    end_date, ''.join(rank), ''.join(cityname), ''.join(width))
                print(save_data)
                self.pipe.pipe_txt_save(
                    save_data,
                    filename=setting.FILE_CROWD_DYFB.format(
                        self.realname, end_date),
                    savetype='a')
        # demographic attributes
        content_page = self.driver.page_source
        page_str_date = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=
            "substring-after(.//*[text()='人群属性']/parent::div//*[@class='compInfo'][2], '至')"
        )
        enddate = page_str_date.strip()
        # age distribution
        age_height = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=".//*[@id='grp_social_l']//*["
            "@fill='#3ec7f5']/@height")
        # value = self.analysis.analysis_by_xpath(content_page, xpahter=".//*[@id='grp_social_l']//*[starts-with(@style,"
        #                                                             "'text-anchor: middle')]/descendant::text()")
        # compute the total
        total = reduce(lambda x, y: float(x) + float(y), age_height)
        # compute the percentage of each bucket
        percent = list(
            map(lambda x: '{:.2f}%'.format((float(x) / total) * 100),
                age_height))
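        # e.g. (hypothetical) bar heights ['30', '90', '60', '15', '5'] give total = 200 and
        # percent = ['15.00%', '45.00%', '30.00%', '7.50%', '2.50%']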
        # build the output; each key is hard-coded here
        age_dict = {
            '19岁及以下': percent[0],
            '20-29岁': percent[1],
            '30-39岁': percent[2],
            '40-49岁': percent[3],
            '50岁及以上': percent[4],
        }
        # gender distribution
        sex_height = self.analysis.analysis_by_xpath(
            content_page,
            xpahter=".//*[@id='grp_social_r']//*["
            "@fill='#3ec7f5']/@height")
        # compute the total
        total = reduce(lambda x, y: float(x) + float(y), sex_height)
        # compute the percentage of each bucket
        percent = list(
            map(lambda x: '{:.2f}%'.format((float(x) / total) * 100),
                sex_height))
        # build the output; each key is hard-coded here
        sex_dict = {'男': percent[0], '女': percent[1]}
        save_data = []
        for k, v in age_dict.items():
            save_info = '{}\u0001年龄分布\u0001{}\u0001{}'.format(enddate, k, v)
            save_data.append(save_info)
        for k1, v1 in sex_dict.items():
            save_info = '{}\u0001性别分布\u0001{}\u0001{}'.format(enddate, k1, v1)
            save_data.append(save_info)
        print(save_data)
        self.pipe.pipe_txt_save(save_data,
                                filename=setting.FILE_CROWD_RQSX.format(
                                    self.realname, enddate),
                                savetype='a')

    def _engine_do_login(self):
        """
        Handle login.
        :return:
        """
        login_url = 'http://index.baidu.com/'
        self.driver.get(login_url)
        element = self.driver.find_element_by_xpath(".//*[text()='登录']")
        element.click()
        time.sleep(5)
        element = self.driver.find_element_by_xpath(
            ".//*/input[@name='userName']")
        element.send_keys('daqbigdata')
        time.sleep(3)
        element = self.driver.find_element_by_xpath(
            ".//*/input[@name='password']")
        element.send_keys('daqsoft')
        time.sleep(1)
        element = self.driver.find_element_by_xpath(
            ".//*/input[@type='submit']")
        element.click()
        time.sleep(8)

    @staticmethod
    def _engine_tool_regex(str_data):
        """
        Extract data via regex.
        :return:
        """
        if isinstance(str_data, list):
            deal_data = ''.join(str_data)
        else:
            deal_data = str_data

        try:
            return re.search(re.compile(r':([-]{0,1}\d+px)', re.S),
                             deal_data).group(1)
        except:
            return
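        # e.g. (hypothetical) _engine_tool_regex('margin-left:-48px;') returns '-48px';
        # None is returned when no ':<number>px' fragment is present.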

    def run_engine(self):
        # log in first
        self._engine_do_login()
        self._engine_get_trend()
        self._engine_get_demand()
        self._engine_get_sentiment()
        self._engine_get_crowd()
        # finally, close the browser
        self.driver.close()
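
# A standalone sketch (hypothetical helper, not part of the original class) of the decoding
# step performed inline in _engine_get_sentiment above: the ptbk string returned by
# /Interface/api/ptbk is split in half, the first half being cipher characters and the second
# half their plaintext counterparts; the cipher character that decodes to ',' separates the
# daily values packed into userIndexes.
def decode_baidu_index(ptbk, user_indexes):
    half = len(ptbk) // 2
    mapping = dict(zip(ptbk[:half], ptbk[half:]))
    # the cipher character that decodes to ',' is the record separator
    separator = next(k for k, v in mapping.items() if v == ',')
    # decode each daily record character by character
    return [''.join(mapping[ch] for ch in record)
            for record in user_indexes.split(separator)]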
Exemple #36
0
class Engine:
    def __init__(self):
        self.crawl = Crawl()
        self.analysis = Analysis()
        self.pipe = Pipeline()

    def _engine_city_link(self):
        """
        Fetch the name and url link of every city; results are written to file_city_list.txt.
        :return:
        """
        content = self.crawl.crawl_by_get(setting.START_URL,
                                          headers=setting.HEADERS,
                                          proxies=self._engine_use_proxy())
        element_city = self.analysis.analysis_by_xpath(content,
                                                       setting.XPATH_CITY_A)
        city_list = []
        for each_element in element_city:
            city_name = self.analysis.analysis_by_xpath(
                each_element, setting.XPATH_CITY_NAME)
            city_url = self.analysis.analysis_by_xpath(each_element,
                                                       setting.XPATH_CITY_URL)
            city_list.append('{}\u0001{}'.format(''.join(city_name),
                                                 ''.join(city_url)))
        self.pipe.pipe_txt_save(city_list, filename=setting.FILE_CITY_LIST)

    def _engine_scenic_link(self):
        """
        Fetch the links of all popular scenic spots in each city.
        :return:
        """
        city_list = self.pipe.pipe_txt_load(filename=setting.FILE_CITY_LIST)
        for each_city in city_list:
            url = each_city.strip().split('\u0001')[1] + '-jingdian'
            city_name = each_city.strip().split('\u0001')[0]
            content = self.crawl.crawl_by_get(url,
                                              headers=setting.HEADERS,
                                              proxies=self._engine_use_proxy(),
                                              retry=2,
                                              timeout=15)
            element_a = self.analysis.analysis_by_xpath(
                content, xpahter=setting.XPATH_HOT_A)
            save_list = []
            for each_ele in element_a:
                scenic_full_name = self.analysis.analysis_by_xpath(
                    each_ele, xpahter=setting.XPATH_HOT_NAME)
                current_url = self.analysis.analysis_by_xpath(
                    each_ele, xpahter=setting.XPATH_HOT_HREF)
                scenic_name = ''.join(scenic_full_name).replace('旅游攻略', '')
                scenic_url = ''.join(current_url)
                scenic_id = re.search(re.compile(r'p-oi(\d+)-'),
                                      scenic_url).group(1)
                # Stored fields
                # city_name, scenic_id, scenic_name, scenic_url
                save_info = '{}\u0001{}\u0001{}\u0001{}'.format(
                    city_name, scenic_id, scenic_name, scenic_url)
                save_list.append(save_info)
            self.pipe.pipe_txt_save(save_list,
                                    filename=setting.FILE_SCENIC_LIST,
                                    savetype='a')

    @staticmethod
    def _engine_use_proxy():
        """
        Build the proxy configuration.
        :return: proxies dict usable for HTTP/HTTPS requests
        """
        proxy_host = "proxy.abuyun.com"
        proxy_port = "9010"
        proxy_user = "******"
        proxy_pass = "******"
        proxy_meta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
            "host": proxy_host,
            "port": proxy_port,
            "user": proxy_user,
            "pass": proxy_pass
        }
        proxies = {"http": proxy_meta, "https": proxy_meta}

        return proxies

    def start_engine(self):
        self._engine_city_link()
        self._engine_scenic_link()
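
# Illustrative usage sketch (not part of the original class): the proxies dict returned by
# _engine_use_proxy() is in the format accepted by the `requests` library, which the project's
# Crawl helper is assumed to wrap.
if __name__ == '__main__':
    import requests

    proxies = Engine._engine_use_proxy()
    # any plain requests call can be routed through the same proxy configuration
    response = requests.get('http://travel.qunar.com/', proxies=proxies, timeout=15)
    print(response.status_code)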
Exemple #37
0
#encoding=utf-8
from crawl import Crawl

c = Crawl()
url = "/345920104"
c.run(c.host+url)

print "Done!\n"
Exemple #38
0
 def __init__(self):
     Crawl.__init__(self, INIT_URL, SKIP_URL, USE_TOR)
     #select collection
     self.mongo_collection = self.mongo_conn['cdiscount_product']
Exemple #39
0
def run():
    crawler = Crawl()
    vips = crawler.all_come_to_bowl()
    print_vips(vips)