def task_tr_click(url):
    """Build the 30-day crawl histogram and task record for a wanFang task.

    Counts ``wanFang.paperinfo`` documents whose ``url`` equals *url*,
    grouped by the part of their ``spidertime`` string before the first
    space, for the last 30 calendar days (oldest first, today last).

    Returns a dict with ``xy_value`` (``x_value`` day strings and the
    matching ``y_value`` counts, 0 for days without documents) and
    ``content`` (the first ``wanFang.spider`` document for *url* without
    ``_id``, or an empty list when there is none).
    """
    papers = mongoConnection.mongoConnection(db='wanFang',
                                             collection='paperinfo')
    stamps = papers.collection.find({'url': url}, {'spidertime': 1})
    per_day = collections.Counter(
        str(doc['spidertime'].split(' ')[0]) for doc in stamps)

    # Day labels, oldest first.
    today = datetime.date.today()
    step = datetime.timedelta(days=1)
    x_value = [str(today - step * back) for back in range(29, -1, -1)]
    y_value = [per_day.get(day, 0) for day in x_value]

    spiders = mongoConnection.mongoConnection(db='wanFang',
                                              collection='spider')
    content = list(spiders.collection.find({'url': url}, {'_id': 0}))
    if content:
        content = content[0]
    return {
        'xy_value': {'y_value': y_value, 'x_value': x_value},
        'content': content,
    }
def already_exist_data():
    """Return every patent spider task annotated with its patent count.

    Each ``patent.spider`` document has its ``_id`` replaced by its string
    form and gains a ``count`` key: the number of ``patent.patentinfo``
    documents whose ``spider_id`` equals that string.
    """
    task_db = mongoConnection.mongoConnection(db='patent',
                                              collection='spider')
    tasks = list(task_db.collection.find({}))

    patent_db = mongoConnection.mongoConnection(db='patent',
                                                collection='patentinfo')
    for task in tasks:
        task_id = str(task['_id'])
        task['_id'] = task_id
        matches = patent_db.collection.find({'spider_id': task_id},
                                            {'_id': 0})
        task['count'] = matches.count()
    return tasks
def api_delete_task(url):
    """Delete a wanFang task and the papers stored for it.

    Removes the documents whose ``url`` equals *url* from ``wanFang.spider``
    and then from ``wanFang.paperinfo``.

    Returns 0 when both removals complete and -1 if any exception is raised.
    """
    try:
        # Task record first, then the papers that reference the same url.
        for name in ('spider', 'paperinfo'):
            store = mongoConnection.mongoConnection(db='wanFang',
                                                    collection=name)
            store.collection.remove({'url': url})
    except Exception:
        return -1
    return 0
def patent_delete_task(spider_id):
    """Delete a patent task and the patents stored for it.

    Removes the ``patent.spider`` document whose ``_id`` is
    ``ObjectId(spider_id)``, then the ``patent.patentinfo`` documents whose
    ``spider_id`` equals the *spider_id* string.

    Returns 0 when both removals complete and -1 if any exception is raised
    (including a failure to build the ``ObjectId``).
    """
    try:
        tasks = mongoConnection.mongoConnection(db='patent',
                                                collection='spider')
        tasks.collection.remove({'_id': ObjectId(spider_id)})

        patents = mongoConnection.mongoConnection(db='patent',
                                                  collection='patentinfo')
        patents.collection.remove({'spider_id': spider_id})
    except Exception:
        return -1
    else:
        return 0
def iqiyi_url_spider(content, site='iqiyi', socketio=None):
    """Crawl the result pages of one video task and store every hit.

    Reads ``time_limit`` and ``length`` from the ``video.spider`` document
    matching *content* and *site*, asks ``get_page_nums`` for the total,
    converts it to a page count (20 per page, at most 20 pages) and inserts
    each record returned by ``get_page_info`` into ``video.urlinfo``.  After
    a successful insert the task's ``inactive`` flag is set to 0.

    Args:
        content: search keyword of the task.
        site: site name stored on the task document.
        socketio: optional Socket.IO server used for progress messages.
            When it is ``None`` the crawl runs silently; records are stored
            either way.

    Raises:
        IndexError: if no spider document matches *content* and *site*.
    """
    spider_db = mongoConnection.mongoConnection(db='video',
                                                collection='spider')
    data = list(
        spider_db.collection.find({
            'content': content,
            'site': site
        }, {
            'time_limit': 1,
            '_id': 0,
            'length': 1
        }))[0]
    print(data)
    # 'time_limt' (sic) is the keyword name these helpers are called with.
    page_num = get_page_nums(content,
                             site,
                             time_limt=data['time_limit'],
                             length=data['length'])
    if socketio:
        socketio.emit('my_response', {'data': '总数为:' + str(page_num)},
                      namespace='/video')
    # Round the total up to whole pages of 20 and never fetch more than 20.
    page_num = min((page_num + 19) // 20, 20)

    url_db = mongoConnection.mongoConnection(db='video', collection='urlinfo')
    for index in range(1, page_num + 1):
        result = get_page_info(content,
                               site,
                               time_limt=data['time_limit'],
                               length=data['length'],
                               pagenum=index)
        for line in result:
            # Progress reporting is optional; storing the record is not.
            if socketio:
                socketio.emit('my_response', {
                    'data': 'Currently crawling title is: ' + line['videoname']
                },
                              namespace='/video')
                socketio.sleep(1)
            try:
                print(line)
                url_db.collection.insert(line)
                url_db.db['spider'].update(
                    {
                        'site': site,
                        'content': content
                    }, {'$set': {
                        'inactive': 0
                    }})
            except Exception as e:
                # Best effort: a failed insert must not abort the crawl.
                logger.debug(e)

    # Only talk to the client when there is one; socketio defaults to None.
    if socketio:
        socketio.emit('my_response', {'data': '已完成'}, namespace='/video')
        socketio.emit('disconnect', {'data': 'disconnect'},
                      namespace='/video')
def task_tr_click(spider_id):
    """Build the 30-day crawl histogram and patent list for a patent task.

    Counts ``patent.patentinfo`` documents whose ``spider_id`` equals
    *spider_id*, grouped by the part of ``spidertime`` before the first
    space, for the last 30 calendar days (oldest first, today last).

    Returns a dict with ``xy_value`` (``x_value`` day strings and matching
    ``y_value`` counts) and ``content`` (all matching documents without
    ``_id``; any ``institution`` value that is a list is de-duplicated).
    """
    store = mongoConnection.mongoConnection(db='patent',
                                            collection='patentinfo')
    stamps = store.collection.find({'spider_id': spider_id},
                                   {'spidertime': 1})
    per_day = collections.Counter(
        str(doc['spidertime'].split(' ')[0]) for doc in stamps)

    # Day labels, oldest first.
    today = datetime.date.today()
    step = datetime.timedelta(days=1)
    x_value = [str(today - step * back) for back in range(29, -1, -1)]
    y_value = [per_day.get(day, 0) for day in x_value]

    content = list(store.collection.find({'spider_id': spider_id},
                                         {'_id': 0}))
    for patent in content:
        institution = patent['institution']
        if isinstance(institution, list):
            # Going through a set drops duplicates; order is not preserved.
            patent['institution'] = list(set(institution))
    return {
        'xy_value': {'y_value': y_value, 'x_value': x_value},
        'content': content,
    }
def store_spider(**kwargs):
    """Insert one video spider task per requested site.

    Reads ``content``, ``site``, ``feq``, ``length``, ``time_limit`` and
    ``time`` from *kwargs*.  ``site`` may be a single value or a list; one
    task document is created per site with ``status`` 1, ``inactive`` 0 and
    ``last_time`` equal to ``time``.

    Returns the result of ``insert_many``, or ``None`` (after printing the
    error) when anything raises.
    """
    try:
        mongoDB = mongoConnection.mongoConnection(db='video',
                                                  collection='spider')
        sites = kwargs['site']
        if not isinstance(sites, list):
            sites = [sites]

        tasks = []
        for site in sites:
            tasks.append({
                'content': kwargs['content'],
                'status': 1,
                'site': site,
                'feq': kwargs['feq'],
                'length': kwargs['length'],
                'time_limit': kwargs['time_limit'],
                'time': kwargs['time'],
                'last_time': kwargs['time'],
                'inactive': 0
            })
        return mongoDB.collection.insert_many(tasks, ordered=False)
    except Exception as err:
        print(err)
        return None
def youtube_url_spider(content, socketio=None):
    """Fetch the YouTube search-results page for *content*.

    Returns ``False`` when the HTTP status is not 200.  Otherwise the page
    is parsed and a connection to ``video.spider`` is opened, and the
    function ends without an explicit return value.  Neither the parsed
    document nor the connection is used further in the visible code, and
    *socketio* is not used.
    """
    response = requests.get(
        'https://www.youtube.com/results?search_query=' + content)
    if response.status_code != 200:
        return False
    doc = html.fromstring(response.text)
    mongoDB = mongoConnection.mongoConnection(db='video', collection='spider')
def api_delete_task(content, site, time):
    """Delete a video task and the video records stored for it.

    Removes the ``video.spider`` documents matching *content*, *site* and
    *time*, then the ``video.urlinfo`` documents matching *content* and
    *site*.

    Returns 0 when both removals complete and -1 if any exception is raised.
    """
    try:
        tasks = mongoConnection.mongoConnection(db='video',
                                                collection='spider')
        tasks.collection.remove({
            'content': content,
            'site': site,
            'time': time
        })

        records = mongoConnection.mongoConnection(db='video',
                                                  collection='urlinfo')
        records.collection.remove({'content': content, 'site': site})
    except Exception:
        return -1
    else:
        return 0
def api_video_detail(content, site):
    """Return all stored video records for one (content, site) task.

    Returns ``{'info': [...]}`` where the list holds every ``video.urlinfo``
    document matching *content* and *site*, with ``_id`` excluded.
    """
    store = mongoConnection.mongoConnection(db='video', collection='urlinfo')
    query = {'content': content, 'site': site}
    return {'info': list(store.collection.find(query, {'_id': 0}))}
def get_bilibili_info(args, socketio=None):
    """Fetch one result page, extract its video entries and store them.

    *args* is indexed as ``(url, content)``.  The response at ``url`` is
    expected to be JSON with an ``html`` field; titles, links, descriptions,
    play counts, show times and author-space links are extracted from that
    HTML (author links from the raw response text) and inserted into
    ``video.urlinfo``.

    Returns ``None`` in every case.  A non-200 response stops early, and any
    exception is logged at debug level and swallowed.  *socketio* is unused.
    """
    try:
        url = args[0]
        logger.info(url)
        content = args[1]

        response = requests.get(url)
        if response.status_code != 200:
            return None
        payload = response.json()
        doc = html.fromstring(payload['html'])

        # Information extraction
        urls = doc.xpath("//li/a/@href")
        names = doc.xpath("//li/a/@title")
        infos = doc.xpath("//li/div/div[@class='des hide']/text()")
        # Only every second text node (indexes 1, 3, ...) is kept.
        watch_nodes = doc.xpath(
            "//li/div/div[@class='tags']/span[@class='so-icon watch-num']/text()"
        )
        playtimes = [node.strip() for node in watch_nodes[1::2]]
        time_nodes = doc.xpath(
            "//li/div/div[@class='tags']/span[@class='so-icon time']/text()"
        )
        showtimes = [node.strip() for node in time_nodes[1::2]]
        authors_sites = re.compile(
            r'http://space.bilibili.com/[0-9]{0,}').findall(response.text)

        # Normalise play counts: a trailing '万' multiplies by 10000 and a
        # trailing '-' counts as 0; everything else is parsed as an int.
        times = []
        for raw in playtimes:
            suffix = raw[-1]
            if suffix == u'万':
                times.append(int(float(raw[:-1]) * 10000))
            elif suffix == '-':
                times.append(0)
            else:
                times.append(int(raw))

        # NOTE(review): "site" is stored as "sina" although this function
        # scrapes bilibili pages - confirm whether that is intentional.
        result = []
        for name, link, shown, info, played, author in zip(
                names, urls, showtimes, infos, times, authors_sites):
            result.append({
                "videoname": name.strip(),
                "url": link,
                "showtime": shown,
                "videoinfo": info.strip(),
                "playtimes": played,
                "spidertime": time.strftime('%Y-%m-%d %X', time.localtime()),
                "site": "sina",
                "content": content,
                "status": 1,
                "authors_site": author
            })

        store = mongoConnection.mongoConnection(db='video',
                                                collection='urlinfo')
        store.collection.insert_many(result, ordered=False)
    except Exception as e:
        logger.debug(e)
def auto_click(id, socketio=None, proxy=False):
    """Load one patent spider task by id and run ``click`` on it.

    Looks up the ``patent.spider`` document whose ``_id`` is
    ``ObjectId(id)``, optionally reports the module-level ``URL`` to the
    client, then calls ``click(URL, task, socketio=..., proxy=...)``.

    Raises:
        IndexError: if no task with that id exists.
    """
    store = mongoConnection.mongoConnection(db='patent', collection='spider')
    matches = list(store.collection.find({'_id': ObjectId(id)}))
    task = matches[0]

    target = URL
    if socketio:
        socketio.emit('my_response', {'data': URL}, namespace='/patent')
        socketio.sleep(1)
    click(target, task, socketio=socketio, proxy=proxy)
def get_collections(**kwargs):
    """List the collections of a database together with their sizes.

    *kwargs* are passed straight to ``mongoConnection.mongoConnection``.

    Returns a list of ``{'collection': name, 'count': n}`` dicts, one per
    collection in the connected database.  The ``system.indexes``
    bookkeeping collection is left out when present.
    """
    conn = mongoConnection.mongoConnection(**kwargs)
    # Filter rather than list.remove(): 'system.indexes' does not exist on
    # every server, and remove() raises ValueError when the name is absent.
    names = [
        name for name in conn.db.collection_names()
        if name != 'system.indexes'
    ]
    return [{
        'collection': name,
        'count': conn.db[name].count()
    } for name in names]
def already_exist_data():
    """Return every wanFang spider task annotated with its paper count.

    Each task dict carries whichever of ``last_time``, ``url``, ``time``
    and ``feq`` the ``wanFang.spider`` document has (``_id`` excluded),
    plus ``count``: the number of ``wanFang.paperinfo`` documents with
    the same ``url``.
    """
    tasks_db = mongoConnection.mongoConnection(db='wanFang',
                                               collection='spider')
    wanted = {'last_time': 1, 'url': 1, 'time': 1, 'feq': 1, '_id': 0}
    tasks = list(tasks_db.collection.find({}, wanted))

    papers_db = mongoConnection.mongoConnection(db='wanFang',
                                                collection='paperinfo')
    for task in tasks:
        matches = papers_db.collection.find({'url': task['url']})
        task['count'] = matches.count()
    return tasks
def auto_run():
    """Run ``click(content, site)`` for every stored video spider task."""
    store = mongoConnection.mongoConnection(db='video', collection='spider')
    rows = list(store.collection.find({}, {
        'content': 1,
        'site': 1,
        '_id': 0
    }))
    # Resolve all (content, site) pairs before the first crawl starts.
    pairs = [(row['content'], row['site']) for row in rows]
    for content, site in pairs:
        click(content, site)
def patent_add_item(**kwargs):
    """Insert a new patent spider task built from the keyword arguments.

    The *kwargs* dict is stored as-is as one ``patent.spider`` document.

    Returns whatever ``collection.insert`` returns, or ``None`` (after
    printing the error) when anything raises.
    """
    try:
        store = mongoConnection.mongoConnection(db='patent',
                                                collection='spider')
        return store.collection.insert(kwargs)
    except Exception as err:
        print(err)
        return None
def modify_task(**kwargs):
    """Modify an existing wanFang spider task.

    ``kwargs['url']`` identifies the current task; the new url is rebuilt
    from *kwargs* with ``url_constract``.

    Returns ``None`` without changing anything when a task with the new url
    already exists and the stored ``feq`` equals ``kwargs['feq']``.
    Otherwise returns 0 after either inserting *kwargs* (with the new url)
    as a new task when the url changed, or updating ``feq`` in place when
    it did not.

    Raises:
        IndexError: if no task with the original url exists.
    """
    old_url = kwargs['url']
    new_url = url_constract(**kwargs)

    store = mongoConnection.mongoConnection(db='wanFang', collection='spider')
    current = list(
        store.collection.find({'url': old_url}, {
            'feq': 1,
            '_id': 0
        }))[0]
    existing = store.collection.find({'url': new_url}).count()
    if existing != 0 and current['feq'] == kwargs['feq']:
        return None

    kwargs['url'] = new_url
    if new_url == old_url:
        store.collection.update({'url': new_url},
                                {"$set": {
                                    "feq": kwargs['feq']
                                }})
    else:
        store.collection.insert(kwargs)
        # The old task is deliberately not removed here: deleting it would
        # leave its stored papers dangling, so it is kept for now.
    return 0
def add_item(**kwargs):
    """Insert a new wanFang spider task built from the keyword arguments.

    The task url is derived from *kwargs* with ``url_constract`` and stored
    under the ``url`` key before the dict is inserted into
    ``wanFang.spider``.

    Returns whatever ``collection.insert`` returns, or ``None`` (after
    printing the error) when anything raises.
    """
    try:
        store = mongoConnection.mongoConnection(db='wanFang',
                                                collection='spider')
        kwargs['url'] = url_constract(**kwargs)
        return store.collection.insert(kwargs)
    except Exception as err:
        print(err)
        return None
def click(content, site, socketio=None, proxy=False):
    """Mark a video task as running and start its crawl.

    Sets ``inactive`` to 1 and ``last_time`` to the current local time on
    the ``video.spider`` document matching *site* and *content*, then calls
    ``url_spider`` with ``{'content': ..., 'site': ...}``.  *proxy* is
    accepted but not used.
    """
    logger.info(content + ':' + site)

    store = mongoConnection.mongoConnection(db='video', collection='spider')
    started = time.strftime('%Y-%m-%d %X', time.localtime())
    store.collection.update({
        'site': site,
        'content': content
    }, {'$set': {
        'inactive': 1,
        'last_time': started
    }})

    url_spider({'content': content, 'site': site}, socketio=socketio)
def main():
    """Command-line entry point for the patent spider.

    Options:
        -t/--type     ``click`` crawls one task, ``auto`` runs ``auto_run``.
        -c/--content  id of the ``patent.spider`` task (``click`` only).

    The task is looked up only in ``click`` mode, so ``auto`` works without
    ``-c``.  A missing or unknown id is reported through the argument
    parser, which exits with a usage error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--type', dest='type')
    parser.add_argument('-c', '--content', dest='content')
    args = parser.parse_args()

    if args.type == 'click':
        spider_id = args.content
        if spider_id is None:
            parser.error('-c/--content is required with -t click')
        # Database connection
        mongo = mongoConnection.mongoConnection(db='patent',
                                                collection='spider')
        matches = list(mongo.collection.find({'_id': ObjectId(spider_id)}))
        if not matches:
            parser.error('no spider task found with id ' + str(spider_id))
        click(URL, matches[0])
    elif args.type == 'auto':
        auto_run()
    logger.info('Success: Task update finshed..')
def get_info(url):
    """Fetch one JSON result page and store its video entries.

    ``socketio`` and ``content`` are not parameters; they are read from the
    surrounding scope.  Each item of the response's ``list`` array becomes
    one ``video.urlinfo`` document with markup stripped from ``videoname``.
    After a successful insert the matching ``video.spider`` task (site
    ``sina``) has ``inactive`` set to 0.

    Returns ``None`` when the HTTP status is not 200, otherwise 0.  Insert
    and update errors are logged at debug level and do not change the
    return value.
    """
    notice = 'Currently crawling web pages is: ' + url
    logger.info(notice)
    if socketio:
        socketio.emit('my_response', {'data': notice}, namespace='/video')
        socketio.sleep(1)

    response = requests.get(url)
    if response.status_code != 200:
        return None

    tag = re.compile(r'(<.*?>)')
    result = []
    for item in response.json()["list"]:
        result.append({
            "videoname": tag.sub("", item.get("videoname")),
            "url": item.get("url"),
            "showtime": item.get("showtime"),
            "videoinfo": item.get("videoinfo"),
            "playtimes": item.get("playtimes"),
            "spidertime": time.strftime('%Y-%m-%d %X', time.localtime()),
            "site": "sina",
            "content": content,
            "status": 1
        })

    store = mongoConnection.mongoConnection(db='video', collection='urlinfo')
    try:
        store.collection.insert_many(result, ordered=False)
        store.db['spider'].update({
            'site': 'sina',
            'content': content
        }, {'$set': {
            'inactive': 0
        }})
    except Exception as e:
        logger.debug(e)
    return 0
def paper_list(url):
    """Return the papers stored for the task identified by *url*.

    Fetches ``spidertime``, ``title``, ``link``, ``abstract``, ``authors``
    and ``date`` of every ``wanFang.paperinfo`` document with that url and
    flattens three fields in place: ``authors`` becomes the list of its
    keys, ``abstract`` becomes its ``'Chinese'`` entry, and a truthy
    ``date`` becomes its ``'year'`` entry.

    Returns a list of dicts.
    """
    store = mongoConnection.mongoConnection(db='wanFang',
                                            collection='paperinfo')
    projection = {
        '_id': 0,
        'spidertime': 1,
        'title': 1,
        'link': 1,
        'abstract': 1,
        'authors': 1,
        'date': 1
    }
    papers = list(store.collection.find({'url': url}, projection))
    for paper in papers:
        paper['authors'] = list(paper['authors'].keys())
        paper['abstract'] = paper['abstract']['Chinese']
        published = paper['date']
        if published:
            paper['date'] = published.get('year')
    return papers
def store(patents, spider_id):
    """Store one page of scraped patents in ``patent.patentinfo``.

    *patents* maps column names (``titles``, ``t_ids``, ``o_ids``,
    ``o_dates``, ``authors``, ``institutions``, ``proxies``,
    ``proxy_insititutions``, ``icp_ids``, ``r_dates``) to parallel
    sequences.  The columns are zipped into one document per patent,
    stamped with the current local time and *spider_id*.

    Returns the result of ``insert_many``.
    """
    column_names = ("titles", "t_ids", "o_ids", "o_dates", "authors",
                    "institutions", "proxies", "proxy_insititutions",
                    "icp_ids", "r_dates")
    columns = [patents[name] for name in column_names]

    documents = []
    for (title, t_id, o_id, o_date, author, institution, proxie,
         proxy_insititution, icp_id, r_date) in zip(*columns):
        documents.append({
            "title": title.strip(),
            "t_id": t_id,
            "o_id": o_id,
            "o_date": o_date.strip(),
            "author": author,
            "spidertime": time.strftime('%Y-%m-%d %X', time.localtime()),
            "institution": institution,
            "proxie": proxie,
            "proxy_insititution": proxy_insititution,
            "icp_id": icp_id,
            "spider_id": spider_id,
            "r_date": r_date
        })

    target = mongoConnection.mongoConnection(db='patent',
                                             collection='patentinfo')
    return target.collection.insert_many(documents, ordered=False)
def api_video_gragh(content, site):
    """Build the 120-day crawl histogram for one video task.

    Counts ``video.urlinfo`` documents matching *content* and *site*,
    grouped by the part of ``spidertime`` before the first space, for the
    last 120 calendar days (oldest first, today last).

    Returns a dict with ``xy_value`` (``x_value`` day strings and matching
    ``y_value`` counts) and ``content`` (the *content* argument, unchanged).
    """
    store = mongoConnection.mongoConnection(db='video', collection='urlinfo')
    stamps = store.collection.find({
        'content': content,
        'site': site
    }, {'spidertime': 1})
    per_day = collections.Counter(
        str(doc['spidertime'].split(' ')[0]) for doc in stamps)

    # Display window: 120 days, oldest first.
    today = datetime.date.today()
    step = datetime.timedelta(days=1)
    x_value = [str(today - step * back) for back in range(119, -1, -1)]
    y_value = [per_day.get(day, 0) for day in x_value]

    return {
        'xy_value': {'y_value': y_value, 'x_value': x_value},
        'content': content,
    }
def click(url, content, socketio=None, proxy=False):
    """Crawl every result page of one patent task and store the patents.

    Args:
        url: target address passed to ``get_page_nums`` and ``get_patent``.
        content: the spider task document; ``content['_id']`` is stored
            with every patent as ``spider_id``.
        socketio: optional Socket.IO server used for progress messages.
        proxy: when true, a proxy list is fetched with ``fproxy.fetch_all``
            and one proxy is chosen at random for each request.

    Returns ``None``.  When ``get_page_nums`` yields a falsy value and a
    socketio client is attached, the failure is reported and the function
    returns early.

    NOTE(review): the nesting below was reconstructed from a flattened
    source - confirm it against the original file.
    """
    # A single None entry makes random.choice() yield None when no proxy
    # list is fetched.
    proxies = [None]
    if proxy:
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response', {'data': '免费代理获取中 \n这可能花费几分钟,请稍后...'},
                          namespace='/patent')
        proxies = fproxy.fetch_all()
        proxies = [{'http': 'http://' + x} for x in proxies]
        if socketio:
            socketio.sleep(1)
            socketio.emit('my_response', {'data': '免费代理获取中 \n这可能花费几分钟,请稍后...'},
                          namespace='/patent')
    form = form_produce(content)
    num = get_page_nums(url, form)
    logger.info(num)
    mongo = mongoConnection.mongoConnection(db='patent',
                                            collection='patentinfo')
    i = 1
    # Connection failure is only reported (and the crawl aborted) when a
    # client is attached; without socketio execution continues to the loop.
    if not num and socketio:
        socketio.emit('my_response', {'data': '目标网站连接失败,请稍后重试!'},
                      namespace='/patent')
        socketio.emit('disconnect', {'data': 'disconnect'},
                      namespace='/patent')
        return
    # i advances by 50 per iteration - presumably 50 records per request;
    # verify against form_produce.
    while i <= num:
        failed_tag = 0
        attempt = 0
        form = form_produce(content, i)
        proxie = random.choice(proxies)
        patents = get_patent(url, form, proxie)
        # Retry while get_patent returns None.
        while patents is None:
            logger.debug('失败次数为:' + str(attempt + 1) + str(failed_tag))
            failed_tag += 1
            attempt += 1
            # Give up on this page after three failed attempts.
            if attempt % 3 == 0:
                attempt = 0
                break
            # Refresh the proxy list on every tenth failure.
            if failed_tag % 10 == 0:
                logger.info("抓取新代理,请稍等")
                if socketio:
                    socketio.sleep(1)
                    socketio.emit('my_response', {'data': '抓取新代理,请稍等'},
                                  namespace='/patent')
                proxies = fproxy.fetch_all()
                proxies = [{'http': 'http://' + x} for x in proxies]
            proxie = random.choice(proxies)
            # print('new proxy ip:', proxie)
            patents = get_patent(url, form, proxie)
        failed_tag = 0
        # -1 looks like a "skip this page" sentinel from get_patent - TODO
        # confirm.  If the retries were exhausted patents is still None and
        # the lookup below raises inside the try, which is logged.
        if patents != -1:
            try:
                for x in patents['titles']:
                    logger.info('title:' + x)
                    if socketio:
                        socketio.sleep(1)
                        socketio.emit('my_response', {'data': 'title:' + x},
                                      namespace='/patent')
                store(patents, str(content['_id']))
            except Exception as e:
                logger.debug(e)
                logger.debug('插入数据库失败...')
        i += 50
def get_count(**kwargs):
    """Return the number of documents in the selected collection.

    *kwargs* are forwarded to ``mongoConnection.mongoConnection``.  The
    original note names ``patent_new`` as the default collection; that
    default lives in ``mongoConnection`` and is not visible here.
    """
    return mongoConnection.mongoConnection(**kwargs).collection.count()
def initial_spider():
    """Return every ``video.spider`` task document, with ``_id`` excluded."""
    store = mongoConnection.mongoConnection(db='video', collection='spider')
    return list(store.collection.find({}, {'_id': 0}))
def auto_run(proccess_num=10):
    """Crawl every stored patent spider task using a pool of workers.

    Args:
        proccess_num: number of workers in the ``ThreadPool``.

    Each ``patent.spider`` document is crawled with ``click(URL, task)``.
    The call blocks until every task has finished, then the pool is closed
    and joined.  Returns ``None``.
    """
    mongo = mongoConnection.mongoConnection(db='patent', collection='spider')
    tasks = list(mongo.collection.find({}))

    def _crawl(pair):
        # Pool.map passes each item as ONE positional argument, so the
        # (url, task) pair is unpacked here; click needs both arguments.
        return click(*pair)

    pool = ThreadPool(proccess_num)
    try:
        pool.map(_crawl, [(URL, task) for task in tasks])
    finally:
        # Always release the workers, even if a crawl raised.
        pool.close()
        pool.join()
def get_count(**kwargs):
    """Return the number of documents in the selected collection.

    *kwargs* are forwarded to ``mongoConnection.mongoConnection``.  The
    original note names ``paper_new`` as the default collection; that
    default lives in ``mongoConnection`` and is not visible here.
    """
    return mongoConnection.mongoConnection(**kwargs).collection.count()
def db_find(key):
    """Return a cursor over all documents, projecting only field *key*.

    Uses the default connection of ``mongoConnection.mongoConnection()``;
    the cursor is returned as-is, not materialised.
    """
    store = mongoConnection.mongoConnection()
    return store.collection.find({}, {key: 1})