def store(self):
    """ A function in this class to store a media, a movie or a show into the database """
    client = MongoDB.setupConnection()
    try:
        db = client[DATABASE]
        collection = db[COLLECTION]
        return collection.insert_one(self).inserted_id
    except Exception as error:
        logger.Error("Error in Media class, store function: ", str(error))
    finally:
        MongoDB.closeConnection(client)
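# The Media snippets above and below assume a small MongoDB helper exposing
# setupConnection()/closeConnection(). A minimal sketch of such a helper
# (the connection URI and the class layout are assumptions, not from the source):
from pymongo import MongoClient

class MongoDB:
    @staticmethod
    def setupConnection(uri="mongodb://localhost:27017"):
        # Open a client; callers pick the database and collection themselves.
        return MongoClient(uri)

    @staticmethod
    def closeConnection(client):
        # Release the connection pool held by this client.
        if client is not None:
            client.close()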
def remove(_id):
    """ A function for removing a media from the database using its id."""
    client = MongoDB.setupConnection()
    db = client[DATABASE]
    collection = db[COLLECTION]
    try:
        if _id:
            return collection.remove({"_id": ObjectId(_id)})
    except Exception as error:
        logger.Error("Error in Media class, remove function: ", str(error))
    finally:
        MongoDB.closeConnection(client)
def read_data_from_coupon_db(self, id=None):
    flag = 0
    print "id = ", id
    if not id:
        return MongoDB().read_from_coupon_collection()
    else:
        for data_entry in MongoDB().read_from_coupon_collection():
            if id in data_entry.values():
                print "data_entry = ", data_entry
                reqd_data = data_entry
                flag = 1
        if flag == 1:
            del reqd_data['_id']
            return reqd_data
        else:
            return 'Data not available'
def main():
    m = MongoDB()
    tasks = []
    all_pairs = m.get_all_pairs()
    for address in all_pairs[:100]:
        pair = Web3.toChecksumAddress(address)
        task = log_loop(pair, 60)
        tasks.append(task)
    print('{} Starting...'.format(len(tasks)))
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        loop.close()
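# main() above awaits a log_loop(pair, poll_interval) coroutine that is not shown.
# A minimal sketch of what such a polling coroutine could look like, reusing the
# DeFiContract helper from the influx.py snippet below; the blocking getReserves()
# call and the commented storage sink are assumptions, not documented APIs:
import asyncio
from helper import DeFiContract  # helper module used in the influx.py snippet

async def log_loop(pair, poll_interval):
    contract = DeFiContract(pair, 'Pair')
    while True:
        # Synchronous web3 call; acceptable for a sketch, would block the event loop.
        r0, r1, _ = contract.getReserves()
        print('{} {} {}'.format(pair, r0, r1))
        # persist_reserves(pair, r0, r1)  # hypothetical sink, e.g. Mongo or Kafka
        await asyncio.sleep(poll_interval)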
class Crawler:
    def __init__(self):
        if crawler_config.debug:
            self.db = MongoDB('test_stock_crawler')
        else:
            self.db = MongoDB('stock_crawler')
        self.url = ""

    @property
    def type(self):
        return self.__class__.__name__

    def insert_or_update(self, data):
        log = logger.getChild(self.type + '.insert_or_update')
        log.setLevel(logging.INFO)
        log.debug('insert data: {}'.format(data))
        collection = 'stock'
        document = self.db.query(collection).find_one(
            {'type': data['type'], 'date': data['date'], 'code': data['code']})
        if document is None:
            self.db.insert(collection, data=data)
            log.debug('insert data: {}'.format(data))

    def crawling(self, url, encoding='cp949'):
        log = logger.getChild(self.type + 'crawling')
        request = Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                          'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'})
        try:
            handle = urlopen(request)
        except URLError:
            log.error('may be, url host changed: {}'.format(url))
            return None
        data = handle.read()
        soup = BeautifulSoup(data.decode(encoding, 'ignore'), "html.parser",
                             from_encoding="utf-8")
        return soup
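# The Crawler above goes through a thin MongoDB wrapper that takes a database name
# and exposes query()/insert(). A minimal sketch of such a wrapper, assuming a local
# mongod (host/port defaults are assumptions, not taken from the source):
from pymongo import MongoClient

class MongoDB:
    def __init__(self, db_name, host='localhost', port=27017):
        self._client = MongoClient(host, port)
        self._db = self._client[db_name]

    def query(self, collection):
        # Return the raw pymongo collection so callers can chain find_one() etc.
        return self._db[collection]

    def insert(self, collection, data):
        return self._db[collection].insert_one(data)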
def crawl_zhilian(self, city, keyword):
    # url_list = []
    # TODO: turn url_list into a stack
    begin_url = 'https://fe-api.zhaopin.com/c/i/sou?start={page}&pageSize=90&cityId={city}&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={keyword}&kt=3'
    database = MongoDB('zhilian', self.keywords[keyword])
    url_list = self._get_list(begin_url, city, keyword, page_weight=90)
    print(keyword, city, 'list parser done!')
    print(len(url_list))
    self._get_content(database, url_list)
def reload(_id, key, value):
    """ A function to update a media based on its id. Attribute of the media as key and value of
    the attribute as value will be passed to this function as its arguments as well as media id."""
    global movie
    client = MongoDB.setupConnection()
    db = client[DATABASE]
    collection = db[COLLECTION]
    try:
        if _id:
            movie = collection.update_one(
                {'_id': _id},
                {'$set': {key: value}},
                upsert=True)  # To avoid inserting the same document more than once
            return movie
    except Exception as error:
        logger.Error("Error in Media class, reload function: ", str(error))
    finally:
        MongoDB.closeConnection(client)
def retrieve(_id, key, value):
    """ A function to retrieve a media, a movie or a show, using the media id or an attribute of
    the media or both, id and an attribute. For the attribute of a media, a key and its value
    needs to be defined, like 'release year' as key and '2013' as its value."""
    client = MongoDB.setupConnection()
    try:
        db = client[DATABASE]
        logger.Info("Retrieved database: ", str(db))
        collection = db[COLLECTION]
        logger.Info("Retrieved collection: ", str(collection))
        if _id is None:
            medias = collection.find({key: value}).sort('_id', pymongo.ASCENDING)
            result = []
            for film in medias:
                # logger.Info("Retrieved media: ", str(JSONEncoder().encode(film)))
                result.append(film)
            if len(result) == 1:
                return JSONEncoder().encode(result[0])
            elif len(result) > 1:
                ms = []
                for r in result:
                    ms.append(JSONEncoder().encode(r))
                return ms
            else:
                return []
        if key is None and value is None:
            media = collection.find_one({"_id": ObjectId(_id)})
            return JSONEncoder().encode(media)
        if (_id is not None) and (key is not None) and (value is not None):
            media = collection.find_one({"$and": [{"_id": ObjectId(_id)}, {key: value}]})
            return JSONEncoder().encode(media)
    except Exception as error:
        logger.Error("Error in Media class, retrieve function: ", str(error))
    finally:
        MongoDB.closeConnection(client)
def handle_login(account):
    conta = MongoDB.login(account["username"], account["password"])
    if conta:
        if conta["license"]['to_date'] < time.time():
            Updater.expire_warning()
            return False
        conta['password'] = account["password"]
        conta["license"]['from_date'] = datetime.fromtimestamp(
            conta["license"]['from_date']).strftime('%d/%m/%Y')
        conta["license"]['to_date'] = datetime.fromtimestamp(
            conta["license"]['to_date']).strftime('%d/%m/%Y')
        return conta
    return False
def retrieveAll(key, value, limit, offset):
    """ A function to retrieve all media, movies or shows, using an attribute of the media such
    as type. For the attribute of a media, a key and its value needs to be defined, like
    'release year' as key and '2013' as its value."""
    client = MongoDB.setupConnection()
    try:
        db = client[DATABASE]
        logger.Info("Retrieved database: ", str(db))
        collection = db[COLLECTION]
        logger.Info("Retrieved collection: ", str(collection))
        # Setting up pagination based on limit and offset
        starting = collection.find({key: value}).sort('_id', pymongo.ASCENDING)
        L_id = starting[offset]['_id']
        medias = collection.find({"$and": [{'_id': {"$gte": L_id}}, {key: value}]}).sort(
            '_id', pymongo.ASCENDING).limit(limit)
        result = []
        for media in medias:
            result.append(JSONEncoder().encode(media))
        return result
    except Exception as error:
        logger.Error("Error in Media class, retrieve function: ", str(error))
    finally:
        MongoDB.closeConnection(client)
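# retrieve()/retrieveAll() serialize documents through a JSONEncoder helper that is
# not shown. The usual pattern is a json.JSONEncoder subclass that makes ObjectId
# JSON-safe; a minimal sketch under that assumption:
import json
from bson import ObjectId

class JSONEncoder(json.JSONEncoder):
    def default(self, o):
        # ObjectId is not JSON-serializable by default; fall back to its string form.
        if isinstance(o, ObjectId):
            return str(o)
        return json.JSONEncoder.default(self, o)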
def crawl_liepin(self, city, keyword):
    begin_url = "https://www.liepin.com/city-{city}/zhaopin/pn{page}/?d_pageSize=40&jobKind=2&key={keyword}"
    database = MongoDB('liepin', self.keywords[keyword])
    url_list = self._get_list(begin_url, city, keyword, page_begin=0, web_name='liepin')
    print(keyword, city, 'list parser done!')
    if url_list:
        print(len(url_list))
        self._get_content(database, url_list, web_name='liepin')
def crawl_qiancheng(self, city, keyword):
    begin_url = 'https://search.51job.com/list/{city},000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    database = MongoDB('qiancheng', self.keywords[keyword])
    url_list = self._get_list(begin_url, city, keyword, page_begin=1, web_name='qiancheng')
    print(keyword, city, 'list parser done!')
    if url_list:
        print(len(url_list))
        self._get_content(database, url_list, web_name='qiancheng')
def get_all_status_thread(self, status_list=[], collect_name='status',
                          trim_user=True, include_entities=True):
    wrapper_func = handle_exception(self.get_status)
    db = MongoDB().connect()
    collect = db[collect_name]
    while len(status_list) > 0:
        status_id = status_list.pop(0)
        status_obj = wrapper_func(status_id)
        status = self.tweetobj_to_dict(status_obj)
        if not status:
            continue
        try:
            collect.insert_one(status)
        except Exception as e:
            continue
        decimals = 18
    except Exception as e:
        traceback.print_exc()
        sys.exit(-1)
    print('{} {}({}) {}'.format(token, name, symbol, decimals))
    doc = {
        'address': token,
        'name': name,
        'symbol': symbol,
        'decimals': decimals,
    }
    return doc


if __name__ == '__main__':
    db = MongoDB()
    tokens = db.get_all_tokens()
    parse_tokens = []
    for token in tokens:
        if db.get_token(token) is None:
            parse_tokens.append(token)
            # if len(parse_tokens) == 6:
            #     break
    print(len(parse_tokens))
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for token in parse_tokens:
            futures.append(executor.submit(parse_token, token))
        for future in concurrent.futures.as_completed(futures):
            doc = future.result()
def get_user_all_timeline(self, user_id=None, collect_name="tweets_task",
                          screen_name=None, include_rts=True, exclude_replies=False):
    if user_id == None and screen_name == None:
        return None
    if user_id:
        try:
            user_id = long(user_id)
        except Exception as e:
            print e
            return None
    flag = True
    tweets = [0]
    sleep_count = 0
    db = MongoDB().connect()
    collect = db[collect_name]
    get_api = self.get_api
    while len(tweets) > 0:
        try:
            if flag:
                tweets = get_api().GetUserTimeline(
                    user_id=user_id,
                    screen_name=screen_name,
                    include_rts=include_rts,
                    exclude_replies=exclude_replies,
                    trim_user=True,
                    count=200)
                flag = False
            else:
                tweets = get_api().GetUserTimeline(
                    user_id=user_id,
                    screen_name=screen_name,
                    include_rts=include_rts,
                    exclude_replies=exclude_replies,
                    trim_user=True,
                    count=200,
                    max_id=tweets[-1].id - 1)
        except error.TwitterError as te:
            try:
                if te.message == 'Not authorized.':
                    print 'Not authorized.'
                    return
                if te.message[0]['code'] == 88:
                    sleep_count += 1
                    if sleep_count >= API_COUNT:
                        print "sleeping..."
                        sleep_count = 0
                        time.sleep(300)
                    continue
                else:
                    print te
                    break
            except Exception as ee:
                print ee
                break
        except Exception as e:
            break
        for tt in tweets:
            tweet = self.tweetobj_to_dict(tt)
            if not tweet:
                continue
            try:
                collect.insert_one(tweet)
            except Exception as e:
                continue
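# Both Twitter snippets above persist statuses via a tweetobj_to_dict() helper that
# is not shown. A minimal method sketch for the same class, assuming python-twitter
# Status objects (AsDict() is the library call; reusing the tweet id as '_id' is an
# assumption to make duplicate inserts fail fast):
def tweetobj_to_dict(self, status_obj):
    if status_obj is None:
        return None
    tweet = status_obj.AsDict()
    tweet['_id'] = tweet.get('id')  # assumed primary-key choice, not from the source
    return tweet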
def write_data_to_product_db(self, info):
    MongoDB().write_to_product_collection(info)
    logger.info(f'Converting: {src_collection.__name__} ({db.get_collection_size(src_collection)})'
                f' -> {dst_collection.__name__} ({db.get_collection_size(dst_collection)})')
    docs = src_collection.objects()
    total_count = docs.count()
    for current_count, src_doc in enumerate(docs):
        log_progress(current_count, total_count)
        try:
            mapped_doc = map_document(src_doc)
        except (DocumentConversionError, DocumentConstructionError) as e:
            logger.warning(f'Skipping: {src_doc} because of: {e}')
            continue
        mapped_doc.create_or_update()
    with db.connect():
        logger.info(f'Total {dst_collection.__name__} count: ({db.get_collection_size(dst_collection)})')
        logger.info(f'Documents.Conclusion count: ({db.get_collection_size(Documents.Conclusion)})')
        logger.info(f'Documents.License count: ({db.get_collection_size(Documents.License)})')


if __name__ == '__main__':
    logger = root_logger('convert_data', logging.INFO)
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        convert_data(Documents.FileRaw, Documents.File)
        logger.info('Success')
    except Exception as e:
        logger.info(e, exc_info=True)
def save_tweets(self, count=1):
    database = MongoDB("verificacion")
    coll = database.collection("tweets")
    tweets = self._user_timeline(count)
    for tweet in tweets:
        coll.insert({"tweet": tweet})
# influx.py
import time
from datetime import datetime
import concurrent

from helper import DeFiContract
from web3 import Web3
from web3 import exceptions

from kfk import KafkaDB
from database import MongoDB

k = KafkaDB()
m = MongoDB()


def scrapReserves(pair_address):
    pair = Web3.toChecksumAddress(pair_address)
    contract = DeFiContract(pair, 'Pair')
    r0, r1, _ = contract.getReserves()
    print('{} {} {}'.format(pair, r0, r1))
    doc = {
        'address': pair,
        'r0': r0,
        'r1': r1,
        't': datetime.utcnow().timestamp(),
    }
    return doc


while True:
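    # The loop body is truncated in the source; below is only a minimal sketch of a
    # polling pass in this style, using helpers already imported. The KafkaDB call
    # is a hypothetical placeholder, not a documented method.
    for pair_address in m.get_all_pairs():
        try:
            doc = scrapReserves(pair_address)
            # k.send(doc)  # hypothetical KafkaDB sink for the scraped reserves
        except exceptions.BadFunctionCallOutput:
            continue
    time.sleep(60)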
async def get_and_store(self, device):
    """ Get SNMP information and add it to the database """
    mongo = MongoDB()
    host = device.ip
    community = device.snmp_community
    port = device.snmp_port
    results = await asyncio.gather(
        asyncio.ensure_future(get_system_info(host, community, port)),
        asyncio.ensure_future(get_routes(host, community, port)),
        asyncio.ensure_future(get_ip_addr(host, community, port)),
        asyncio.ensure_future(get_interfaces(host, community, port)),
        asyncio.ensure_future(get_cdp(host, community, port)),
        # asyncio.ensure_future(get_lldp(host, community, port)),  # Todo
    )
    if all(r is None for r in results):
        logging.debug("SNMP Server for device ip %s is gone down", host)
        return
    system_info = results[0]
    routes = results[1]
    ip_addrs = results[2]
    interfaces = results[3]
    # CDP
    cdp = results[4]
    # LLDP
    # lldp = results[5]

    # Todo optimize this
    # for if_index, interface in enumerate(interfaces):
    #     for ip_index, ip_addr in enumerate(ip_addrs):
    #         if interface['index'] == ip_addr['if_index']:
    #             interface['ipv4_address'] = ip_addr['ipv4_address']
    #             interface['subnet'] = ip_addr['subnet']
    for if_index in range(len(interfaces)):
        for ip_index in range(len(ip_addrs)):
            if interfaces[if_index]['index'] == ip_addrs[ip_index]['if_index']:
                interfaces[if_index]['ipv4_address'] = ip_addrs[ip_index]['ipv4_address']
                interfaces[if_index]['subnet'] = ip_addrs[ip_index]['subnet']
                break

    # print(interfaces[0])
    my_device = mongo.db.device.find_one({'device_ip': host})
    if my_device:
        for interface in interfaces:
            for my_interface in my_device['interfaces']:
                if interface['description'] == my_interface['description']:
                    # In
                    in_octets = interface['in_octets'] - my_interface['in_octets']
                    in_in_time = system_info['uptime'] - my_device['uptime']
                    bw_in_usage_percent = sdn_utils.cal_bw_usage_percent(
                        in_octets, interface['speed'], in_in_time)
                    # Out
                    out_octets = interface['out_octets'] - my_interface['out_octets']
                    out_in_time = system_info['uptime'] - my_device['uptime']
                    bw_out_usage_percent = sdn_utils.cal_bw_usage_percent(
                        out_octets, interface['speed'], out_in_time)
                    # Add information
                    interface['bw_in_usage_octets'] = in_octets
                    interface['bw_in_usage_percent'] = bw_in_usage_percent
                    interface['bw_out_usage_octets'] = out_octets
                    interface['bw_out_usage_percent'] = bw_out_usage_percent
                    interface['bw_usage_update'] = time.time()
                    logging.debug(' || BW in usage %.3f || %d bytes',
                                  bw_in_usage_percent, in_octets)
                    logging.debug(' || BW out usage %.3f || %d bytes',
                                  bw_out_usage_percent, out_octets)
                    break
    system_info['interfaces'] = interfaces
    # Clear old routes
    mongo.db.route.delete_many({'device_ip': host})
    # Insert new routes
    mongo.db.route.insert_many(routes)
    mongo.db.device.update_one({'ipv4_address': host},
                               {'$set': system_info},
                               upsert=True)
    # Insert CDP
    mongo.db.cdp.update_one({'device_ip': host},
                            {'$set': {
                                'device_ip': host,
                                'neighbor': cdp
                            }},
                            upsert=True)
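# get_and_store() leans on sdn_utils.cal_bw_usage_percent(octets, speed, seconds),
# which is not shown. A minimal sketch of that calculation, assuming `speed` is the
# interface speed in bits per second (the usual SNMP ifSpeed unit) and `octets` is
# the byte delta observed over `seconds`:
def cal_bw_usage_percent(octets, speed, seconds):
    if not speed or not seconds:
        return 0.0
    bits = octets * 8  # convert the byte delta to bits
    return (bits / float(seconds)) / float(speed) * 100.0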
def drop_product_table(self):
    MongoDB().drop_product_collection()
def write_data_to_coupon_db(self, info):
    MongoDB().write_to_coupon_collection(info)
def getCookies(weibo):
    db = MongoDB()
    cookies = []
    loginURL = 'https://passport.weibo.cn/signin/login'
    if db.Cookies.count() < 10:
        print '-----------------------------------------'
        print 'Start crawl cookies'
        print '-----------------------------------------'
        for elem in weibo:
            account = elem['no']
            password = elem['psw']
            item = {'account': account}
            if db.find_cookie(item):
                continue
            try:
                driver = webdriver.Chrome()
                driver.get(loginURL)
                time.sleep(2)
                failure = 0
                while "登录 - 新浪微博" in driver.title and failure < 5:
                    failure += 1
                    driver.set_window_size(1920, 1080)
                    username = driver.find_element_by_id("loginName")
                    username.clear()
                    username.send_keys(account)
                    psd = driver.find_element_by_id("loginPassword")
                    psd.clear()
                    psd.send_keys(password)
                    commit = driver.find_element_by_id("loginAction")
                    commit.click()
                    time.sleep(10)
                # cookie = driver.get_cookies()
                # print cookie
                cookie = {}
                if "微博 - 随时随地发现新鲜事" in driver.title:
                    for elem in driver.get_cookies():
                        cookie[elem["name"]] = elem["value"]
                    if len(cookie) > 0:
                        item = {'account': account, 'password': password, 'cookie': cookie}
                        db.Cookies.insert_one(item)
                        cookies.append(cookie)
                        print "*******************************"
                        print "Get Cookie Successful: %s!!!!!!" % account
                        print "*******************************"
                        continue
                print "*******************************"
                print "Get Cookie Failed: %s!" % account
                print "*******************************"
            except Exception, e:
                print "*******************************"
                print "%s Failure!!!!!" % account
                print e
                print "*******************************"
            finally:
def run_server():
    global db
    db = MongoDB()
    run(host=host, port=port)
    db.clear_db()
# data['临床表现'] = data['临床表现'].str.replace(r'等.*?(?:\s|$)|[,。,.、;;]', ' ')
# data['临床表现'] = data['临床表现'].str.replace(r'或|常伴|伴有?|发生|[轻甚]则|甚至', '')
# data['临床表现'] = data['临床表现'].str.replace(r'[^\s]{9,}', ' ')
# data['临床表现'] = data['临床表现'].str.replace(r'\s+', ' ')
data['临床表现'] = data['临床表现'].str.strip()
data.drop_duplicates('病证', 'last', inplace=True)
# data = data['临床表现'].str.split().tolist()
# data = [j for i in data for j in i]
# counter = Counter(data)
# print(data['临床表现'])
# data.to_excel('bz2.xls', index=False)
# bz = pd.read_excel('bz.xls')[['病症', '临床表现']]

mongo = MongoDB()
food_info = mongo.find_all('diet_merge', projection={'name': 1, 'ingredients': 1, 'syndrome': 1})
food_info_df = pd.DataFrame(data=[[f['name'], f['ingredients'], f['syndrome']] for f in food_info],
                            columns=['食疗方', '食材', '主治'])
food_info_df['存在关联'] = 0
food_info_df['主治'] = food_info_df['主治'].str.replace('证', '').str.replace('型', '')
food_info_df.loc[food_info_df['主治'] != '', '主治'] = food_info_df['主治'] + '证'
food_bz = [item['syndrome'] for item in food_info]
food_bz = list(filter(None, food_bz))
print('Food(Total): {}'.format(len(food_bz)))
food_bz = set([item.replace('证', '').replace('型', '') for item in food_bz])
results = []
n_valid_bz = 0
valid_bz_set = set()
for item in food_bz:
import os
import logging
import logging.handlers

from core import PastePwn
from scraping.pastebin import PastebinScraper
from database import MongoDB

logdir_path = os.path.dirname(os.path.abspath(__file__))
logfile_path = os.path.join(logdir_path, "logs", "pastepwn.log")
if not os.path.exists(os.path.join(logdir_path, "logs")):
    os.makedirs(os.path.join(logdir_path, "logs"))

logfile_handler = logging.handlers.WatchedFileHandler(logfile_path, "a", "utf-8")
logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                    level=logging.DEBUG,
                    handlers=[logfile_handler, logging.StreamHandler()])

# Framework code
database = MongoDB(ip="192.168.240.128")
pastepwn = PastePwn(database)
pastepwn.add_scraper(PastebinScraper())

telegram_action = TelegramAction(token="token", receiver="-1001348376474")
mail_analyzer = MailAnalyzer(telegram_action)
premium_analyzer = WordAnalyzer(telegram_action, "premium")

pastepwn.add_analyzer(mail_analyzer)
pastepwn.add_analyzer(premium_analyzer)
pastepwn.start()
def drop_coupon_table(self):
    MongoDB().drop_coupon_collection()
# -*- coding: utf-8 -*-
from database import MongoDB
import check, os
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

Token = "376593798:AAHMNABESGpXiFGiQ8Bg-0CnHc2EwyXD1hk"
updater = Updater(token=Token)
dispatcher = updater.dispatcher
mongodb = MongoDB()
admins = ["utkucanbykl", "vlademir92", "badgeekluck"]
users = ["utkucanbykl", "vlademir92", "badgeekluck"]


def start(bot, update):
    bot.sendMessage(chat_id=update.message.chat_id, text="Bot çalışıyor.")


def hello(bot, update):
    bot.sendMessage(chat_id=update.message.chat_id,
                    text="Hello " + update.message.from_user.first_name)


def echo(bot, update):
class getnews(object):
    def __init__(self):
        self.mongo = MongoDB()
        self.news_url_queue = news_url_Queue()  # news URLs to crawl, shared by the worker threads
        self.news_html_queue = news_url_Queue()  # raw news HTML
        self.old_day_news_queue = news_url_Queue()
        # self.log = Logging('../helloword/static/sina').get_logging()
        self.log = Logging('../Sina/sina.txt').get_logging()

    def run(self, nums):
        # start timing a full crawl pass
        time_0 = time.time()
        # collect today's news URLs and push them into news_url_queue
        self.get_news_url()
        time.sleep(5)
        # load previously stored URL info from the database into old_day_news_queue
        self.read_url_info()
        # for each entry in old_day_news_queue, check whether its comments were updated;
        # if so, re-queue the URL, otherwise delete the stored record, until the queue is empty
        thread_list3 = [
            threading.Thread(target=self.judge_comment) for i in range(nums)
        ]
        for t in thread_list3:
            t.start()
        for t in thread_list3:
            if t.is_alive():
                t.join()
        time.sleep(5)
        # deduplicate URLs
        xqueue = set(self.news_url_queue.queue)
        self.news_url_queue.queue = list(xqueue)
        # fetch the article HTML for every queued URL and push it into news_html_queue
        thread_list = [
            threading.Thread(target=self.get_news_html) for i in range(nums)
        ]
        for t in thread_list:
            t.start()
        for t in thread_list:
            if t.is_alive():
                t.join()
        time.sleep(5)
        print 'Number of news articles: ' + str(len(self.news_html_queue.queue))
        # parse every HTML page: store the article, crawl and store its comments,
        # and record the URL info in the database, until the queue is empty
        thread_list2 = [
            threading.Thread(target=self.get_message) for i in range(nums)
        ]
        for x in thread_list2:
            x.start()
        for x in thread_list2:
            if x.is_alive():
                x.join()
        print("Finished: ", time.time() - time_0, "\n")

    def get_news_url(self):
        URL_LIST = [
            'http://news.sina.com.cn/society/', 'http://ent.sina.com.cn/',
            'http://sports.sina.com.cn/', 'http://finance.sina.com.cn/',
            'http://news.sina.com.cn/china/'
        ]
        re_list = [
            'http://news.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://ent.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://sports.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://finance.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://news.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml'
        ]
        time_today = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        for channel in range(0, 5):
            URL = URL_LIST[channel]
            while 1:
                print 'News channel: ' + URL
                try:
                    html = requests.get(URL, timeout=30).content
                    break
                except Exception as e:
                    self.log.info('can not get the source page for news urllist')
                    # print e
            re_ = re_list[channel]
            news_url_list = re.findall(re_, html)
            print 'Articles in this channel: ' + str(len(news_url_list))
            for j in news_url_list:
                this_time = re.search('\d{4}-\d{2}-\d{2}', j).group(0)
                if this_time == time_today:
                    self.news_url_queue.queue.append(j)
                else:
                    pass

    def read_url_info(self):
        try:
            self.old_day_news_queue.queue = self.mongo.get_urls()
        except Exception as e:
            self.log.info('function read_url_info() error!')
            self.log.info(e)

    def judge_comment(self):
        while len(self.old_day_news_queue.queue):
            try:
                info = self.old_day_news_queue.out_queue()
                url = info['_id']
                comment_count = self.getCommentNumber(url)
                flag = comment_count - info['comment_count']
                if flag >= 20:
                    self.news_url_queue.queue.append(url)
                else:
                    self.mongo.delete_url(url)
            except Exception as e:
                self.log.info('function judge_comment() error')
                self.log.info(e)

    def get_news_html(self):
        while len(self.news_url_queue.queue):
            i = self.news_url_queue.out_queue()
            try:
                html = requests.get(i, timeout=30).content.decode()
                # print i
                self.news_html_queue.in_queue(html)
            except Exception as e:
                self.log.info('can not get this page of html' + i)
                self.log.info(e)

    def get_message(self):
        while len(self.news_html_queue.queue):
            try:
                i = self.news_html_queue.out_queue()
                # skip mobile versions of the page
                if re.findall(r'<meta property="og:url" content="(.*?)" />', i):
                    news_url = re.findall(
                        r'<meta property="og:url" content="(.*?)" />', i)[0]
                else:
                    continue
                ping_lun_shu_liang = self.getCommentNumber(news_url)  # comment count
                yue_du_shu = None  # read count
                if ping_lun_shu_liang:
                    all_page = ping_lun_shu_liang / 20
                    comment_url_list = []
                    for page in xrange(1, all_page + 1):
                        newsid = re.findall(r'([a-z]{7}\d{7})\.shtml', news_url)[0]
                        channel = re.findall(r'http://(.*?).sina', news_url)[0]
                        if (channel == 'finance'):
                            channel = 'cj'
                        elif (channel == 'sports'):
                            channel = 'ty'
                        elif (channel == 'ent'):
                            channel = 'yl'
                        else:
                            channel = re.findall(r'com\.cn/([a-z]+)/', news_url)[0]
                            if (channel == 's'):
                                channel = 'sh'
                            else:
                                channel = 'gn'
                        comment_url = 'http://comment5.news.sina.com.cn/page/info?format=js&channel=%s&newsid=comos-%s&group=&compress=1&ie=gbk&oe=gbk&page=%s&page_size=20' % (
                            channel, newsid, page)
                        comment_url_list.append(comment_url)
                    for com_url in comment_url_list:
                        self.get_comment(news_url, com_url)
                else:
                    ping_lun_shu_liang = 0
                tree = etree.HTML(i)
                message_dict = dict()
                url_info = dict()
                # article URL
                wen_zhang_wang_zhi = news_url
                message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi
                # article title
                wen_zhang_biao_ti = pathOneNode(
                    tree, '//div[@class="main-content w1240"]/h1/text()')
                message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti
                # publication time
                fa_bu_shi_jian = pathOneNode(
                    tree, '//div[@class="date-source"]/span/text()')
                if not fa_bu_shi_jian:
                    fa_bu_shi_jian = re.findall(
                        '<span class="titer">(.*?)</span>', i)[0]
                    fa_bu_shi_jian = re.findall('(\d{4}.*\d{2})', fa_bu_shi_jian)[0]
                    # print news_url + fa_bu_shi_jian
                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # comment count
                ping_lun_shu_liang = ping_lun_shu_liang
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang
                # article source
                # (//div[@class="article article_16"]/p[2]/text())
                wen_zhang_lai_yuan = pathOneNode(
                    tree,
                    '//div[@class="date-source"]/a/text()| //div[@class="date-source"]/span[@class="source ent-source"]/text()|//div[@class="date-source"]/span[@class="source"]/text()'
                )
                message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan
                # article body
                wen_zhang_zheng_wen = tree.xpath(
                    '//div[@class="article"]/p/text()')
                wen_zhang_zheng_wen = ''.join(wen_zhang_zheng_wen)
                # print wen_zhang_zheng_wen
                message_dict['wen_zhang_zheng_wen'] = wen_zhang_zheng_wen
                # crawl time
                do_time = time.time()
                message_dict['do_time'] = do_time
                # crawled site
                zhan_dian = u'新浪网'
                message_dict['zhan_dian'] = zhan_dian
                # image links
                tu_pian_lian_jie = tree.xpath(
                    '//div[@class="img_wrapper"]/img/@src')
                if tu_pian_lian_jie:
                    tu_pian_lian_jie = ' '.join(tu_pian_lian_jie)
                    if tu_pian_lian_jie.startswith('http:'):
                        tu_pian_lian_jie = tu_pian_lian_jie
                    else:
                        tu_pian_lian_jie = 'http:' + tu_pian_lian_jie
                    message_dict['tu_pian_lian_jie'] = tu_pian_lian_jie
                else:
                    message_dict['tu_pian_lian_jie'] = None
                # article section
                # wen_zhang_lan_mu = pathAllNode(tree,
                #     '(//div[@class="bread"]/a)|(//div[@class="bread"]/span)|(//div[@class="nav-g__breadcrumb layout-fl"]/a)|(//div[@class="text notInPad"]/a)')
                # message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
                # article author
                if tree.xpath(
                        '(//p[@class="article-editor"]/text())|(//p[@class="show_author"]/text())'
                ):
                    wen_zhang_zuo_zhe = pathOneNode(
                        tree,
                        '(//p[@class="article-editor"]/text())|(//p[@class="show_author"]/text())'
                    )
                else:
                    wen_zhang_zuo_zhe = '佚名'
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe
                # keywords
                if tree.xpath('//div[@class="keywords"]'):
                    guan_jian_ci = tree.xpath(
'//div[@class="keywords"]/a/text()') guan_jian_ci = ' '.join(guan_jian_ci) else: guan_jian_ci = None message_dict['guan_jian_ci'] = guan_jian_ci # print' # 相关标签' # xiang_guan_biao_qian = pathAllNode(tree,'(//section[@class="article-a_keywords"])|(//p[@class="art_keywords"])') # message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian # # print' # 阅读数量' yue_du_shu = yue_du_shu message_dict['yue_du_shu'] = yue_du_shu # print' # 主键' message_dict['_id'] = news_url # print json.dumps(message_dict, ensure_ascii=False, indent=4) print '剩余未爬取新闻个数' + str(len(self.news_html_queue.queue)) url_info['_id'] = news_url url_info['comment_count'] = ping_lun_shu_liang url_info['do_time'] = do_time self.mongo.put_url(url_info) self.mongo.put_content(message_dict) except Exception as e: self.log.info(e) # print e def getCommentNumber(self, news_url): newsid = re.findall(r'([a-z]{7}\d{7})\.shtml', news_url)[0] channel = re.findall(r'http://(.*?).sina', news_url)[0] if (channel == 'finance'): channel = 'cj' elif (channel == 'sports'): channel = 'ty' elif (channel == 'ent'): channel = 'yl' else: channel = re.findall(r'com\.cn/([a-z]+)/', news_url)[0] if (channel == 's'): channel = 'sh' else: channel = 'gn' comment_url = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=%s&newsid=comos-%s&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20' % ( channel, newsid) flag = 1 while 1: try: x = requests.get(comment_url, timeout=30).content # print x json_object = json.loads( re.findall('var data=([\S\s]+)', x)[0]) break except Exception as e: flag += 1 self.log.info('default to get this page of comment') # print e if flag > 5: return # 阅读数 # yue_du_shu = json_object['join_count'] # 评论数 try: ping_lun_shu_liang = json_object['result']['count']['show'] except Exception as e: ping_lun_shu_liang = 0 #return yue_du_shu return ping_lun_shu_liang def get_comment(self, news_url, comment_url): # print '开始获取评论' try: # print comment_url json_object = json.loads( requests.get(comment_url, timeout=30).content.replace('var data=', '')) # print json_object comment_dict = dict() for item in json_object['result']['cmntlist']: # 评论文章url news_url = news_url # 评论内容 ping_lun_nei_rong = item["content"] comment_dict['ping_lun_nei_rong'] = ping_lun_nei_rong # 评论时间 ping_lun_shi_jian = item["time"] comment_dict['ping_lun_shi_jian'] = ping_lun_shi_jian # 回复数量 hui_fu_shu = None comment_dict['hui_fu_shu'] = hui_fu_shu # 点赞数量 dian_zan_shu = item["agree"] comment_dict['dian_zan_shu'] = dian_zan_shu # 评论id ping_lun_id = item["mid"] comment_dict['ping_lun_id'] = ping_lun_id # 用户昵称 yong_hu_ming = item["nick"] comment_dict['yong_hu_ming'] = yong_hu_ming # 性别 xing_bie = None comment_dict['xing_bie'] = xing_bie # 用户等级 yong_hu_deng_ji = item["level"] comment_dict['yong_hu_deng_ji'] = yong_hu_deng_ji # 用户省份 yong_hu_sheng_fen = item["area"] comment_dict['yong_hu_sheng_fen'] = yong_hu_sheng_fen # 抓取时间 do_time = time.time() comment_dict['do_time'] = do_time # 抓取网站 zhan_dian = u'新浪' comment_dict['zhan_dian'] = zhan_dian # 主键 comment_dict['_id'] = ping_lun_id + news_url # print json.dumps(comment_dict, ensure_ascii=False, indent=4) self.mongo.put_comment(comment_dict) except Exception as e: self.log.info(e)