Example #1
 def __init__(self):
     self.mongo = MongoDB()
     self.news_url_queue = news_url_Queue()  # holds news URLs for multi-threaded crawling
     self.news_html_queue = news_url_Queue()  # holds fetched news HTML
     self.old_day_news_queue = news_url_Queue()
     # self.log = Logging('../helloword/static/sina').get_logging()
     self.log = Logging('../Sina/sina.txt').get_logging()
Example #2
    def __init__(self):
        if crawler_config.debug:
            self.db = MongoDB('test_stock_crawler')
        else:
            self.db = MongoDB('stock_crawler')

        self.url = ""
Example #3
 def store(self):
     """ A function in this class to store a media, a movie or a show into the database """
     client = MongoDB.setupConnection()
     try:
         db = client[DATABASE]
         collection = db[COLLECTION]
         return collection.insert_one(self).inserted_id
     except Exception as error:
         logger.Error("Error in Media class, store function: ", str(error))
     finally:
         MongoDB.closeConnection(client)
Example #4
 def remove(_id):
     """ A function for removing a media from the database using its id."""
     client = MongoDB.setupConnection()
     db = client[DATABASE]
     collection = db[COLLECTION]
     try:
         if _id:
             return collection.remove({"_id": ObjectId(_id)})
     except Exception as error:
         logger.Error("Error in Media class, remove function: ", str(error))
     finally:
         MongoDB.closeConnection(client)
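The store/remove/retrieve snippets in this listing all lean on a shared MongoDB helper plus module-level DATABASE and COLLECTION constants that are not shown. A minimal sketch of what such a helper might look like with pymongo; the names and defaults here are assumptions, not the original implementation:

from pymongo import MongoClient

DATABASE = "media_db"      # assumed names; the originals are not shown
COLLECTION = "media"

class MongoDB:

    @staticmethod
    def setupConnection(uri="mongodb://localhost:27017/"):
        # Hand back a client; callers pick the database and collection themselves.
        return MongoClient(uri)

    @staticmethod
    def closeConnection(client):
        if client is not None:
            client.close()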
Example #5
 def read_data_from_coupon_db(self, id=None):
     flag = 0
     print "id = ",id
     if not id:
         return MongoDB().read_from_coupon_collection()
     else:
         for data_entry in MongoDB().read_from_coupon_collection():
             if id in data_entry.values():
                 print "data_entry = ", data_entry
                 reqd_data = data_entry
                 flag = 1
         if flag == 1:
             del reqd_data['_id']
             return reqd_data
         else:
             return 'Data not available'
Example #6
def main():
    m = MongoDB()
    tasks = []

    all_pairs = m.get_all_pairs()

    for address in all_pairs[:100]:
        pair = Web3.toChecksumAddress(address)
        task = log_loop(pair, 60)
        tasks.append(task)
    print('{} Starting...'.format(len(tasks)))

    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(asyncio.gather(*tasks))
    finally:
        loop.close()
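On Python 3.7+ the same fan-out can be written with asyncio.run(), which creates and closes the event loop for you. A sketch assuming the MongoDB wrapper and the log_loop coroutine behave as in the snippet above:

import asyncio

from web3 import Web3


async def main_async():
    m = MongoDB()  # assumed wrapper from the snippet above
    tasks = [log_loop(Web3.toChecksumAddress(addr), 60)
             for addr in m.get_all_pairs()[:100]]
    print('{} Starting...'.format(len(tasks)))
    # gather runs all pair-watching coroutines concurrently
    await asyncio.gather(*tasks)


if __name__ == '__main__':
    asyncio.run(main_async())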
Example #7
class Crawler:

    def __init__(self):
        if crawler_config.debug:
            self.db = MongoDB('test_stock_crawler')
        else:
            self.db = MongoDB('stock_crawler')

        self.url = ""

    @property
    def type(self):
        return self.__class__.__name__

    def insert_or_update(self, data):
        log = logger.getChild( self.type + '.insert_or_update')
        log.setLevel(logging.INFO)

        log.debug('insert data: {}'.format(data))
        collection = 'stock'

        document = self.db.query(collection).find_one({'type': data['type'], 'date': data['date'], 'code': data['code']})
        if document is None:
            self.db.insert(collection, data=data)
            log.debug('insert data: {}'.format(data))
                

    def crawling(self, url, encoding='cp949'):
        log = logger.getChild(self.type + '.crawling')
        request = Request(url, headers={
            'User-Agent':
                'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) '
                'Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'})
        try:
            handle = urlopen(request)
        except URLError:
            log.error('may be, url host changed: {}'.format(url))
            return None
        data = handle.read()
        soup = BeautifulSoup(data.decode(encoding, 'ignore'), "html.parser", from_encoding="utf-8")

        return soup
Example #8
    def crawl_zhilian(self, city, keyword):
        #url_list = []  # TODO: turn url_list into a stack
        begin_url = 'https://fe-api.zhaopin.com/c/i/sou?start={page}&pageSize=90&cityId={city}&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw={keyword}&kt=3'
        database = MongoDB('zhilian', self.keywords[keyword])

        url_list = self._get_list(begin_url, city, keyword, page_weight=90)

        print(keyword, city, 'list parser done!')
        print(len(url_list))

        self._get_content(database, url_list)
Example #9
 def reload(_id, key, value):
     """ A function to update a media based on its id. Attribute of the media as key and value of the attribute as
      value will be passed to this function as its arguments as well as media id."""
     global movie
     client = MongoDB.setupConnection()
     db = client[DATABASE]
     collection = db[COLLECTION]
     try:
         if _id:
             movie = collection.update_one({
                 '_id': _id
             }, {
                 '$set': {
                     key: value
                 }
             }, upsert=True)  # To avoid inserting the same document more than once
         return movie
     except Exception as error:
         logger.Error("Error in Media class, reload function: ", str(error))
     finally:
         MongoDB.closeConnection(client)
Example #10
    def retrieve(_id, key, value):
        """ A function to retrieve a media, a movie or a show, using the media id or an attribute of the media or both
        , id and an attribute. For the attribute of a media, a key and its value needs to be defined, like
        'release year' as key and '2013' as its value."""
        client = MongoDB.setupConnection()
        try:
            db = client[DATABASE]
            logger.Info("Retrieved database: ", str(db))
            collection = db[COLLECTION]
            logger.Info("Retrieved collection: ", str(collection))
            if _id is None:
                medias = collection.find({key: value}).sort('_id', pymongo.ASCENDING)
                result = []
                for film in medias:
                    # logger.Info("Retrieved media: ", str(JSONEncoder().encode(film)))
                    result.append(film)

                if len(result) == 1:
                    return JSONEncoder().encode(result[0])
                elif len(result) > 1:
                    ms = []
                    for r in result:
                        ms.append(JSONEncoder().encode(r))
                    return ms
                else:
                    return []

            if key is None and value is None:
                media = collection.find_one({"_id": ObjectId(_id)})
                return JSONEncoder().encode(media)

            if (_id is not None) and (key is not None) and (value is not None):
                media = collection.find_one({"$and": [{"_id": ObjectId(_id)}, {key: value}]})
                return JSONEncoder().encode(media)

        except Exception as error:
            logger.Error("Error in Media class, retrieve function: ", str(error))
        finally:
            MongoDB.closeConnection(client)
Example #11
def handle_login(account):
    conta = MongoDB.login(account["username"], account["password"])
    if conta:
        if conta["license"]['to_date'] < time.time():
            Updater.expire_warning()
            return False
        conta['password'] = account["password"]
        conta["license"]['from_date'] = datetime.fromtimestamp(
            conta["license"]['from_date']).strftime('%d/%m/%Y')
        conta["license"]['to_date'] = datetime.fromtimestamp(
            conta["license"]['to_date']).strftime('%d/%m/%Y')
        return conta
    return False
Example #12
    def retrieveAll(key, value, limit, offset):
        """ A function to retrieve all media, movies or shows, using an attribute of the media such as type For the
        attribute of a media, a key and its value needs to be defined, like 'release year' as key and '2013'
        as its value."""
        client = MongoDB.setupConnection()
        try:
            db = client[DATABASE]
            logger.Info("Retrieved database: ", str(db))
            collection = db[COLLECTION]
            logger.Info("Retrieved collection: ", str(collection))
            # Setting up pagination based on limit and offset
            starting = collection.find({key: value}).sort('_id', pymongo.ASCENDING)
            L_id = starting[offset]['_id']
            medias = collection.find({"$and": [{'_id': {"$gte": L_id}}, {key: value}]}).sort('_id', pymongo.ASCENDING).limit(limit)
            result = []
            for media in medias:
                result.append(JSONEncoder().encode(media))

            return result

        except Exception as error:
            logger.Error("Error in Media class, retrieve function: ", str(error))
        finally:
            MongoDB.closeConnection(client)
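Judging by the log messages, retrieve and retrieveAll are meant to be static methods on a Media class; a hypothetical call using the pagination above (the class name, key, and value are assumptions, the key/value pair follows the docstring's own example):

    one_movie = Media.retrieve(None, 'release year', '2013')
    first_page = Media.retrieveAll('type', 'movie', limit=20, offset=0)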
Example #13
    def crawl_liepin(self, city, keyword):
        begin_url = "https://www.liepin.com/city-{city}/zhaopin/pn{page}/?d_pageSize=40&jobKind=2&key={keyword}"
        database = MongoDB('liepin', self.keywords[keyword])

        url_list = self._get_list(begin_url,
                                  city,
                                  keyword,
                                  page_begin=0,
                                  web_name='liepin')

        print(keyword, city, 'list parser done!')
        if url_list:
            print(len(url_list))

        self._get_content(database, url_list, web_name='liepin')
Example #14
    def crawl_qiancheng(self, city, keyword):
        begin_url = 'https://search.51job.com/list/{city},000000,0000,00,9,99,{keyword},2,{page}.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
        database = MongoDB('qiancheng', self.keywords[keyword])

        url_list = self._get_list(begin_url,
                                  city,
                                  keyword,
                                  page_begin=1,
                                  web_name='qiancheng')

        print(keyword, city, 'list parser done!')
        if url_list:
            print(len(url_list))

        self._get_content(database, url_list, web_name='qiancheng')
Example #15
    def get_all_status_thread(self,
                              status_list=[],
                              collect_name='status',
                              trim_user=True,
                              include_entities=True):

        wrapper_func = handle_exception(self.get_status)

        db = MongoDB().connect()
        collect = db[collect_name]

        while len(status_list) > 0:
            status_id = status_list.pop(0)
            status_obj = wrapper_func(status_id)

            status = self.tweetobj_to_dict(status_obj)

            if not status:
                continue

            try:
                collect.insert_one(status)
            except Exception as e:
                continue
Example #16
        decimals = 18
    except Exception as e:
        traceback.print_exc()
        sys.exit(-1)
    print('{} {}({}) {}'.format(token, name, symbol, decimals))
    doc = {
        'address': token,
        'name': name,
        'symbol': symbol,
        'decimals': decimals,
    }
    return doc


if __name__ == '__main__':
    db = MongoDB()
    tokens = db.get_all_tokens()
    parse_tokens = []
    for token in tokens:
        if db.get_token(token) is None:
            parse_tokens.append(token)
            # if len(parse_tokens) == 6:
            #     break
    print(len(parse_tokens))

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = []
        for token in parse_tokens:
            futures.append(executor.submit(parse_token, token))
        for future in concurrent.futures.as_completed(futures):
            doc = future.result()
Example #17
    def get_user_all_timeline(self,
                              user_id=None,
                              collect_name="tweets_task",
                              screen_name=None,
                              include_rts=True,
                              exclude_replies=False):

        if user_id == None and screen_name == None:
            return None

        if user_id:
            try:
                user_id = long(user_id)
            except Exception as e:
                print e
                return None

        flag = True
        tweets = [0]
        sleep_count = 0

        db = MongoDB().connect()
        collect = db[collect_name]
        get_api = self.get_api

        while len(tweets) > 0:
            try:
                if flag:
                    tweets = get_api().GetUserTimeline(
                        user_id=user_id,
                        screen_name=screen_name,
                        include_rts=include_rts,
                        exclude_replies=exclude_replies,
                        trim_user=True,
                        count=200)
                    flag = False

                else:
                    tweets = get_api().GetUserTimeline(
                        user_id=user_id,
                        screen_name=screen_name,
                        include_rts=include_rts,
                        exclude_replies=exclude_replies,
                        trim_user=True,
                        count=200,
                        max_id=tweets[-1].id - 1)

            except error.TwitterError as te:
                try:
                    if te.message == 'Not authorized.':
                        print 'Not authorized.'
                        return

                    if te.message[0]['code'] == 88:
                        sleep_count += 1

                        if sleep_count >= API_COUNT:
                            print "sleeping..."
                            sleep_count = 0
                            time.sleep(300)
                        continue

                    else:
                        print te
                        break
                except Exception as ee:
                    print ee
                    break
            except Exception as e:
                break

            for tt in tweets:
                tweet = self.tweetobj_to_dict(tt)

                if not tweet:
                    continue

                try:
                    collect.insert_one(tweet)
                except Exception as e:
                    continue
Example #18
 def write_data_to_product_db(self, info):
     MongoDB().write_to_product_collection(info)
Example #19
        logger.info(f'Converting: {src_collection.__name__} ({db.get_collection_size(src_collection)})'
                    f' -> {dst_collection.__name__} ({db.get_collection_size(dst_collection)})')

    docs = src_collection.objects()
    total_count = docs.count()
    for current_count, src_doc in enumerate(docs):
        log_progress(current_count, total_count)

        try:
            mapped_doc = map_document(src_doc)
        except (DocumentConversionError, DocumentConstructionError) as e:
            logger.warning(f'Skipping: {src_doc} because of: {e}')
            continue

        mapped_doc.create_or_update()

    with db.connect():
        logger.info(f'Total {dst_collection.__name__} count: ({db.get_collection_size(dst_collection)})')
        logger.info(f'Documents.Conclusion count: ({db.get_collection_size(Documents.Conclusion)})')
        logger.info(f'Documents.License count: ({db.get_collection_size(Documents.License)})')


if __name__ == '__main__':
    logger = root_logger('convert_data', logging.INFO)
    try:
        db = MongoDB()  # credentials for MongoDB can be set up here
        convert_data(Documents.FileRaw, Documents.File)
        logger.info('Success')
    except Exception as e:
        logger.info(e, exc_info=True)
Example #20
 def save_tweets(self, count=1):
     database = MongoDB("verificacion")
     coll = database.collection("tweets")
     tweets = self._user_timeline(count)
     for tweet in tweets:
         coll.insert({"tweet": tweet})
Example #21
# influx.py
import time
from datetime import datetime
import concurrent

from helper import DeFiContract
from web3 import Web3
from web3 import exceptions
from kfk import KafkaDB
from database import MongoDB

k = KafkaDB()
m = MongoDB()


def scrapReserves(pair_address):
    pair = Web3.toChecksumAddress(pair_address)
    contract = DeFiContract(pair, 'Pair')
    r0, r1, _ = contract.getReserves()

    print('{} {} {}'.format(pair, r0, r1))
    doc = {
        'address': pair,
        'r0': r0,
        'r1': r1,
        't': datetime.utcnow().timestamp(),
    }
    return doc


while True:
Example #22
    async def get_and_store(self, device):
        """ Get snmp infomation and add to database
        """
        mongo = MongoDB()

        host = device.ip
        community = device.snmp_community
        port = device.snmp_port

        results = await asyncio.gather(
            asyncio.ensure_future(get_system_info(host, community, port)),
            asyncio.ensure_future(get_routes(host, community, port)),
            asyncio.ensure_future(get_ip_addr(host, community, port)),
            asyncio.ensure_future(get_interfaces(host, community, port)),
            asyncio.ensure_future(get_cdp(host, community, port)),
            # asyncio.ensure_future(get_lldp(host, community, port)), # Todo
        )

        if all(r is None for r in results):
            logging.debug("SNMP Server for device ip %s is gone down", host)
            return

        system_info = results[0]
        routes = results[1]
        ip_addrs = results[2]
        interfaces = results[3]
        # CDP
        cdp = results[4]
        # LLDP
        # lldp = results[5]

        # Todo optimize this
        # for if_index, interface in enumerate(interfaces):
        #     for ip_index, ip_addr in enumerate(ip_addrs):
        #         if interface['index'] == ip_addr['if_index']:
        #             interface['ipv4_address'] = ip_addr['ipv4_address']
        #             interface['subnet'] = ip_addr['subnet']

        for if_index in range(len(interfaces)):
            for ip_index in range(len(ip_addrs)):
                if interfaces[if_index]['index'] == ip_addrs[ip_index][
                        'if_index']:
                    interfaces[if_index]['ipv4_address'] = ip_addrs[ip_index][
                        'ipv4_address']
                    interfaces[if_index]['subnet'] = ip_addrs[ip_index][
                        'subnet']
                    break

        # print(interfaces[0])
        my_device = mongo.db.device.find_one({'device_ip': host})

        if my_device:
            for interface in interfaces:
                for my_interface in my_device['interfaces']:
                    if interface['description'] == my_interface['description']:
                        # In
                        in_octets = interface['in_octets'] - my_interface[
                            'in_octets']
                        in_in_time = system_info['uptime'] - my_device['uptime']
                        bw_in_usage_percent = sdn_utils.cal_bw_usage_percent(
                            in_octets, interface['speed'], in_in_time)
                        # Out
                        out_octets = interface['out_octets'] - my_interface[
                            'out_octets']
                        out_in_time = system_info['uptime'] - my_device[
                            'uptime']
                        bw_out_usage_percent = sdn_utils.cal_bw_usage_percent(
                            out_octets, interface['speed'], out_in_time)

                        # Add information
                        interface['bw_in_usage_octets'] = in_octets
                        interface['bw_in_usage_percent'] = bw_in_usage_percent

                        interface['bw_out_usage_octets'] = out_octets
                        interface[
                            'bw_out_usage_percent'] = bw_out_usage_percent

                        interface['bw_usage_update'] = time.time()

                        logging.debug(' || BW in usage %.3f || %d bytes',
                                      bw_in_usage_percent, in_octets)

                        logging.debug(' || BW out usage %.3f || %d bytes',
                                      bw_out_usage_percent, out_octets)
                        break

        system_info['interfaces'] = interfaces

        # Clear old routes
        mongo.db.route.delete_many({'device_ip': host})

        # Insert net routes
        mongo.db.route.insert_many(routes)
        mongo.db.device.update_one({'ipv4_address': host}, {'$set': system_info},
                                   upsert=True)

        # Insert CDP
        mongo.db.cdp.update_one({'device_ip': host},
                                {'$set': {
                                    'device_ip': host,
                                    'neighbor': cdp
                                }},
                                upsert=True)
Example #23
 def drop_product_table(self):
     MongoDB().drop_product_collection()
Example #24
 def write_data_to_coupon_db(self, info):
     MongoDB().write_to_coupon_collection(info)
Example #25
def getCookies(weibo):
    db = MongoDB()
    cookies = []
    loginURL = 'https://passport.weibo.cn/signin/login'

    if db.Cookies.count() < 10:

        print '-----------------------------------------'
        print 'Start crawl cookies'
        print '-----------------------------------------'

        for elem in weibo:
            account = elem['no']
            password = elem['psw']
            item = {'account':account}
            if db.find_cookie(item):
                continue
            try:
                driver = webdriver.Chrome()
                driver.get(loginURL)
                time.sleep(2)

                failure = 0
                while "登录 - 新浪微博" in driver.title and failure < 5:
                    failure += 1
                    driver.set_window_size(1920, 1080)
                    username = driver.find_element_by_id("loginName")
                    username.clear()
                    username.send_keys(account)

                    psd = driver.find_element_by_id("loginPassword")
                    psd.clear()
                    psd.send_keys(password)

                    commit = driver.find_element_by_id("loginAction")
                    commit.click()
                    time.sleep(10)

                # cookie=driver.get_cookies()
                # print cookie
                cookie = {}
                if "微博 - 随时随地发现新鲜事" in driver.title:
                    for elem in driver.get_cookies():
                        cookie[elem["name"]] = elem["value"]
                    if len(cookie) > 0:
                        item = {'account': account,
                                'password': password, 'cookie': cookie}
                        db.Cookies.insert_one(item)
                        cookies.append(cookie)
                        print "*******************************"
                        print "Get Cookie Successful: %s!!!!!!" % account
                        print "*******************************"
                        continue
                print "*******************************"
                print "Get Cookie Failed: %s!" % account
                print "*******************************"

            except Exception, e:
                print "*******************************"
                print "%s Failure!!!!!" % account
                print e
                print "*******************************"

            finally:
Example #26
def run_server():
    global db
    db = MongoDB()
    run(host=host, port=port)
    db.clear_db()
Example #27
    # data['临床表现'] = data['临床表现'].str.replace(r'等.*?(?:\s|$)|[,。,.、;;]', ' ')
    # data['临床表现'] = data['临床表现'].str.replace(r'或|常伴|伴有?|发生|[轻甚]则|甚至', '')
    # data['临床表现'] = data['临床表现'].str.replace(r'[^\s]{9,}', ' ')
    # data['临床表现'] = data['临床表现'].str.replace(r'\s+', ' ')
    data['临床表现'] = data['临床表现'].str.strip()


    data.drop_duplicates('病证', 'last', inplace=True)
    # data = data['临床表现'].str.split().tolist()
    # data = [j for i in data for j in i]
    # counter = Counter(data)
    
    # print(data['临床表现'])
    # data.to_excel('bz2.xls', index=False)
    # bz = pd.read_excel('bz.xls')[['病症', '临床表现']]
    mongo = MongoDB()
    food_info = mongo.find_all('diet_merge', projection={'name': 1, 'ingredients': 1, 'syndrome': 1})

    food_info_df = pd.DataFrame(data=[[f['name'], f['ingredients'], f['syndrome']] for f in food_info], columns=['食疗方', '食材', '主治'])
    
    food_info_df['存在关联'] = 0
    food_info_df['主治'] = food_info_df['主治'].str.replace('证', '').str.replace('型', '')
    food_info_df.loc[food_info_df['主治'] != '', '主治'] = food_info_df['主治'] + '证'
    food_bz = [item['syndrome'] for item in food_info]
    food_bz = list(filter(None, food_bz))
    print('Food(Total): {}'.format(len(food_bz)))
    food_bz = set([item.replace('证', '').replace('型', '') for item in food_bz])
    results = []
    n_valid_bz = 0
    valid_bz_set = set()
    for item in food_bz:
Example #28
from core import PastePwn
from scraping.pastebin import PastebinScraper
from database import MongoDB

logdir_path = os.path.dirname(os.path.abspath(__file__))
logfile_path = os.path.join(logdir_path, "logs", "pastepwn.log")

if not os.path.exists(os.path.join(logdir_path, "logs")):
    os.makedirs(os.path.join(logdir_path, "logs"))

logfile_handler = logging.handlers.WatchedFileHandler(logfile_path, "a", "utf-8")

logger = logging.getLogger(__name__)
logging.basicConfig(format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.DEBUG, handlers=[logfile_handler, logging.StreamHandler()])

# Framework code
database = MongoDB(ip="192.168.240.128")

pastepwn = PastePwn(database)
pastepwn.add_scraper(PastebinScraper())

telegram_action = TelegramAction(token="token", receiver="-1001348376474")

mail_analyzer = MailAnalyzer(telegram_action)
premium_analyzer = WordAnalyzer(telegram_action, "premium")

pastepwn.add_analyzer(mail_analyzer)
pastepwn.add_analyzer(premium_analyzer)

pastepwn.start()
Example #29
 def drop_coupon_table(self):
     MongoDB().drop_coupon_collection()
Example #30
# -*- coding: utf-8 -*-

from database import MongoDB
import check, os
from telegram.ext import Updater, CommandHandler, MessageHandler, Filters

Token = "376593798:AAHMNABESGpXiFGiQ8Bg-0CnHc2EwyXD1hk"

updater = Updater(token=Token)

dispatcher = updater.dispatcher

mongodb = MongoDB()

admins = ["utkucanbykl", "vlademir92", "badgeekluck"]

users = ["utkucanbykl", "vlademir92", "badgeekluck"]


def start(bot, update):

    bot.sendMessage(chat_id=update.message.chat_id, text="Bot çalışıyor.")


def hello(bot, update):

    bot.sendMessage(chat_id=update.message.chat_id,
                    text="Hello " + update.message.from_user.first_name)


def echo(bot, update):
Example #31
class getnews(object):
    def __init__(self):
        self.mongo = MongoDB()
        self.news_url_queue = news_url_Queue()  # holds news URLs for multi-threaded crawling
        self.news_html_queue = news_url_Queue()  # holds fetched news HTML
        self.old_day_news_queue = news_url_Queue()
        # self.log = Logging('../helloword/static/sina').get_logging()
        self.log = Logging('../Sina/sina.txt').get_logging()

    def run(self, nums):
        # Start timing: measures how long one full crawl takes
        time_0 = time.time()

        # Collect all of today's news URLs and push them onto news_url_queue
        self.get_news_url()
        time.sleep(5)

        # Load previously stored URL records from the database into old_day_news_queue
        self.read_url_info()

        # For each entry in old_day_news_queue, check whether its comments have grown;
        # if so, re-queue the URL in news_url_queue, otherwise delete its record from
        # the database, until the queue is empty
        thread_list3 = [
            threading.Thread(target=self.judge_comment) for i in range(nums)
        ]
        for t in thread_list3:
            t.start()
        for t in thread_list3:
            if t.is_alive():
                t.join()
        time.sleep(5)

        # Remove duplicate URLs
        xqueue = set(self.news_url_queue.queue)
        self.news_url_queue.queue = list(xqueue)

        # Fetch the article HTML for each URL in news_url_queue and push it onto
        # news_html_queue, until the queue is empty
        thread_list = [
            threading.Thread(target=self.get_news_html) for i in range(nums)
        ]
        for t in thread_list:
            t.start()
        for t in thread_list:
            if t.is_alive():
                t.join()
        time.sleep(5)
        print '新闻个数:   ' + str(len(self.news_html_queue.queue))

        # Parse each entry in news_html_queue: store the article body, crawl and store
        # its comments, and save the URL record to the database, until the queue is empty
        thread_list2 = [
            threading.Thread(target=self.get_message) for i in range(nums)
        ]
        for x in thread_list2:
            x.start()
        for x in thread_list2:
            if x.is_alive():
                x.join()
        print("结束: ", time.time() - time_0, "\n")

    def get_news_url(self):
        URL_LIST = [
            'http://news.sina.com.cn/society/', 'http://ent.sina.com.cn/',
            'http://sports.sina.com.cn/', 'http://finance.sina.com.cn/',
            'http://news.sina.com.cn/china/'
        ]
        re_list = [
            'http://news.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://ent.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://sports.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://finance.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml',
            'http://news.sina.com.cn/[a-z]+/[a-z]+/\d{4}-\d{2}-\d{2}/doc-[a-z]{8}\d{7}.shtml'
        ]

        time_today = time.strftime("%Y-%m-%d", time.localtime(time.time()))
        for channel in range(0, 5):
            URL = URL_LIST[channel]
            while 1:
                print '新闻版块:     ' + URL
                try:
                    html = requests.get(URL, timeout=30).content
                    break
                except Exception as e:
                    self.log.info(
                        'can not get the source page for news urllist')
                    # print e
            re_ = re_list[channel]
            news_url_list = re.findall(re_, html)
            print '本版块个数:     ' + str(len(news_url_list))
            for j in news_url_list:
                this_time = re.search('\d{4}-\d{2}-\d{2}', j).group(0)
                if this_time == time_today:
                    self.news_url_queue.queue.append(j)
                else:
                    pass

    def read_url_info(self):
        try:
            self.old_day_news_queue.queue = self.mongo.get_urls()
        except Exception as e:
            self.log.info('function read_url_info() error!')
            self.log.info(e)

    def judge_comment(self):
        while len(self.old_day_news_queue.queue):
            try:
                info = self.old_day_news_queue.out_queue()
                url = info['_id']
                comment_count = self.getCommentNumber(url)
                flag = comment_count - info['comment_count']
                if flag >= 20:
                    self.news_url_queue.queue.append(url)
                else:
                    self.mongo.delete_url(url)
            except Exception as e:
                self.log.info('function judge_comment() error')
                self.log.info(e)

    def get_news_html(self):
        while len(self.news_url_queue.queue):
            i = self.news_url_queue.out_queue()
            try:
                html = requests.get(i, timeout=30).content.decode()
                # print i
                self.news_html_queue.in_queue(html)
            except Exception as e:
                self.log.info('can not get this page of html' + i)
                self.log.info(e)

    def get_message(self):
        while len(self.news_html_queue.queue):
            try:
                i = self.news_html_queue.out_queue()
                # Skip mobile-version pages
                if re.findall(r'<meta property="og:url" content="(.*?)" />',
                              i):
                    news_url = re.findall(
                        r'<meta property="og:url" content="(.*?)" />', i)[0]
                else:
                    continue

                ping_lun_shu_liang = self.getCommentNumber(news_url)
                yue_du_shu = None
                if ping_lun_shu_liang:
                    all_page = ping_lun_shu_liang / 20
                    comment_url_list = []
                    for page in xrange(1, all_page + 1):
                        newsid = re.findall(r'([a-z]{7}\d{7})\.shtml',
                                            news_url)[0]
                        channel = re.findall(r'http://(.*?).sina', news_url)[0]
                        if (channel == 'finance'):
                            channel = 'cj'
                        elif (channel == 'sports'):
                            channel = 'ty'
                        elif (channel == 'ent'):
                            channel = 'yl'
                        else:
                            channel = re.findall(r'com\.cn/([a-z]+)/',
                                                 news_url)[0]
                            if (channel == 's'):
                                channel = 'sh'
                            else:
                                channel = 'gn'
                        comment_url = 'http://comment5.news.sina.com.cn/page/info?format=js&channel=%s&newsid=comos-%s&group=&compress=1&ie=gbk&oe=gbk&page=%s&page_size=20' % (
                            channel, newsid, page)
                        comment_url_list.append(comment_url)
                    for com_url in comment_url_list:
                        self.get_comment(news_url, com_url)

                else:
                    ping_lun_shu_liang = 0

                tree = etree.HTML(i)
                message_dict = dict()
                url_info = dict()

                # Article URL
                wen_zhang_wang_zhi = news_url
                message_dict['wen_zhang_wang_zhi'] = wen_zhang_wang_zhi
                # Article title
                wen_zhang_biao_ti = pathOneNode(
                    tree, '//div[@class="main-content w1240"]/h1/text()')

                message_dict['wen_zhang_biao_ti'] = wen_zhang_biao_ti

                # Publication time
                fa_bu_shi_jian = pathOneNode(
                    tree, '//div[@class="date-source"]/span/text()')
                if not fa_bu_shi_jian:
                    fa_bu_shi_jian = re.findall(
                        '<span class="titer">(.*?)</span>', i)[0]

                fa_bu_shi_jian = re.findall('(\d{4}.*\d{2})',
                                            fa_bu_shi_jian)[0]
                # print news_url+fa_bu_shi_jian
                message_dict['fa_bu_shi_jian'] = fa_bu_shi_jian
                # Comment count
                ping_lun_shu_liang = ping_lun_shu_liang
                message_dict['ping_lun_shu_liang'] = ping_lun_shu_liang

                # Article source
                # (//div[@class="article article_16"]/p[2]/text())
                wen_zhang_lai_yuan = pathOneNode(
                    tree,
                    '//div[@class="date-source"]/a/text()| //div[@class="date-source"]/span[@class="source ent-source"]/text()|//div[@class="date-source"]/span[@class="source"]/text()'
                )
                message_dict['wen_zhang_lai_yuan'] = wen_zhang_lai_yuan

                # Article body
                wen_zhang_zheng_wen = tree.xpath(
                    '//div[@class="article"]/p/text()')
                wen_zhang_zheng_wen = ''.join(wen_zhang_zheng_wen)
                #print wen_zhang_zheng_wen
                message_dict['wen_zhang_zheng_wen'] = wen_zhang_zheng_wen

                # Crawl time
                do_time = time.time()
                message_dict['do_time'] = do_time

                # Crawl site
                zhan_dian = u'新浪网'
                message_dict['zhan_dian'] = zhan_dian

                # Image links
                tu_pian_lian_jie = tree.xpath(
                    '//div[@class="img_wrapper"]/img/@src')
                if tu_pian_lian_jie:
                    tu_pian_lian_jie = ' '.join(tu_pian_lian_jie)
                    if tu_pian_lian_jie.startswith('http:'):
                        tu_pian_lian_jie = tu_pian_lian_jie
                    else:
                        tu_pian_lian_jie = 'http:' + tu_pian_lian_jie
                    message_dict['tu_pian_lian_jie'] = tu_pian_lian_jie
                else:
                    message_dict['tu_pian_lian_jie'] = None

                    # Article section
                # wen_zhang_lan_mu = pathAllNode(tree,
                #                                '(//div[@class="bread"]/a)|(//div[@class="bread"]/span)|(//div[@class="nav-g__breadcrumb layout-fl"]/a)|(//div[@class="text notInPad"]/a)')
                # message_dict['wen_zhang_lan_mu'] = wen_zhang_lan_mu
                #
                # Article author
                if tree.xpath(
                        '(//p[@class="article-editor"]/text())|(//p[@class="show_author"]/text())'
                ):
                    wen_zhang_zuo_zhe = pathOneNode(
                        tree,
                        '(//p[@class="article-editor"]/text())|(//p[@class="show_author"]/text())'
                    )
                else:
                    wen_zhang_zuo_zhe = '佚名'
                message_dict['wen_zhang_zuo_zhe'] = wen_zhang_zuo_zhe

                # Keywords
                if tree.xpath('//div[@class="keywords"]'):
                    guan_jian_ci = tree.xpath(
                        '//div[@class="keywords"]/a/text()')

                    guan_jian_ci = ' '.join(guan_jian_ci)
                else:
                    guan_jian_ci = None
                message_dict['guan_jian_ci'] = guan_jian_ci

                # Related tags
                # xiang_guan_biao_qian = pathAllNode(tree,'(//section[@class="article-a_keywords"])|(//p[@class="art_keywords"])')
                # message_dict['xiang_guan_biao_qian'] = xiang_guan_biao_qian
                #
                # Read count
                yue_du_shu = yue_du_shu
                message_dict['yue_du_shu'] = yue_du_shu

                # Primary key
                message_dict['_id'] = news_url

                # print json.dumps(message_dict, ensure_ascii=False, indent=4)
                print '剩余未爬取新闻个数' + str(len(self.news_html_queue.queue))
                url_info['_id'] = news_url
                url_info['comment_count'] = ping_lun_shu_liang
                url_info['do_time'] = do_time
                self.mongo.put_url(url_info)
                self.mongo.put_content(message_dict)
            except Exception as e:
                self.log.info(e)
                # print e

    def getCommentNumber(self, news_url):
        newsid = re.findall(r'([a-z]{7}\d{7})\.shtml', news_url)[0]
        channel = re.findall(r'http://(.*?).sina', news_url)[0]
        if (channel == 'finance'):
            channel = 'cj'
        elif (channel == 'sports'):
            channel = 'ty'
        elif (channel == 'ent'):
            channel = 'yl'
        else:
            channel = re.findall(r'com\.cn/([a-z]+)/', news_url)[0]
            if (channel == 's'):
                channel = 'sh'
            else:
                channel = 'gn'
        comment_url = 'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel=%s&newsid=comos-%s&group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20' % (
            channel, newsid)
        flag = 1
        while 1:
            try:
                x = requests.get(comment_url, timeout=30).content
                # print x
                json_object = json.loads(
                    re.findall('var data=([\S\s]+)', x)[0])
                break
            except Exception as e:
                flag += 1
                self.log.info('failed to get this page of comments')
                # print e
            if flag > 5:
                return
        # read count
        # yue_du_shu = json_object['join_count']
        # comment count
        try:
            ping_lun_shu_liang = json_object['result']['count']['show']
        except Exception as e:
            ping_lun_shu_liang = 0
        #return yue_du_shu
        return ping_lun_shu_liang

    def get_comment(self, news_url, comment_url):
        # Start fetching comments
        try:
            # print comment_url
            json_object = json.loads(
                requests.get(comment_url,
                             timeout=30).content.replace('var data=', ''))
            # print json_object
            comment_dict = dict()
            for item in json_object['result']['cmntlist']:
                # URL of the commented article
                news_url = news_url
                # comment content
                ping_lun_nei_rong = item["content"]
                comment_dict['ping_lun_nei_rong'] = ping_lun_nei_rong

                # comment time
                ping_lun_shi_jian = item["time"]
                comment_dict['ping_lun_shi_jian'] = ping_lun_shi_jian

                # number of replies
                hui_fu_shu = None
                comment_dict['hui_fu_shu'] = hui_fu_shu

                # number of likes
                dian_zan_shu = item["agree"]
                comment_dict['dian_zan_shu'] = dian_zan_shu

                # comment id
                ping_lun_id = item["mid"]
                comment_dict['ping_lun_id'] = ping_lun_id

                # user nickname
                yong_hu_ming = item["nick"]
                comment_dict['yong_hu_ming'] = yong_hu_ming

                # gender
                xing_bie = None
                comment_dict['xing_bie'] = xing_bie

                # user level
                yong_hu_deng_ji = item["level"]
                comment_dict['yong_hu_deng_ji'] = yong_hu_deng_ji

                # user province
                yong_hu_sheng_fen = item["area"]
                comment_dict['yong_hu_sheng_fen'] = yong_hu_sheng_fen

                # crawl time
                do_time = time.time()
                comment_dict['do_time'] = do_time

                # crawl site
                zhan_dian = u'新浪'
                comment_dict['zhan_dian'] = zhan_dian

                # primary key
                comment_dict['_id'] = ping_lun_id + news_url

                # print json.dumps(comment_dict, ensure_ascii=False, indent=4)
                self.mongo.put_comment(comment_dict)
        except Exception as e:
            self.log.info(e)
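For completeness, a minimal sketch of how the getnews class above might be driven; the thread count is illustrative, and the snippet assumes the class's own dependencies (requests, lxml.etree, MongoDB, Logging, news_url_Queue) are importable:

if __name__ == '__main__':
    crawler = getnews()
    crawler.run(nums=8)  # 8 worker threads per crawl stage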