Esempio n. 1
0
class LandAuctionExtractor(DefaultExtractor):
    """Entity extractor for land-auction pages.

    Normalizes the acreage unit out of the raw acreage string and fills in
    missing province/city fields via the province parser.
    """

    def __init__(self, topic_info, log):
        DefaultExtractor.__init__(self, topic_info, log)
        # Fields whose extracted values are list-shaped.
        self.lst_schema = [u'building_usage', u'years', u'decoration']
        # Resolves province/city names from free text and phone prefixes.
        self.province_parser = ProvinceParser(province_city, phone_city,
                                              region_city, city_city)

    def format_extract_data(self, extract_data, topic_id):
        '''Format the extracted record (original comment: 格式化数据).

        Returns a deep copy of extract_data with `acreage_unites`,
        `province` and `city` filled in.
        '''
        entity_data = copy.deepcopy(extract_data)

        # Acreage unit: whatever remains after stripping digits and the dot.
        acreage = extract_data.get('acreage', '')
        acreage_unit = re.sub(u'[\d.]', '', acreage)
        entity_data['acreage_unites'] = acreage_unit

        # Province and city: fall back to parsing the approving-unit +
        # district text when the record does not carry them.
        text = entity_data.get('approved_unit', '') + entity_data.get(
            'district', '')
        province = entity_data.get('province', '')
        city = entity_data.get('city', '')

        if not province:
            province = self.province_parser.get_province(text, 1)
        if not city:
            city = self.province_parser.get_region(text, 1)
        # BUGFIX: '' is a substring of every string, so an empty province
        # used to satisfy `province in city`; only strip a real prefix.
        if province and province in city:
            city = city.replace(province, '')
        entity_data['province'] = province
        entity_data['city'] = city

        return entity_data
Esempio n. 2
0
 def __init__(self):
     # Target (write) and source (read) MongoDB handles, built from the
     # module-level connection config dicts.
     ins_conf = mongo_conf_insert
     self.db_insert = MongDb(ins_conf['host'], ins_conf['port'],
                             ins_conf['final_db'],
                             ins_conf['username'],
                             ins_conf['password'])
     src_conf = mongo_conf
     self.db = MongDb(src_conf['host'], src_conf['port'],
                      src_conf['final_db'],
                      src_conf['username'],
                      src_conf['password'])
     # Resolves province/city names from free text and phone prefixes.
     self.province_parser = ProvinceParser(province_city, phone_city,
                                           region_city, city_city)
     # Read from and write back to the same collection.
     self.sourceTable = 'land_auction'
     self.targetTable = 'land_auction'
Esempio n. 3
0
class CleanLandAuction:
    """Cleans land-auction records: fills in province/city and bulk-writes
    them back to MongoDB."""

    def __init__(self):
        # Target (write) and source (read) MongoDB handles.
        self.db_insert = MongDb(mongo_conf_insert['host'], mongo_conf_insert['port'], mongo_conf_insert['final_db'],
                           mongo_conf_insert['username'],
                           mongo_conf_insert['password'])
        self.db = MongDb(mongo_conf['host'], mongo_conf['port'], mongo_conf['final_db'],
                         mongo_conf['username'],
                         mongo_conf['password'])
        # Resolves province/city names from free text and phone prefixes.
        self.province_parser = ProvinceParser(province_city, phone_city, region_city, city_city)
        self.sourceTable = 'land_auction'
        self.targetTable = 'land_auction'

    def do_clean(self, entity_data):
        """Fill missing province/city on entity_data, then enqueue it on the
        module-level q_data queue."""
        # Derive the location from approving-unit plus district text.
        text = entity_data.get('approved_unit', '') + entity_data.get('district', '')
        province = entity_data.get('province', '')
        city = entity_data.get('city', '')
        if not province:
            province = self.province_parser.get_province(text, 1)
        if not city:
            city = self.province_parser.get_region(text, 1)
        # BUGFIX: '' is a substring of every string, so an empty province used
        # to satisfy `province in city`; only strip a real province prefix.
        if province and province in city:
            city = city.replace(province, '')
        entity_data['province'] = province
        entity_data['city'] = city

        q_data.put(entity_data)

    def insert_info_batch(self, table, lst, is_order=False, insert=False):
        """Bulk-write `lst` into `table`: inserts when insert=True, otherwise
        per-document $set updates keyed on _id (which is popped from each item).

        BUGFIX: the original guard `lst != None and len(lst) == 0` let a None
        lst through and crashed on iteration; None and empty both return now.
        """
        if not lst:
            return
        dbtemp = self.db_insert.db[table]
        bulk = dbtemp.initialize_ordered_bulk_op() if is_order else dbtemp.initialize_unordered_bulk_op()
        for item in lst:
            _record_id = item.get("_record_id", "")
            print(_record_id)
            item["_utime"] = toolsutil.get_now_time()
            if insert:
                bulk.insert(item)
            else:
                _id = item.pop('_id')
                bulk.find({'_id': _id}).update({'$set': item})
        try:
            # w=0: fire-and-forget write, no acknowledgement requested.
            bulk.execute({'w': 0})
            print('insert_logs:' + str(len(lst)))
        except Exception:
            # Best-effort batch write: log the traceback and carry on.
            print(traceback.format_exc())
Esempio n. 4
0
        content_ret = self.address_index.query(content)
        if content_ret:
            pos = content_ret[-1][0][0]
            address_content = content[pos:pos + self.content_length]
            region = self.province_parser.get_region(address_content,1)

        return region


if __name__ == "__main__":
    # Ad-hoc debug harness: fetch sample awarded-bid records from MongoDB and
    # run the region parser over one hard-coded announcement.
    from i_entity_extractor.common_parser_lib.province_parser import ProvinceParser
    from pymongo import MongoClient

    # NOTE(review): `litigants`, `num`, `begin_time` and `cursor` are set up
    # but never used below — debug leftovers.
    litigants = ""
    sys.path.append('../../')
    # Dictionary files: province, phone-prefix, region and city mappings.
    province_parser = ProvinceParser('../../dict/province_city.conf', '../../dict/phonenum_city.conf',
                                     '../../dict/region_city.conf',"../../dict/city.conf")

    obj = BidRegionParser(province_parser)

    # NOTE(review): hard-coded production MongoDB host/port.
    host = '101.201.102.37'
    port = 28019
    database = 'final_data'
    coll = 'bid_detail'
    client = MongoClient(host, port)
    db = client[database][coll]
    # Sample of awarded ("中标") bids; the cursor is lazy and never consumed.
    cursor = db.find({'bid_type': '中标'}).limit(1000).skip(100)
    num = 0
    begin_time = time.time()
    content = "山东\t招标网\t(\thttp://www.sdzbw.com\t)与你携手共同发展!\t\t\t\t\t\t\t一、采购项目名称:医用臭氧治疗仪\t\t二、采购项目编号:JNCZJZ-2015-795\t\t三、采购项目分包情况:\t\t\t\t\t\t\t包号\t\t\t\t货物服务名称\t\t\t\t供应商资格要求\t\t\t\t本包预算金额\t\t\t\t\t\t未分包\t\t\t\t医用臭氧治疗仪\t\t\t\t(1)符合《政府采购法》第二十二条规定的条件;(2)具有本次招标项目的生产或经营范围,有能力提供本次采购项目及所要求的服务\t\t\t\t48万元\t\t\t\t\t\t\t四、获取谈判(磋商)文件:\t\t1.时间:2016年03月18日至2016年03月25日16点整\t\t2.地点:济南市政务服务中心政府采购部网站\t\t3.方式:网站“采购公告”栏目中,在对应项目公告最下方自行下载谈判文件及报名\t\t4.售价:免费\t\t五、递交响应文件时间及地点\t\t时间:2016年03月31日 08:30—09:30 (北京时间)\t\t地点:济南市市中区站前路9号市政务服务中心1号楼\t\t六、谈判(开启)时间及地点\t\t时间:2016年03月31日 09:30 (北京时间)\t\t地点:济南市市中区站前路9号市政务服务中心1号楼\t\t七、联系方式\t\t1.采购人:济南市民族医院\t\t地址:济南市民族医院\t\t联系人:陈俊杰\t\t联系方式:0531-86060159\t\t2.采购代理机构:济南市政务服务中心政府采购部\t\t地址:济南市市中区站前路9号市政务服务中心1号楼\t\t联系人:曹伯军\t\t联系方式:0531-68967547\t\t\t\t\t\t\t\t\t\t\t\t来源:山东招标网-山东最具权威网站,专业的招标采购网站\t\t\t[打印本页]\t\t\t[关闭本页]"

    ret = obj.do_parser(content)
Esempio n. 5
0
 def __init__(self, topic_info, log):
     # Let the base extractor wire up topic/logging state first.
     DefaultExtractor.__init__(self, topic_info, log)
     # Fields whose extracted values are list-shaped.
     self.lst_schema = [u'building_usage', u'years', u'decoration']
     # Resolves province/city names from free text and phone prefixes.
     self.province_parser = ProvinceParser(
         province_city, phone_city, region_city, city_city)
Esempio n. 6
0
    def __init__(self, logger, mysql_conf, crawl_conf=None):
        """Open the linkbase / webpage / final_data / schedule_data MongoDB
        handles (connection settings come from mongo_config) and build the
        province parser.

        crawl_conf: optional realtime-crawl API config; a None default
        replaces the original shared-mutable {} default (backward compatible).
        """
        self.logger = logger
        self.mysql_conf = mysql_conf
        self.crawl_conf = {} if crawl_conf is None else crawl_conf
        self.table_name = '__STATISTICS__'
        # schedule level -> refresh interval in days
        self.schedule_map = {
            0: 1,
            1: 7,
            2: 30
        }

        # linkbase
        self.mongodb_linkbase = self.mongo_config("linkbase")
        self.linkbase_conn, self.linkbase_db = self._open_db(self.mongodb_linkbase)

        # webpage
        self.mongodb_webpage = self.mongo_config("webpage")
        self.webpage_conn, self.webpage_db = self._open_db(self.mongodb_webpage)

        # final_data
        self.mongodb_final_data = self.mongo_config("mongodb")
        self.final_data_conn, self.final_data_db = self._open_db(self.mongodb_final_data)
        self.enterprise_data = self.final_data_db['enterprise_data_gov']

        # Dictionary files for the province/city resolver.
        dict_dir = crawler_basic_path + '/i_entity_extractor/dict/'
        self.province_parser = ProvinceParser(dict_dir + 'province_city.conf',
                                              dict_dir + 'phonenum_city.conf',
                                              dict_dir + 'region_city.conf',
                                              dict_dir + 'city.conf')

        # schedule_data
        self.mongodb_schedule_data = self.mongo_config("schedule_data")
        # BUGFIX: the original took this database from self.webpage_conn, so
        # all schedule reads/writes silently went to the webpage server.
        self.schedule_data_conn, self.schedule_data_db = self._open_db(self.mongodb_schedule_data)
        self.schedule_data = self.schedule_data_db['enterprise_data']
        self.enterprise_data_pending = self.schedule_data_db['enterprise_data_pending']

    def _open_db(self, conf):
        """Connect to the MongoDB described by conf and return
        (client, authenticated database) — factored out of the four
        copy-pasted connect/auth stanzas."""
        conn = pymongo.MongoClient(host=conf['host'], port=int(conf['port']))
        db = conn[conf['database']]
        username = conf.get('username', '')
        password = conf.get('password', '')
        if username and password:
            db.authenticate(username, password)
        return conn, db
Esempio n. 7
0
class ScheduleDebug:
    """Debug/ops console for the enterprise crawl scheduler.

    Opens the linkbase / webpage / final_data / schedule_data MongoDB
    databases (settings come from a MySQL `Settings` table) and provides
    helpers to inspect linkbase/webpage records and to import, query and
    re-schedule companies.
    """

    def __init__(self, logger, mysql_conf, crawl_conf=None):
        """crawl_conf: optional realtime-crawl API config; a None default
        replaces the original shared-mutable {} default (backward compatible)."""
        self.logger = logger
        self.mysql_conf = mysql_conf
        self.crawl_conf = {} if crawl_conf is None else crawl_conf
        self.table_name = '__STATISTICS__'
        # schedule level -> refresh interval in days
        self.schedule_map = {
            0: 1,
            1: 7,
            2: 30
        }

        # linkbase
        self.mongodb_linkbase = self.mongo_config("linkbase")
        self.linkbase_conn, self.linkbase_db = self._open_db(self.mongodb_linkbase)

        # webpage
        self.mongodb_webpage = self.mongo_config("webpage")
        self.webpage_conn, self.webpage_db = self._open_db(self.mongodb_webpage)

        # final_data
        self.mongodb_final_data = self.mongo_config("mongodb")
        self.final_data_conn, self.final_data_db = self._open_db(self.mongodb_final_data)
        self.enterprise_data = self.final_data_db['enterprise_data_gov']

        # Dictionary files for the province/city resolver.
        dict_dir = crawler_basic_path + '/i_entity_extractor/dict/'
        self.province_parser = ProvinceParser(dict_dir + 'province_city.conf',
                                              dict_dir + 'phonenum_city.conf',
                                              dict_dir + 'region_city.conf',
                                              dict_dir + 'city.conf')

        # schedule_data
        self.mongodb_schedule_data = self.mongo_config("schedule_data")
        # BUGFIX: the original took this database from self.webpage_conn, so
        # all schedule reads/writes silently went to the webpage server.
        self.schedule_data_conn, self.schedule_data_db = self._open_db(self.mongodb_schedule_data)
        self.schedule_data = self.schedule_data_db['enterprise_data']
        self.enterprise_data_pending = self.schedule_data_db['enterprise_data_pending']

    def _open_db(self, conf):
        """Connect to the MongoDB described by conf and return
        (client, authenticated database) — factored out of the four
        copy-pasted connect/auth stanzas."""
        conn = pymongo.MongoClient(host=conf['host'], port=int(conf['port']))
        db = conn[conf['database']]
        username = conf.get('username', '')
        password = conf.get('password', '')
        if username and password:
            db.authenticate(username, password)
        return conn, db

    def mongo_config(self, mongo_item="mongodb"):
        """Load one MongoDB connection config from the MySQL Settings table.

        Exits the whole process on any failure (debug tool: fail fast).
        """
        try:
            mongodb_conf = {}
            engine = create_engine(self.mysql_conf)
            Settings.metadata.create_all(engine)
            self.Dsession = sessionmaker(bind=engine)
            session = self.Dsession()
            query = session.query(Settings)
            records = query.filter(Settings.item == mongo_item).all()
            for record in records:
                # NOTE(review): the host is hard-coded, overriding whatever is
                # stored in Settings — looks like a debug leftover; confirm
                # before removing.
                mongodb_conf['host'] = '172.17.1.119'
                mongodb_conf['port'] = record.value['port']
                mongodb_conf['database'] = record.value['database']
                mongodb_conf['username'] = record.value['user']
                mongodb_conf['password'] = record.value['password']
                break
            session.close()
            return mongodb_conf
        except Exception:
            self.logger.error(traceback.format_exc())
            os._exit(1)

    def to_LinkAttr(self, body):
        """Deserialize a thrift-encoded LinkAttr from `body`.

        Returns the populated LinkAttr, or None on a decode error.
        BUGFIX: the original never returned link_info, so callers always
        received None even on success.
        """
        link_info = LinkAttr()
        try:
            tMemory_o = TMemoryBuffer(body)
            tBinaryProtocol_o = TBinaryProtocol(tMemory_o)
            link_info.read(tBinaryProtocol_o)
            return link_info
        except Exception:
            print(traceback.format_exc())
            return None

    def find_linkbase(self, link_url="http://www.baidu.com/"):
        """Look up link_url in the linkbase and return its link_attr flattened
        into a JSON-able dict (None values kept, nested structs via vars())."""
        url_struct = get_url_info(link_url)
        domain = url_struct.get("domain", "baidu.com")
        urls_info = self.linkbase_db[domain].find({'url': link_url})
        for url_info in urls_info:
            json_dict = {}
            link_str = url_info['link_attr']
            # NOTE(review): pickle.loads on data read back from the DB — only
            # safe while the linkbase is a trusted internal store.
            link_attr = pickle.loads(link_str)
            if not link_attr:
                return None
            if not link_attr.url:
                link_attr.url = link_url
            link_attr = vars(link_attr)
            for key, val in link_attr.items():
                if not val:
                    json_dict[key] = None
                elif key == "crawl_info" or key == "parent_info" or key == "page_info" or key == 'extract_message':
                    json_dict[key] = vars(val)
                elif key == "normal_crawl_his" and val:
                    json_dict[key] = []
                    for his in val:
                        json_dict[key].append(vars(his))
                else:
                    json_dict[key] = str(val)
            return json_dict

    def find_webpage(self, link_url="http://www.baidu.com/"):
        """Return the first stored webpage record for link_url, with its
        _id stringified; None when nothing is found."""
        url_struct = get_url_info(link_url)
        domain = url_struct.get("domain", "baidu.com")
        urls_info = self.webpage_db[domain].find({'url': link_url})
        for url_info in urls_info:
            url_info['_id'] = str(url_info['_id'])
            return url_info

    def schedule_company(self, company_name, level, province):
        """Classify where company_name sits in the crawl pipeline.

        Returns a stat dict whose 'type' is one of need_update / need_invest /
        need_zonggongsi / new_crawl / not_find.
        """
        # NOTE(review): the province argument is unconditionally overridden
        # (the original guard is commented out) — confirm that is intended.
        province = self.province_parser.get_province(company_name)
        company_stat = {"update": 0, 'for_schedule': 0, 'in_schedule': 0,
                        'company': company_name, 'level': level, 'province': province}
        company_infos = self.data_db[self.enterprise_table].find({"company": company_name})
        for company_info in company_infos:
            company_info['_id'] = str(company_info['_id'])
            company_stat['update'] += 1
            company_stat['in_schedule'] += 1
            company_stat['level'] = 1
            company_stat['province'] = company_info['province']
            company_stat['type'] = 'need_update'
            return company_stat

        branch_infos = self.company_data['enterprise_list_branch_invest'].find({"_id": company_name})
        for branch in branch_infos:
            company_stat['in_schedule'] += 1
            company_stat['level'] = 0
            company_stat['crawl_status'] = branch['crawl_status']
            company_stat['type'] = 'need_invest'
            return company_stat

        branch_infos = self.company_data['enterprise_list_zonggongsi'].find({"_id": company_name})
        for branch in branch_infos:
            company_stat['in_schedule'] += 1
            company_stat['level'] = 2
            company_stat['crawl_status'] = branch['crawl_status']
            company_stat['type'] = 'need_zonggongsi'
            return company_stat

        branch_infos = self.company_data['enterprise_list_diff_11_02'].find({"_id": company_name})
        for branch in branch_infos:
            company_stat['in_schedule'] += 1
            company_stat['level'] = 3
            company_stat['crawl_status'] = branch['crawl_status']
            company_stat['type'] = 'new_crawl'
            return company_stat

        branch_infos = self.company_data['enterprise_list_all'].find({"_id": company_name})
        for branch in branch_infos:
            company_stat['in_schedule'] += 1
            company_stat['level'] = 3
            company_stat['type'] = 'new_crawl'
            return company_stat

        # Not found anywhere in the pipeline.
        company_stat['level'] = 4
        company_stat['type'] = 'not_find'
        return company_stat

    def import_schedule(self, company="", province="", level=0, user="", need_crawl=True):
        """Register `company` for scheduling on behalf of `user`.

        Companies absent from the enterprise base are also queued once in
        enterprise_data_pending. Returns {'company', 'in_base' (0/1),
        optionally 'need_crawl': True}.
        """
        update_time = time.strftime('%Y-%m-%d %H:%M:%S')
        company_infos = self.enterprise_data.find({'company': company})
        data = {"company": company, 'in_base': 0}
        if not province:
            province = self.province_parser.get_province(company)
        is_exit = 0
        for company_info in company_infos:
            data['in_base'] = 1
            is_exit = 1
        schedule_data = {"_id": company, "users": [], "_utime": update_time,
                         'province': province, 'level': level, 'exit': is_exit}
        company_infos = self.schedule_data.find({"_id": company})
        is_in = False
        for company_info in company_infos:
            is_in = True
            schedule_data["users"] = company_info["users"]
            if not (user in company_info["users"]) and user:
                schedule_data["users"].append(user)
        if not is_exit and self.enterprise_data_pending.count({'_id': company}) <= 0:
            self.enterprise_data_pending.insert(schedule_data)
        if not is_in:
            if user:
                schedule_data["users"].append(user)
            self.schedule_data.insert(schedule_data)
        else:
            self.schedule_data.update({"_id": company}, {'$set': schedule_data})
        if need_crawl:
            # Realtime crawl trigger itself is disabled; only flag the intent.
            data['need_crawl'] = True
        return data

    def get_schedule_list(self, user="", start=0, limit=10):
        """Return a page of the companies scheduled for `user`."""
        start = int(start)
        limit = int(limit)
        data = {"user": user, 'size': 0, "total": 0, "start": start, "limit": limit, "result": []}
        data["total"] = self.schedule_data.find({"users": user}).count()
        company_infos = self.schedule_data.find({"users": user}).skip(start).limit(limit)
        for company_info in company_infos:
            data["size"] += 1
            update_time = company_info.get("_utime", "")
            company_name = company_info['_id']
            level = company_info.get('level', 2)
            data["result"].append({"_name": company_name, "_utime": update_time, 'level': level})

        return data

    def import_companies(self, companies="", user=""):
        """Import newline-separated rows of "company[<TAB>province[<TAB>level]]"
        for `user`; returns {'total', 'in_base'} counters."""
        stat_info = {"in_base": 0, "total": 0}
        infos = companies.split("\n")
        company_name = ""
        province = ""
        level = 2
        for company in infos:
            pars = company.strip().split('\t')
            company_name = pars[0]
            # NOTE(review): province/level carry over to later rows that omit
            # them — confirm this sticky behavior is intended.
            if len(pars) >= 2 and len(pars[1]) > 0:
                province = pars[1]
            if len(pars) >= 3 and len(pars[2]) > 0:
                level = int(pars[2])
            stat_info["total"] += 1
            data = self.import_schedule(company_name, province, level, user, False)
            stat_info["in_base"] += data['in_base']
        return stat_info

    def schedule_companies(self, companies_str):
        """Schedule a batch of "company[<TAB>level[<TAB>province]]" lines.

        Companies updated within the last 7 days are skipped; the rest are
        inserted into (or refreshed in) the schedule table. Returns counters.
        """
        companies_info = {}
        companies_name = []
        pars = companies_str.split('\n')
        for par in pars:
            company_info = {'level': 4, 'province': '', 'company': ''}
            companies_pars = par.split('\t')
            if len(companies_pars) >= 1:
                company_info['company'] = companies_pars[0].strip()
            if len(companies_pars) >= 2:
                company_info['level'] = int(companies_pars[1].strip())
            if len(companies_pars) >= 3:
                company_info['province'] = companies_pars[2].strip()
            if not company_info['province']:
                company_info['province'] = self.province_parser.get_province(company_info['company'])
            if company_info['company']:
                company = company_info['company']
                if company not in companies_info:
                    companies_name.append(company)
                companies_info[company] = company_info
        company_stat = {"update": 0, 'for_schedule': 0, 'in_schedule': 0, 'all': len(companies_info)}
        cursor = self.data_db[self.enterprise_table].find(
            {"company": {'$in': companies_name}}, {'company': 1, '_in_time': 1, '_utime': 1})
        for company_info in cursor:
            company_stat['update'] += 1
            company = company_info['company'].encode('utf8')
            u_time = time.mktime(dateparser.parse(company_info['_utime']).timetuple())
            # Skip companies refreshed within the last 7 days.
            if company in companies_info and time.time() - u_time < 86400 * 7:
                del companies_info[company]
        update_time = time.strftime('%Y-%m-%d %H:%M:%S')
        for company, company_info in companies_info.items():
            cursor = self.data_db[self.enterprise_schedule_table].find({"company": company})
            for info in cursor:
                company_info['_in_time'] = info.get('_in_time', update_time)
                break
            if '_in_time' in company_info:
                company_stat['in_schedule'] += 1
                company_info['_utime'] = update_time
                self.data_db[self.enterprise_schedule_table].update({"company": company}, company_info)
            else:
                company_info['_in_time'] = update_time
                self.data_db[self.enterprise_schedule_table].insert(company_info)
                company_stat['for_schedule'] += 1

        return company_stat

    def get_company(self, company):
        """Return basic registration info for `company` (or a 5-row sample
        when company is empty)."""
        data_db = self.data_conn['final_data']
        company_infos = []
        if company:
            cursor = data_db[self.enterprise_table].find({"company": company})
        else:
            cursor = data_db[self.enterprise_table].find().limit(5)
        for company_info in cursor:
            info = {}
            info['company'] = company_info['company']
            info['registered_code'] = company_info['registered_code']
            info['registered_date'] = company_info['registered_date']
            company_infos.append(info)
        return company_infos

    def get_news(self, company):
        """Return marketing news records for `company` (or a 5-row sample
        when company is empty), with _id stripped."""
        data_db = self.data_conn['marketing']
        company_infos = []
        if company:
            cursor = data_db['marketing_news'].find({"company": company})
        else:
            # NOTE(review): the fallback reads self.enterprise_table, not
            # 'marketing_news' — looks inconsistent with the branch above;
            # confirm before changing.
            cursor = data_db[self.enterprise_table].find().limit(5)
        for company_info in cursor:
            del company_info['_id']
            company_infos.append(company_info)
        return company_infos

    def realtime_crawl(self, company):
        """Trigger a realtime crawl of `company` via the configured HTTP API.

        Returns {'status': bool, 'msg': str}; never raises.
        """
        url = self.crawl_conf.get('api', '')
        query = self.crawl_conf.get('query', '')
        if url and query and company:
            url = url + "?" + query + "=" + company
            try:
                data = urllib2.urlopen(url, timeout=10).read()
                if data and data.find('crawling') >= 0:
                    return {"status": True, "msg": data}
                else:
                    return {"status": False, "msg": data}
            except Exception:
                return {"status": False, "msg": traceback.format_exc()}
        return {"status": False, "msg": "not crawl_conf or company empty"}

    def clear_schedule_data(self):
        """Recompute need_crawl for every scheduled company.

        Level meaning: 0 = daily, 1 = weekly, 2 = monthly refresh window.
        A company is flagged need_crawl=1 when its last update is older than
        its level's window; companies missing from the base get need_crawl=0.
        """
        company_infos = self.schedule_data.find({}).sort(
            [("level", pymongo.ASCENDING), ("_utime", pymongo.ASCENDING)])
        for company_info in company_infos:
            company_name = company_info.get('_id', '')
            update_time = company_info.get('_utime', '')
            level = int(company_info.get('level', '2'))
            if level < 0:
                level = 2
            days_num = self.schedule_map.get(level, 30)
            now_time = time.time()
            company_info = self.enterprise_data.find({'company': company_name})
            need_crawl = 0
            mod_value = {}
            if company_info.count() > 0:
                update_time = company_info[0].get('_utime', "1985-12-17 00:00:00")
                province = company_info[0].get('province', "")
                utime = time.mktime(time.strptime(update_time, '%Y-%m-%d %H:%M:%S'))
                # Stale beyond the level's window -> flag for re-crawl.
                if utime < now_time - 86400 * days_num:
                    need_crawl = 1
                mod_value['need_crawl'] = need_crawl
                if province:
                    mod_value['province'] = province
                self.schedule_data.update({'_id': company_name}, {"$set": mod_value})
                self.logger.info("update\tcompany:%s\texit:1\tlevel:%s" % (company_name, level))
            else:
                mod_value = {}
                mod_value['need_crawl'] = need_crawl
                self.schedule_data.update({'_id': company_name}, {"$set": mod_value})
                self.logger.info("crawl\tcompany:%s\texit:0\tneed_crawl:%s" % (company_name, 1))
        return None