def load_definitions(self, filename):
    print '\nLoading page definitions... (in Mongo)'
    mongo = MongoDB()
    if self.progress:
        num_lines = sum(1 for line in open(filename) if '<page' in line)
    elementnr = 1
    pipe = mongo.pipeline(transaction=False)
    for pageid, title, markup in wikidumps.extract_pages(filename):
        elementnr += 1
        if self.progress and elementnr % 10 == 0:
            cli_progress(elementnr, num_lines)
        try:
            if markup is not None:
                definition = generate_markup_definition(markup)
            else:
                definition = ''
            pipe.set(self.ns.page_definition(str(pageid)), definition)
            # Flush the pipeline in batches of 100
            if len(pipe) >= 100:
                pipe.execute()
        except Exception, e:
            print "Error loading on element: ", elementnr, str(e)
            continue
    # Flush whatever is still queued once the loop finishes
    pipe.execute()
def __init__(self, source_table, aim_table, key_map, unique_key=None):
    '''
    @summary: initialize
    ---------
    @param source_table: source table
    @param aim_table: target table
    @param key_map: mapping from target-table keys to source-table keys, e.g.
        key_map = {
            'aim_key1': 'str_source_key2',           # target key = value of source key, type str
            'aim_key2': 'int_source_key3',           # target key = value of source key, type int
            'aim_key3': 'date_source_key4',          # target key = value of source key, type date
            'aim_key4': 'vint_id',                   # target key = literal value, type int
            'aim_key5': 'vstr_name',                 # target key = literal value, type str
            'aim_key6': 'sint_select id from xxx',   # target key = result of the SQL query, type int
            'aim_key7': 'sstr_select name from xxx'  # target key = result of the SQL query, type str
        }
    @param unique_key: unique key; the target database deduplicates on it
    ---------
    @result:
    '''
    super(ExportData, self).__init__()
    self._source_table = source_table
    self._aim_table = aim_table
    self._key_map = key_map
    self._unique_key = unique_key
    self._mongodb = MongoDB()
    self._is_oracle = False
    self._export_count = 0
def __init__(self, tab_urls):
    super(Collector, self).__init__()
    self._lock = threading.RLock()
    self._db = MongoDB()
    self._thread_stop = False
    self._urls = []
    self._null_times = 0
    self._read_pos = -1
    self._write_pos = -1
    self._tab_urls = tab_urls
    self._depth = int(
        tools.get_conf_value('config.conf', "collector", "depth"))
    self._max_size = int(
        tools.get_conf_value('config.conf', "collector", "max_size"))
    self._interval = int(
        tools.get_conf_value('config.conf', "collector", "sleep_time"))
    self._allowed_null_times = int(
        tools.get_conf_value('config.conf', "collector",
                             'allowed_null_times'))
    self._url_count = int(
        tools.get_conf_value('config.conf', "collector", "url_count"))

    # On startup, reset tasks that were left in DOING back to TODO
    self._db.update(self._tab_urls, {'status': Constance.DOING},
                    {'status': Constance.TODO})

    self._finished_callback = None
def export_to_oracle(self,
                     source_table='',
                     aim_table='',
                     key_map='',
                     unique_key=None,
                     unique_key_mapping_source_key=None,
                     update_read_status=True,
                     condition={'read_status': 0},
                     datas=[],
                     callback='',
                     sync_to_es=False):
    if aim_table:
        if self._aim_table != aim_table:
            self._is_set_unique_key = False
        self._es = ES() if sync_to_es else ''
        self._mongodb = MongoDB() if source_table else ''
        self._source_table = source_table
        self._aim_table = aim_table
        self._key_map = key_map
        self._unique_key = unique_key
        self._export_count = 0
        self._update_count = 0
        self._unique_key_mapping_source_key = unique_key_mapping_source_key
        # When explicit datas are passed in there is no read_status to update
        self._update_read_status = update_read_status if not datas else False
        self._condition = condition
        self._datas = datas
        self._callback = callback
        self._sync_to_es = sync_to_es

    self._aim_db = OracleDB()
    self._is_oracle = True
    return self.__export()
def calc_words_dist():
    mongo = MongoDB(host='localhost', port=27017, db_name='mery')
    mongo.load_database()
    articles = mongo.load_collection('articles')
    word_distributions = {}
    categories = [u"ファッション", u"メイク・コスメ", u"ヘアスタイル", u"ネイル",
                  u"美容", u"グルメ", u"旅行・おでかけ", u"恋愛", u"ライフスタイル"]
    total_words_dist = WordDistribution()
    for category in categories[:]:
        word_distributions[category] = WordDistribution()
        category_articles = articles.find({'category': category})
        print "====== {}: {} ======".format(category.encode('utf-8'),
                                            category_articles.count())
        for article in category_articles:
            sp = SentenceParser(article["title"])
            sp.parse()
            nouns = sp.extract_nouns()
            word_distributions[category].update_dist(nouns)  # update per-category word frequencies
            total_words_dist.update_dist(nouns)  # update corpus-wide word frequencies
        word_distributions[category].calc_total_words_dist()

    dump_object(word_distributions, 'mery_category_word_dist.pkl')
    dump_object(total_words_dist, 'mery_total_word_dist.pkl')

    # Remove words whose frequency falls in the top 0.1%
    total_words_dist.calc_total_words_dist()
    top_n_percent_words = total_words_dist.extract_top_n_percent_words(n=0.1)
    print "=== removed words ==="
    for word in top_n_percent_words:
        print word.encode('utf-8')

    # Print the results
    for category in categories[:]:
        word_distributions[category].remove_words(top_n_percent_words)  # drop the high-frequency words
        print "==== {} =====".format(category.encode('utf-8'))
        for word, count in sorted(
                word_distributions[category].total_words_dist.items(),
                key=lambda x: x[1], reverse=True)[:20]:
            print word.encode('utf-8'), count
def __init__(self,
             source_table='',
             aim_table='',
             key_map='',
             unique_key=None,
             unique_key_mapping_source_key=None,
             update_read_status=True,
             condition={'read_status': 0},
             datas=[],
             callback=''):
    '''
    @summary: initialize
    ---------
    @param source_table: source table (MongoDB)
    @param aim_table: target table
    @param key_map: mapping from target-table keys to source-table keys, e.g.
        key_map = {
            'aim_key1': 'str_source_key2',            # target key = value of source key, type str
            'aim_key2': 'int_source_key3',            # target key = value of source key, type int
            'aim_key3': 'date_source_key4',           # target key = value of source key, type date
            'aim_key4': 'vint_id',                    # target key = literal value, type int
            'aim_key5': 'vstr_name',                  # target key = literal value, type str
            'aim_key6': 'vdate_name',                 # target key = literal value, type date
            'aim_key7': 'sint_select id from xxx',    # target key = result of the SQL query, type int
            'aim_key8': 'sstr_select name from xxx',  # target key = result of the SQL query, type str
            'aim_key9': 'clob_key8',                  # target key = value of source key, type clob
            'aim_key10': 'clob_key8'                  # target key = value of source key, type str
        }
    @param unique_key: unique key; the target database deduplicates on it
    @param unique_key_mapping_source_key: the source-table key that maps to the
        unique key in the target table; when not empty, existing rows in the
        target table are updated, e.g.
        unique_key_mapping_source_key = {
            'url': 'str_url'  # target key = value of source key, type str
        }
    @param condition: which records to export; defaults to read_status = 0
    @param datas: data to export, as [{...}, {...}] or {}, for loading a JSON
        array directly into the target table; when empty, MongoDB data is
        exported instead
    @param callback: called once per exported batch, as callback(execute_type, sql),
        where execute_type is the execution type (ExportData.INSERT,
        ExportData.UPDATE or ExportData.EXCEPTION) and sql is the executed statement
    ---------
    @result:
    '''
    super(ExportData, self).__init__()
    self._source_table = source_table
    self._aim_table = aim_table
    self._key_map = key_map
    self._unique_key = unique_key
    self._update_read_status = update_read_status
    self._condition = condition
    self._mongodb = MongoDB() if self._source_table else ''
    self._datas = datas
    self._is_oracle = False
    self._is_set_unique_key = False
    self._export_count = 0
    self._update_count = 0
    self._unique_key_mapping_source_key = unique_key_mapping_source_key
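A minimal usage sketch of the key_map convention documented above; the table and column names here are hypothetical, while the construct-then-export call pattern follows the ExportData usages later in this collection.

key_map = {
    'ID': 'int__id',                   # target ID = source _id, cast to int
    'TITLE': 'str_title',              # target TITLE = source title, as str
    'RECORD_TIME': 'date_record_time'  # target RECORD_TIME = source record_time, as date
}
export = ExportData(source_table='example_content_info',  # hypothetical MongoDB table
                    aim_table='tab_example_info',         # hypothetical Oracle table
                    key_map=key_map,
                    unique_key='url')  # deduplicate on url in the target table
export.export_to_oracle()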
def __init__(self,
             tab_urls,
             tab_site='',
             tab_content='',
             parser_count=None,
             parser_params={},
             begin_callback=None,
             end_callback=None,
             content_unique_key='url',
             delete_tab_urls=False):
    '''
    @summary:
    ---------
    @param tab_urls: name of the url table
    @param tab_site: name of the site table
    @param parser_count: number of parser threads; falls back to the config file when empty
    @param parser_params: parameters passed to the parsers
    @param begin_callback: called when the spider starts
    @param end_callback: called when the spider finishes
    ---------
    @result:
    '''
    super(Spider, self).__init__()
    self._tab_urls = tab_urls
    self._db = MongoDB()
    if delete_tab_urls:
        self._db.delete(tab_urls)

    self._db.set_unique_key(tab_urls, 'url')
    if tab_site:
        self._db.set_unique_key(tab_site, 'site_id')
    if tab_content:
        self._db.set_unique_key(tab_content, content_unique_key)

    # Set indexes to speed up queries
    self._db.set_ensure_index(tab_urls, 'depth')
    self._db.set_ensure_index(tab_urls, 'status')
    if tab_site:
        self._db.set_ensure_index(tab_site, 'read_status')
    if tab_content:
        self._db.set_ensure_index(tab_content, 'read_status')

    self._collector = Collector(tab_urls)
    self._parsers = []
    self._parser_params = parser_params
    self._begin_callback = begin_callback
    self._end_callabck = end_callback
    self._parser_count = int(
        tools.get_conf_value('config.conf', 'parser',
                             'parser_count')) if not parser_count else parser_count
    self._spider_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "spider_site_name").split(',')
    self._except_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "except_site_name").split(',')
def __init__(self,
             tab_urls,
             tab_site,
             tab_content,
             parser_count=None,
             search_keyword1=[],
             search_keyword2=[],
             search_keyword3=[],
             begin_callback=None,
             end_callback=None,
             content_unique_key=None):
    '''
    @summary:
    ---------
    @param tab_urls: name of the url table
    @param tab_site: name of the site table
    @param parser_count: number of parser threads; falls back to the config file when empty
    @param search_keyword1: search keywords (list), all of which must appear
    @param search_keyword2: search keywords (list), at least one of which must appear
    @param search_keyword3: search keywords (list), none of which may appear
    @param begin_callback: called when the spider starts
    @param end_callback: called when the spider finishes
    ---------
    @result:
    '''
    super(Spider, self).__init__()
    self._tab_urls = tab_urls
    self._db = MongoDB()
    self._db.set_unique_key(tab_urls, 'url')
    self._db.set_unique_key(tab_site, 'site_id')
    self._db.set_unique_key(
        tab_content, 'url' if not content_unique_key else content_unique_key)
    self._collector = Collector(tab_urls)
    self._parsers = []
    self._search_keyword1 = search_keyword1
    self._search_keyword2 = search_keyword2
    self._search_keyword3 = search_keyword3
    self._begin_callback = begin_callback
    self._end_callabck = end_callback
    self._parser_count = int(
        tools.get_conf_value('config.conf', 'parser',
                             'parser_count')) if not parser_count else parser_count
    self._spider_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "spider_site_name").split(',')
    self._except_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "except_site_name").split(',')
def __init__(self,
             tab_list,
             tab_unique_key_list,
             tab_ensure_index_list,
             parser_count=None,
             site_parsers=None,
             parser_params={},
             begin_callback=None,
             end_callback=None,
             delete_tab_urls=False):
    '''
    @summary:
    ---------
    @param tab_list: table names; the first entry is the url table
    @param tab_unique_key_list: unique key for each table, positionally matching tab_list
    @param tab_ensure_index_list: index fields for each table, positionally matching tab_list
    @param parser_count: number of parser threads; falls back to the config file when empty
    @param parser_params: parameters passed to the parsers
    @param begin_callback: called when the spider starts
    @param end_callback: called when the spider finishes
    ---------
    @result:
    '''
    super(Spider, self).__init__()
    self._db = MongoDB()
    self._tab_urls = tab_list[0]
    if delete_tab_urls:
        self._db.delete(self._tab_urls)

    self._site_parsers = site_parsers

    for tab_index in range(len(tab_list)):
        self._db.set_unique_key(tab_list[tab_index],
                                tab_unique_key_list[tab_index])
        # Set indexes to speed up queries
        for ensure_index in tab_ensure_index_list[tab_index]:
            self._db.set_ensure_index(tab_list[tab_index], ensure_index)

    self._collector = Collector(self._tab_urls, self._site_parsers)
    self._parsers = []
    self._parser_params = parser_params
    self._begin_callback = begin_callback
    self._end_callabck = end_callback
    self._parser_count = int(
        tools.get_conf_value('config.conf', 'parser',
                             'parser_count')) if not parser_count else parser_count
    self._spider_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "spider_site_name").split(',')
    self._except_site_name = tools.get_conf_value(
        'config.conf', "spider_site", "except_site_name").split(',')
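A minimal sketch of wiring up this tab_list-based Spider; the table names and the commented-out parser are hypothetical, and the unique-key and index lists are positional, one entry per table in tab_list, exactly as the loop above consumes them.

tab_list = ['demo_urls', 'demo_site_info', 'demo_content_info']
tab_unique_key_list = ['url', 'site_id', 'url']
tab_ensure_index_list = [['depth', 'status'], ['read_status'], ['read_status']]

spider = Spider(tab_list, tab_unique_key_list, tab_ensure_index_list,
                parser_count=1)
# spider.add_parser(demo_parser)  # hypothetical parser module
spider.start()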
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** wp begin **********')
        db.delete('WP_urls', {})

    def end_callback():
        # Mark the keyword task as done
        log.info('\n********** wp end **********')
        export_data.main()

    # Configure the spider
    spider = Spider(tab_urls='WP_urls',
                    tab_site='WP_site_info',
                    tab_content='WP_content_info',
                    parser_count=20,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    content_unique_key='title')

    # Register parsers
    spider.add_parser(dongmanla_parser)
    # spider.add_parser(zx_novel_parser)
    # spider.add_parser(jisu_cartoon_parser)
    # spider.add_parser(ximalaya_parser)

    spider.start()
def main():
    db = MongoDB()
    while True:

        def begin_callback():
            log.info('\n********** proxies begin **********')
            db.delete('proxies_urls')

        def end_callback():
            log.info('\n********** proxies end **********')
            # Mark the task as done
            # Export the data
            # export_data = ExportData(source_table = '', aim_table = '', key_map = '', unique_key = '')
            # export_data.export_to_oracle()

        # Configure the spider
        spider = Spider(tab_urls='proxies_urls',
                        tab_site='proxies_site_info',
                        tab_content='proxies_content_info',
                        parser_count=1,
                        begin_callback=begin_callback,
                        end_callback=end_callback,
                        parser_params={},
                        content_unique_key='ip')

        # Register parsers
        spider.add_parser(gaoni_parser)

        spider.start()
        # time.sleep(60)
        break
class KeyWordsController(object):
    _mongodb = MongoDB()

    def __init__(self):
        pass

    def get_keywords_by_sex(self):
        filter = WordsFilter()
        # Male tweets with stop words removed
        male_total = filter.get_filter_result(
            self._mongodb.get_all_tweets_with_sex_flag(1))
        # Female tweets with stop words removed
        female_total = filter.get_filter_result(
            self._mongodb.get_all_tweets_with_sex_flag(2))
        # Male keywords
        maleKeyWordsList = filter.get_keywords_with_tag(male_total, 200)
        # Female keywords
        femaleKeyWordsList = filter.get_keywords_with_tag(female_total, 200)

        # Raw strings keep the Windows path backslashes from being read as escapes
        with codecs.open(
                r"C:\Users\chenyx\Desktop\NewDataSet\male_keyword_count.txt",
                "w", "utf-8") as f:
            for item in maleKeyWordsList:
                string = "%s\r\n" % (item)
                f.writelines(string)
        with codecs.open(
                r"C:\Users\chenyx\Desktop\NewDataSet\_female_keyword_count.txt",
                "w", "utf-8") as f:
            for item in femaleKeyWordsList:
                string = "%s\r\n" % (item)
                f.writelines(string)
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** live_app begin **********')
        db.delete('LiveApp_urls', {})
        db.update('LiveApp_anchor_info', {}, {"live_view": 0})
        db.update('LiveApp_anchor_info', {}, {"watched_count": 0})
        db.update('LiveApp_anchor_info', {}, {'read_status': 0})

    def end_callback():
        # Mark the keyword task as done
        log.info('\n********** live_app end **********')
        export_data.main()

    # Configure the spider
    spider = Spider(tab_urls='LiveApp_urls',
                    tab_site='LiveApp_site_info',
                    tab_content='LiveApp_anchor_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    content_unique_key='room_id')

    # Register parsers
    spider.add_parser(inke_parser)
    spider.add_parser(huajiao_parser)
    spider.add_parser(momo_parser)

    spider.start()
def __init__(self):
    super().__init__()
    self.base_url = 'http://www.sosi55.com'
    self.rule = {
        'page_url': '/guochantaotu/list_22_%page.html',
        'page_rule': {
            "list": '.yuanma_downlist_box .pic a'
        },
        'post_rule': {
            'title': '.single h1'
        },
        'base_url': self.base_url,
    }
    self.charset = 'gbk'
    self.table = 'ii_sousi'
    self.cc = OpenCC('t2s')
    self.db = MongoDB(os.environ.get('MONGO'), 'sousi')
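The '%page' token in page_url above is a pagination placeholder; below is a minimal sketch of how such a template can be expanded, assuming the base class substitutes the page number when paginating (the helper is illustrative, not the framework's actual API).

def build_page_url(base_url, page_url_template, page):
    # Replace the '%page' placeholder with the concrete page number
    return base_url + page_url_template.replace('%page', str(page))

# build_page_url('http://www.sosi55.com', '/guochantaotu/list_22_%page.html', 3)
# -> 'http://www.sosi55.com/guochantaotu/list_22_3.html'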
def crawl_article(dicts):
    """
    :param dicts:
    """
    for article_dict in dicts:
        sess = requests.Session()
        headers = get_header()
        url = article_dict.get('url')
        print("Start crawling: %s" % url)
        res = sess.get(url, headers=headers)
        selector = etree.HTML(res.text)
        rich_media = selector.xpath(
            "//div[@class='rich_media_inner']/div[@id='page-content']/div[1]/div[2]")[0]
        author = selector.xpath(
            "//div[@id='meta_content']/span[@class='rich_media_meta rich_media_meta_text']")[0].xpath(
            "string(.)")
        __biz = url2dict(url).get('__biz', '')
        # Body text
        content = rich_media.xpath("string(.)")
        # Image URLs
        picture_urls = selector.xpath("//img/@data-src")
        # Video URLs
        video_urls = selector.xpath("//iframe[@class='video_iframe']/@data-src")
        json_info = get_article_info(url)
        if json_info is not None:
            like_num = json_info.get('data', {}).get('zannums', 0)
            read_num = json_info.get('data', {}).get('readnums', 0)
            mongodb = MongoDB()
            article_item = {'title': article_dict.get('title', ""),
                            'author': author,
                            'summary': article_dict.get('summary', ""),
                            'cover': article_dict.get('cover', ""),
                            'content': content,
                            'like_num': like_num,
                            'read_num': read_num,
                            'comment': "",
                            'url': url,
                            'receive_time': article_dict.get('receive_time', ""),
                            'account': article_dict.get('account', ""),
                            '__biz': __biz}
            mongodb.add("wechat_article", article_item)
            try:
                download_pictures(dict_info=article_item, picture_urls=picture_urls)
                _thread.start_new_thread(download_videos, (article_item, '', video_urls))
            except:
                print("Failed to download media content")
                sleep(60)
def download(title=None):
    if title is not None:
        zhuishu = ZhuiShuSpider()
        # babadushu = BaBaDuShuSpider()
        # liewen = LieWenSpider()
        mongodb = MongoDB('zhuishu')
        # Pass in the spider implementation for whichever site you want to
        # crawl; here the ZhuiShu instance is used
        novel = spider.Spider(zhuishu, mongodb)
        novel.search(title)
    else:
        print('Please enter the name of the novel to download')
def main():
    db = OracleDB()
    mongodb = MongoDB()

    sql = 'select t.KEYWORD, t.monitor_type from TAB_MVMS_SEARCH_INFO t where sysdate >= t.monitor_start_time and sysdate <= t.monitor_end_time and search_type = 702'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('No task; exiting')
        return

    parser_params = {'result_list': result_list}
    # parser_params = []
    # for i in result:
    #     parser_params.extend(str(i[0]).split(','))

    def begin_callback():
        log.info('\n********** WWA_weibo_user begin **********')
        mongodb.delete('WWA_weibo_user_urls')

    def end_callback():
        # Export the data
        key_map = {
            'id': 'int__id',
            'name': 'str_name',
            'sex': 'int_sex',
            'summary': 'str_summary',
            'fans_count': 'int_fans_count',
            'blog_verified': 'str_blog_verified',
            'is_verified': 'int_is_verified',
            'account_url': 'str_url',
            'follow_count': 'int_follow_count',
            'image_url': 'str_image_url',
            'monitor_status': 'vint_401',
            'SEARCH_TYPE': 'vint_702',
            'region': 'str_area'
        }
        export = ExportData('WWA_weibo_user_info', 'tab_mvms_weibo_info',
                            key_map, 'account_url')
        export.export_to_oracle()
        log.info('\n********** WWA_weibo_user end **********')

    # Configure the spider
    spider = Spider(tab_urls='WWA_weibo_user_urls',
                    tab_site='WWA_site_info',
                    tab_content='WWA_weibo_user_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register parsers
    spider.add_parser(weibo_user_parser)

    spider.start()
class ImplicateController(object):
    _mongodb = MongoDB()

    def __init__(self):
        pass

    def cal_implicate_degree(self):
        peopleList = self._mongodb.get_information_set_with_sex_flag(-1)
        collection = self._mongodb.getDB()["Implicate"]
        for person in peopleList:
            print "current " + person["_id"]
            # Initialize the counters
            total, higherThan50, lessThan50 = 0, 0, 0
            # Fetch the data
            tweetsList = self._mongodb.get_tweets_by_id(person["_id"])
            # Skip users that have no tweets
            if len(tweetsList) == 0:
                continue
            for tweet in tweetsList:
                degree = items.cal_implicit_degree(tweet["Content"])
                if degree > 50:
                    higherThan50 += 1
                else:
                    lessThan50 += 1
                total += degree
            avg = total / float(len(tweetsList))
            sex = ""
            try:
                if person["Gender"]:
                    sex = person["Gender"]
                else:
                    sex = "空"  # "空" (empty) is stored when no gender value is present
            except:
                print "no gender tag"
            data = {
                "_id": person["_id"],
                "implicit_degree": avg,
                "tweets_count": len(tweetsList),
                "high_50": higherThan50,
                "less_50": lessThan50,
                "sex": sex
            }
            collection.insert(data)
class UtMongoDB(unittest.TestCase):
    '''Unit Test driver for db.mongodb.MongoDB'''

    db = MongoDB('esm')

    def tearDown(self):
        "Delete seed data from testing database"
        try:
            connection = Connection(host="localhost", port=27017)
        except ConnectionFailure, e:
            sys.stderr.write("Could not connect to MongoDB: %s" % e)
            sys.exit(1)
        db_handler = connection["esm"]
        assert db_handler.connection == connection
        db_handler['justniffer_events'].drop()
        connection.end_request()
def main():
    db = MongoDB()

    def begin_callback():
        log.info('\n********** template begin **********')
        db.delete('op_urls', {})
        db.delete('op_content_info', {})

    def end_callback():
        log.info('\n********** template end **********')
        # Mark the task as done
        # Export the data
        # export_data = ExportData(source_table = '', aim_table = '', key_map = '', unique_key = '')
        # export_data.export_to_oracle()

    # Configure the spider
    spider = Spider(tab_urls='op_urls',
                    tab_site='op_site_info',
                    tab_content='op_content_info',
                    parser_count=20,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # Register parsers
    spider.add_parser(luzhou_parser)
    spider.add_parser(longmatan_parser)
    spider.add_parser(naxi_parser)
    spider.add_parser(luxian_parser)
    spider.add_parser(hejiang_parser)
    spider.add_parser(gulin_parser)
    spider.add_parser(luzhouzhiye_parser)
    spider.add_parser(sichuanhuagong_parser)
    spider.add_parser(luzhougaozhong_parser)
    spider.add_parser(xuyong_parser)
    spider.add_parser(jiangyang_parser)
    spider.add_parser(luzhoutianli_parser)
    spider.add_parser(sichuanluxian_parser)
    spider.add_parser(sichuan_police_parser)
    spider.add_parser(sichuanyikeda_parser)
    spider.add_parser(luzhoubaidu_parser)

    spider.start()
def main():
    db = MongoDB()
    oracle = OracleDB()

    def begin_callback():
        # db.update('WWA_app_urls', {'depth': 0}, {'status': 0})
        db.delete('WWA_search_app_urls')
        log.info('\n********** wwa begin **********')

    def end_callback():
        log.info('\n********** wwa end **********')
        export_data.main()

    result_list = oracle.find(
        'select keyword from TAB_MVMS_SEARCH_INFO where MONITOR_START_TIME <= sysdate AND MONITOR_END_TIME >= sysdate and search_type=703'
    )
    if not result_list:
        log.debug('No task; exiting')
        return

    keywords = []
    for result in result_list:
        keywords.extend(result[0].split(','))
    parser_params = {'keywords': keywords}

    # Configure the spider
    spider = Spider(tab_urls='WWA_search_app_urls',
                    tab_site='WWA_search_app_site_info',
                    tab_content='WWA_search_app_content_info',
                    content_unique_key='title',
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register parsers
    spider.add_parser(yingyongbao_parser)
    spider.add_parser(android_market_parser)
    spider.add_parser(baidu_mobile_assistant_parser)
    spider.add_parser(mobile360_assistant_parser)

    spider.start()
class RecipeRepository:
    '''Provides access to the recipes collection'''

    recipe_collection = MongoDB.get_collection('HelloFresh', 'Recipes')

    @classmethod
    async def create_recipe(cls, new_recipe: RecipeRequest) -> str:
        recipe_doc = new_recipe.dict(exclude={'_id'})
        result = await cls.recipe_collection.insert_one(recipe_doc)
        if not result.acknowledged:
            # TODO: define custom exception
            raise Exception('Database did not acknowledge')
        return str(result.inserted_id)

    @classmethod
    async def read_recipe_by_id(cls, recipe_id) -> RecipeDB:
        recipe_document = await cls.recipe_collection.find_one({"_id": recipe_id})
        if recipe_document is None:
            raise ValueError('Invalid Recipe ID')
        return RecipeDB.parse_obj(recipe_document)

    @classmethod
    async def update_recipe(cls, recipe_id: str, new_recipe: RecipeRequest) -> str:
        recipe_doc = new_recipe.dict(exclude={'_id', 'id'}, by_alias=True)
        result = await cls.recipe_collection.replace_one({"_id": ObjectId(recipe_id)}, recipe_doc)
        if not result.acknowledged:
            # TODO: define custom exception
            raise Exception('Database did not acknowledge')
        elif result.modified_count != 1:
            raise Exception('Document was not updated')
        return recipe_id

    @classmethod
    async def delete_recipe(cls, recipe_id: str):
        result = await cls.recipe_collection.delete_one({"_id": ObjectId(recipe_id)})
        if not result.acknowledged:
            # TODO: define custom exception
            raise Exception('Database did not acknowledge')
        elif result.deleted_count != 1:
            raise Exception('Document was not deleted')
        return
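A minimal usage sketch of the repository above, assuming MongoDB.get_collection returns an async (Motor-style) collection and that RecipeRequest is a pydantic model; the field names passed to it here are hypothetical.

import asyncio

async def demo():
    new_id = await RecipeRepository.create_recipe(
        RecipeRequest(name='Pancakes', prep_time_minutes=15))  # hypothetical fields
    # read_recipe_by_id matches on the raw _id, so pass the ObjectId form
    recipe = await RecipeRepository.read_recipe_by_id(ObjectId(new_id))
    await RecipeRepository.delete_recipe(new_id)

# asyncio.run(demo())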
class UtMongoQuery(unittest.TestCase):
    '''complex query test driver'''

    logger = Logger().getLogger("test.UtMongoQuery")
    db = MongoDB('esm')

    def setUp(self):
        # self.logger.debug('UtMongoQuery.setUp()')
        current_path = os.getcwd()
        self.runBash(current_path + '/import.sh')

    def tearDown(self):
        # self.logger.debug('UtMongoQuery.tearDown()')
        self.db.removeAll('events')

    def runBash(self, file_name):
        '''run a shell script.'''
        try:
            subprocess.call([file_name], shell=True)
        except OSError, e:
            # str(e) avoids a TypeError from concatenating str and exception
            self.logger.exception("bash script " + file_name +
                                  " execution failed: " + str(e))
def main():
    db = MongoDB()
    db.set_unique_key('WWA_app_vioation_content_info', 'url')
    db.set_ensure_index('WWA_app_vioation_content_info', 'read_status')

    def begin_callback():
        log.info('\n********** WWA_APP begin **********')
        db.delete('WWA_app_urls', {})

    def end_callback():
        export_data.main()
        log.info('\n********** WWA_APP end **********')

    # Configure the spider
    spider = Spider(tab_urls='WWA_app_urls',
                    tab_site='WWA_app_site_info',
                    tab_content='WWA_app_content_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params={})

    # Register parsers
    spider.add_parser(headline_parser)
    spider.add_parser(kuaibao_parser)

    spider.start()
def main():
    db = OracleDB()
    mongodb = MongoDB()

    sql = 'select t.ID, t.monitor_type from TAB_MVMS_WEIBO_INFO t where monitor_status = 402'
    result_list = db.find(sql, fetch_one=False)
    if not result_list:
        log.debug('No task; exiting')
        return

    parser_params = result_list
    # for i in result:
    #     parser_params.extend(str(i[0]).split(','))

    def begin_callback():
        log.info('\n********** WWA_weibo_info begin **********')
        mongodb.delete('WWA_weibo_info_urls')

    def end_callback():
        # Export the data
        key_map = {
            'id': 'int__id',
            'release_time': 'date_release_time',
            'come_from': 'str_come_from',
            'content': 'clob_content',
            'image_url': 'str_image_url',
            'video_url': 'str_video_url',
            'transpond_count': 'int_transpond_count',
            'praise_count': 'int_praise_count',
            'check_status': 'vint_301',
            'weibo_id': 'int_weibo_id',
            'article_url': 'str_url',
            'violate_status': 'int_violate_id',
            'sensitive_id': 'int_sensitive_id',
            'record_time': 'date_record_time',
            'SEXY_IMAGE_STATUS': 'str_sexy_image_status'
        }
        export = ExportData('WWA_weibo_info_info',
                            'tab_mvms_weibo_article_info',
                            key_map,
                            unique_key='ARTICLE_url',
                            condition={
                                'read_status': 0,
                                "image_pron_status": 2
                            })
        export.export_to_oracle()
        log.info('\n********** WWA_weibo_info end **********')

    # Configure the spider
    spider = Spider(tab_urls='WWA_weibo_info_urls',
                    tab_site='WWA_site_info',
                    tab_content='WWA_weibo_info_info',
                    parser_count=1,
                    begin_callback=begin_callback,
                    end_callback=end_callback,
                    parser_params=parser_params)

    # Register parsers
    spider.add_parser(weibo_info_parser)

    spider.start()
def begin_callback():
    log.info('\n********** WWA_wechat_article begin **********')
    db = MongoDB()
    db.delete('WWA_wechat_article_url', {})
class Spider(threading.Thread):
    def __init__(self,
                 tab_list,
                 tab_unique_key_list,
                 tab_ensure_index_list,
                 parser_count=None,
                 site_parsers=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None):
        '''
        @summary:
        ---------
        @param tab_list: table names; the first entry is the url table
        @param tab_unique_key_list: unique key for each table, positionally matching tab_list
        @param tab_ensure_index_list: index fields for each table, positionally matching tab_list
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params: parameters passed to the parsers
        @param begin_callback: called when the spider starts
        @param end_callback: called when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._tab_urls = tab_list[0]
        self._site_parsers = site_parsers
        self._db = MongoDB()
        for tab_index in range(len(tab_list)):
            self._db.set_unique_key(tab_list[tab_index],
                                    tab_unique_key_list[tab_index])
            # Set indexes to speed up queries
            for ensure_index in tab_ensure_index_list[tab_index]:
                self._db.set_ensure_index(tab_list[tab_index], ensure_index)

        self._collector = Collector(self._tab_urls, self._site_parsers)
        self._parsers = []
        self._parser_params = parser_params
        self._begin_callback = begin_callback
        self._end_callback = end_callback
        self._parser_count = int(
            tools.get_conf_value('config.conf', 'parser',
                                 'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            for except_site_name in self._except_site_name:
                if parser.NAME != except_site_name.strip():
                    self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callback:
                self._end_callback()
            return

        # Have each parser add its site info and root urls
        # print(self._parser_params)
        for parser in self._parsers:
            parser.add_site_info()
            parser.add_root_url(self._parser_params)
        print('Finished adding root urls')

        # Start the collector
        self._collector.add_finished_callback(self._end_callback)
        self._collector.start()

        # Start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)
            for parser in self._parsers:
                parser_control.add_parser(parser)
            parser_control.start()
            self._parser_count -= 1
class Collector(threading.Thread):
    def __init__(self, tab_urls):
        super(Collector, self).__init__()
        self._lock = threading.RLock()
        self._db = MongoDB()
        self._thread_stop = False
        self._urls = []
        self._null_times = 0
        self._read_pos = -1
        self._write_pos = -1
        self._tab_urls = tab_urls
        self._depth = int(
            tools.get_conf_value('config.conf', "collector", "depth"))
        self._max_size = int(
            tools.get_conf_value('config.conf', "collector", "max_size"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        # On startup, reset tasks that were left in DOING back to TODO
        self._db.update(self._tab_urls, {'status': Constance.DOING},
                        {'status': Constance.TODO})

        self._finished_callback = None

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    @tools.log_function_time
    def __input_data(self):
        log.debug('read_pos %d, write_pos %d buffer size %d' %
                  (self._read_pos, self._write_pos, self.get_max_read_size()))
        log.debug('buffer can write size = %d' % self.get_max_write_size())
        if self.get_max_write_size() == 0:
            log.debug("collector is full, size = %d" % self.get_max_read_size())
            return

        url_count = self._url_count if self._url_count <= self.get_max_write_size(
        ) else self.get_max_write_size()

        urls_list = []
        if self._depth:
            urls_list = self._db.find(self._tab_urls, {
                "status": Constance.TODO,
                "depth": {
                    "$lte": self._depth
                }
            }, limit=url_count)
        else:
            urls_list = self._db.find(self._tab_urls,
                                      {"status": Constance.TODO},
                                      limit=url_count)

        # Mark the fetched urls as DOING
        for url in urls_list:
            self._db.update(self._tab_urls, url, {'status': Constance.DOING})

        # Buffer the urls
        self.put_urls(urls_list)

        if self.is_all_have_done():
            self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # True when there are no urls left to do
    def is_all_have_done(self):
        if self.get_max_read_size() == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times:
                # Check whether the database still has urls being processed
                urls_doing = self._db.find(self._tab_urls,
                                           {'status': Constance.DOING})
                if urls_doing:
                    self._null_times = 0
                    return False
                else:
                    return True
        else:
            self._null_times = 0
            return False

    def get_max_write_size(self):
        size = 0
        if self._read_pos == self._write_pos:
            size = self._max_size
        elif self._read_pos < self._write_pos:
            size = self._max_size - (self._write_pos - self._read_pos)
        else:
            size = self._read_pos - self._write_pos
        # One slot is always left empty so read_pos == write_pos means "empty"
        return size - 1

    def get_max_read_size(self):
        return self._max_size - 1 - self.get_max_write_size()

    @tools.log_function_time
    def put_urls(self, urls_list):
        if urls_list == []:
            return

        # Append the urls to the _urls ring buffer
        url_count = len(urls_list)
        end_pos = url_count + self._write_pos + 1

        # If the write runs past the end of the buffer, wrap the overflow
        # around to the front
        overflow_end_pos = end_pos - self._max_size
        # End of the non-overflowing part
        in_pos = end_pos if end_pos <= self._max_size else self._max_size
        # Size of the non-overflowing part
        urls_list_cut_pos = in_pos - self._write_pos - 1

        self._lock.acquire()  # lock
        self._urls[self._write_pos + 1:in_pos] = urls_list[:urls_list_cut_pos]
        if overflow_end_pos > 0:
            self._urls[:overflow_end_pos] = urls_list[urls_list_cut_pos:]
        self._lock.release()

        self._write_pos += url_count
        # Taking the modulus of -1 would be a problem (-1 % 1000 == 999 makes
        # the writable size 0); returning early above when urls_list is empty
        # avoids it
        self._write_pos %= self._max_size

    @tools.log_function_time
    def get_urls(self, count):
        self._lock.acquire()  # lock
        urls = []
        count = count if count <= self.get_max_read_size(
        ) else self.get_max_read_size()
        end_pos = self._read_pos + count + 1
        if end_pos > self._max_size:
            urls.extend(self._urls[self._read_pos + 1:])
            urls.extend(self._urls[:end_pos % self._max_size])
        else:
            urls.extend(self._urls[self._read_pos + 1:end_pos])
        if urls:
            self._read_pos += len(urls)
            self._read_pos %= self._max_size
        self._lock.release()
        return urls
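A worked example of the ring-buffer arithmetic above (a sketch, not part of the class): one slot is always kept free so that read_pos == write_pos unambiguously means "empty" rather than "full".

# With max_size = 5:
#   read_pos = -1, write_pos = -1 (empty buffer)
#     get_max_write_size() = 5 - 1 = 4
#     get_max_read_size()  = 5 - 1 - 4 = 0
#   after put_urls of 3 urls: read_pos = -1, write_pos = 2
#     get_max_write_size() = 5 - (2 - (-1)) - 1 = 1
#     get_max_read_size()  = 5 - 1 - 1 = 3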
class Spider(threading.Thread):
    def __init__(self,
                 tab_urls,
                 tab_site,
                 tab_content,
                 parser_count=None,
                 parser_params={},
                 begin_callback=None,
                 end_callback=None,
                 content_unique_key='url'):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params: parameters passed to the parsers
        @param begin_callback: called when the spider starts
        @param end_callback: called when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._tab_urls = tab_urls
        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(tab_content, content_unique_key)

        # Set indexes to speed up queries
        self._db.set_ensure_index(tab_urls, 'depth')
        self._db.set_ensure_index(tab_urls, 'status')
        self._db.set_ensure_index(tab_site, 'read_status')
        self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls)
        self._parsers = []
        self._parser_params = parser_params
        self._begin_callback = begin_callback
        self._end_callback = end_callback
        self._parser_count = int(
            tools.get_conf_value('config.conf', 'parser',
                                 'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            for except_site_name in self._except_site_name:
                if parser.NAME != except_site_name.strip():
                    self._parsers.append(parser)
        else:
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callback:
                self._end_callback()
            return

        # Start the collector
        self._collector.add_finished_callback(self._end_callback)
        self._collector.start()

        # Have each parser add its site info and root urls in its own thread
        # print(self._parser_params)
        for parser in self._parsers:
            threading.Thread(target=parser.add_site_info).start()
            threading.Thread(target=parser.add_root_url,
                             args=(self._parser_params, )).start()

        # Start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)
            for parser in self._parsers:
                parser_control.add_parser(parser)
            parser_control.start()
            self._parser_count -= 1
import base.constance as Constance
import utils.tools as tools
from utils.log import log
from db.mongodb import MongoDB

db = MongoDB()


def remove_table(tab_list):
    for tab in tab_list:
        db.delete(tab)


def reset_table(tab_list):
    for tab in tab_list:
        db.update(tab, {'status': 3}, {'status': 0})


def add_url(table,
            site_id='',
            url='',
            depth=0,
            remark='',
            status=Constance.TODO,
            title='',
            origin='',
            domain='',
            retrieval_layer=0,
            image_url='',
            release_time=''):
    # Build the url record; each field mirrors one of the parameters above
    url_dict = {
        'site_id': site_id,
        'url': url,
        'depth': depth,
        'remark': remark,
        'status': status,
        'title': title,
        'origin': origin,
        'domain': domain,
        'retrieval_layer': retrieval_layer,
        'image_url': image_url,
        'release_time': release_time
    }
    db.add(table, url_dict)
def begin_callback():
    log.info('\n********** WWA_wechat_account begin **********')
    db = MongoDB()
    db.delete('WWA_wechat_account_url', {})