def add_instance():
    """Create an instance for the requested plan and persist it.

    Reads ``plan`` and ``name`` from the submitted form data.

    Returns:
        ``("plan is required", 400)`` when the form carries no non-empty
        ``plan`` value, otherwise ``("", 201)`` once the instance returned by
        the plan's manager has been handed to the storage.
    """
    # Validate up front so a missing plan yields an explicit message instead
    # of failing on the form lookup.
    plan = request.form.get('plan')
    if not plan:
        return "plan is required", 400
    instance = manager_by_plan_name(plan).add_instance(request.form['name'])
    # Kept as a function-local import, exactly as the original code does.
    from storage import MongoStorage
    storage = MongoStorage()
    storage.add_instance(instance)
    return "", 201
def remove_instance(name):
    """Remove the instance called *name* from its manager and from storage.

    Args:
        name: Name used to look the instance up in the storage.

    Returns:
        ``("", 200)`` after both removals have been performed.
    """
    from storage import MongoStorage
    store = MongoStorage()
    target = store.find_instance_by_name(name)
    manager = manager_by_instance(target)
    # Remove from the manager first, then drop the stored record.
    manager.remove_instance(target)
    store.remove_instance(target)
    return "", 200
def status(name):
    """Report the health of the instance called *name*.

    Args:
        name: Name used to look the instance up in the storage.

    Returns:
        ``(msg, 204)`` when the manager's ``is_ok()`` check succeeds and
        ``(msg, 500)`` otherwise, where ``msg`` is the message it returned.
    """
    from storage import MongoStorage
    found = MongoStorage().find_instance_by_name(name)
    healthy, message = manager_by_instance(found).is_ok()
    return message, (204 if healthy else 500)
def initExtendQuery(self):
    """Add one posting per stored synonym to this object.

    ``MongoStorage.handleSynonyms`` is given a callback that, for the item it
    receives, calls ``self.addPosting`` once per entry of ``item['synonyms']``
    using the item's ``_id``.
    """
    def handler(item):
        # Each synonym entry supplies a 'word' and a 'tf' value
        # (presumably the term frequency -- confirm against the data).
        for synonym in item['synonyms']:
            self.addPosting(item['_id'],
                            Posting(synonym['word'], synonym['tf']))

    storage = MongoStorage()
    try:
        storage.handleSynonyms(handler)
    finally:
        # Close the storage even when the handler or the query raises.
        storage.close()
def saveInvertedIndex(self):
    """
    @describe: Save the regular inverted posting list and then the nickname
               inverted posting list to disk, in that order.
    """
    storage = MongoStorage()
    try:
        self.__saveInvertedIndex(self.hash_vocabulary,
                                 storage.savePostingList)
        self.__saveInvertedIndex(self.hash_nickname,
                                 storage.saveNickNamePostingList)
    finally:
        # Close the storage even when one of the save passes raises.
        storage.close()
def add_instance():
    """Create an instance for the plan named in the form and store it.

    Returns:
        ``("", 201)`` after the new instance has been stored, or
        ``("plan is required", 400)`` when the form has no non-empty ``plan``.
    """
    requested_plan = request.form.get('plan')
    if requested_plan:
        manager = manager_by_plan_name(requested_plan)
        created = manager.add_instance(request.form['name'])
        from storage import MongoStorage
        MongoStorage().add_instance(created)
        return "", 201
    return "plan is required", 400
def __init__(self):
    """Set up the parser, the storage backend, the links and the work queue.

    The backend is ``MongoStorage`` when ``storage_type`` equals ``'mongo'``
    and ``FileStorage`` otherwise; the way links are loaded depends on which
    backend was created.
    """
    self.parser = AdvertisementParser()
    if storage_type == 'mongo':
        self.storage = MongoStorage('adv_data')
    else:
        self.storage = FileStorage('adv_data')

    uses_mongo = isinstance(self.storage, MongoStorage)
    if not uses_mongo:
        self.links = self.storage.load('lnk')
    else:
        # Only entries whose flag is still False are loaded.
        self.links = self.storage.load('adv_links', {'flag': False})
    self.queue = self.create_queue()
def get_port_by_host(self, host):
    """Return the next port to use on *host*.

    Args:
        host: Host passed to ``MongoStorage.find_instances_by_host``.

    Returns:
        One more than the highest ``port`` found among the endpoints of the
        instances on that host, or ``self.port_range_start`` when there are
        no instances or none of them has an endpoint.
    """
    storage = MongoStorage()
    instances = storage.find_instances_by_host(host)
    ports = [
        int(endpoint["port"])
        for instance in instances or ()
        for endpoint in instance.endpoints
    ]
    # Guard the empty case: max() raises ValueError on an empty sequence,
    # which happened when instances existed but had no endpoints.
    if not ports:
        return self.port_range_start
    return max(ports) + 1
def loadInvertedIndex(self):
    """
    @describe: Load the regular posting list and then the nickname inverted
               posting list from the records on disk, in that order.
    """
    storage = MongoStorage()
    try:
        self.__loadInvertedIndex(self.hash_vocabulary,
                                 storage.handlePostingLists)
        self.__loadInvertedIndex(self.hash_nickname,
                                 storage.handleNickNamePostingLists)
    finally:
        # Close the storage even when one of the load passes raises.
        storage.close()
def add_instance():
    """Create an instance for the submitted plan and save it to storage.

    Returns:
        ``("plan is required", 400)`` if the form lacks a non-empty ``plan``;
        ``("", 201)`` after the created instance has been stored.
    """
    plan_name = request.form.get("plan")
    if not plan_name:
        return "plan is required", 400

    plan_manager = manager_by_plan_name(plan_name)
    new_instance = plan_manager.add_instance(request.form["name"])

    from storage import MongoStorage
    MongoStorage().add_instance(new_instance)
    return "", 201
def get_book(site):
    """Return the platform data for *site*, crawling it when necessary.

    Today's stored record is reused when one exists and ``WEB_DEBUG`` is
    ``False``; otherwise the site is crawled and the result is saved.

    Args:
        site: Key looked up in ``crawl_container``.

    Returns:
        The platform data, or ``None`` when *site* is not in
        ``crawl_container``.
    """
    ms = MongoStorage()
    # Unknown sites have nothing to return; make that explicit rather than
    # reaching the end of the function with ``platform`` unbound.
    if site not in crawl_container:
        return None

    record = ms.get_today(site)
    # The exact ``== False`` comparison is kept on purpose so that values
    # such as None are treated as before.
    if record and WEB_DEBUG == False:  # noqa: E712
        return record['platform']

    platform = Crawl(site).run()
    ms.save(site, platform)
    return platform
def initInvertedIndex(self):
    """
    @describe: Prepare the inverted posting lists: load the previous ones
               from disk when the storage says so, otherwise regenerate them
               and save the result.
    """
    storage = MongoStorage()
    try:
        if storage.loadFromDisk():
            self.loadInvertedIndex()
        else:
            self.reGenerateInvertedIndex()
            self.saveInvertedIndex()
    finally:
        # Close the storage even when loading or regenerating raises.
        storage.close()
def unbind_unit(name):
    """Revoke the access of a unit host to the instance called *name*.

    The host is read from the ``unit-host`` form field.

    Returns:
        ``("unit-host is required", 400)`` when the field is missing or
        empty, otherwise ``("", 200)``.
    """
    host = request.form.get("unit-host")
    if host:
        store = MongoStorage()
        target = store.find_instance_by_name(name)
        handler = manager_by_instance(target)
        try:
            handler.revoke(target, host)
        except AttributeError:
            # Best effort: an AttributeError from the revoke call is ignored
            # (presumably for managers without a ``revoke`` method).
            pass
        return "", 200
    return "unit-host is required", 400
def unbind_unit(name):
    """Ask the instance's manager to revoke a unit host's access.

    Args:
        name: Name used to look the instance up in the storage.

    Returns:
        ``("", 200)`` once the revoke call was attempted, or
        ``("unit-host is required", 400)`` when the form has no non-empty
        ``unit-host`` value.
    """
    requested_host = request.form.get('unit-host')
    if not requested_host:
        return "unit-host is required", 400

    bound_instance = MongoStorage().find_instance_by_name(name)
    instance_manager = manager_by_instance(bound_instance)
    try:
        instance_manager.revoke(bound_instance, requested_host)
    except AttributeError:
        # Deliberately ignored; the response is 200 either way.
        pass
    return "", 200
def __tokenization(self):
    """
    @describe: Tokenize every stored weibo; each document's tokens are then
               normalized (which builds the inverted index) and its nicknames
               are added to the nickname posting list.
    """
    storage = MongoStorage()

    def handler(weibo):
        """Process a single weibo document."""
        tokens = jieba.cut_for_search(weibo['mt'])
        # Static score of the weibo: a*log(repost count) + b*log(comment count).
        # Both weights are 1 here; the +1 keeps log() defined for zero counts.
        static_grade = math.log(weibo['rc'] + 1) + math.log(weibo['cc'] + 1)
        # Normalize the generated tokens.
        self.__normalization(weibo['_id'], tokens, static_grade, weibo['ct'])
        # Add the nicknames to the nickname posting list. The document's own
        # 'nc' list is extended in place, exactly as before.
        nicknames = weibo['nc']
        nicknames.append(weibo['sn'])
        for nickname in nicknames:
            self.__addNickNamePosting(
                nickname,
                Posting(weibo['_id'], 1, static_grade, weibo['ct']))

    try:
        storage.handleWeibos(handler)
    finally:
        # Close the storage even when processing a weibo raises.
        storage.close()
class LinkCrawler(BaseCrawl):
    """Collect advertisement links for a list of cities and store them."""

    def __init__(self, cities=default_cities, link=base_link):
        """Remember the cities and link template and pick a storage backend.

        ``MongoStorage`` is used when ``storage_type`` equals ``'mongo'``,
        ``FileStorage`` otherwise.
        """
        self.cities = cities
        self.link = link
        storage_cls = MongoStorage if storage_type == 'mongo' else FileStorage
        self.storage = storage_cls('adv_links')

    @staticmethod
    def get_link(html):
        """Return every ``<a class="hdrlnk">`` element found in *html*."""
        parsed = BeautifulSoup(html, 'html.parser')
        return parsed.find_all('a', attrs={'class': 'hdrlnk'})

    def crawl_city(self, url_init):
        """Page through one city's listing and return the links found.

        The offset appended to *url_init* starts at 0 and grows by 120 per
        page; paging stops at the first page that yields no links.

        Returns:
            A list of ``{'url': ..., 'flag': False}`` dicts.
        """
        collected = []
        offset = 0
        while True:
            page = self.get_page(url_init + str(offset))
            anchors = self.get_link(page.text)
            if not anchors:
                return collected
            for anchor in anchors:
                collected.append({'url': anchor.get('href'), 'flag': False})
            offset += 120

    def store(self, data, *args):
        """Forward *data* and any extra arguments to the storage backend."""
        self.storage.store(data, *args)

    def start(self):
        """Crawl every configured city, report the counts and store the links."""
        adv_links = []
        for city in self.cities:
            links = self.crawl_city(self.link.format(city))
            print(f"{city} : {len(links)} advertisements")
            adv_links += links
        self.store(adv_links, 'adv_links')
def initExtendQuery(self):
    """Register a posting for every synonym held in the storage.

    The callback passed to ``MongoStorage.handleSynonyms`` walks
    ``item['synonyms']`` and calls ``self.addPosting`` with the item's
    ``_id`` and a ``Posting`` built from the synonym's ``word`` and ``tf``.
    """
    def handler(item):
        item_id = item['_id']
        for synonym in item['synonyms']:
            self.addPosting(item_id, Posting(synonym['word'], synonym['tf']))

    storage = MongoStorage()
    try:
        storage.handleSynonyms(handler)
    finally:
        # Always release the storage, including when the callback raises.
        storage.close()
class DataCrawler(BaseCrawl):
    """Download and parse the advertisements behind previously stored links."""

    def __init__(self):
        """Set up the parser, the storage backend, the links and the queue.

        ``MongoStorage`` is used when ``storage_type`` equals ``'mongo'``,
        ``FileStorage`` otherwise; the links are loaded accordingly.
        """
        self.parser = AdvertisementParser()
        self.storage = MongoStorage(
            'adv_data') if storage_type == 'mongo' else FileStorage('adv_data')
        if isinstance(self.storage, MongoStorage):
            # Only entries whose flag is still False are loaded.
            self.links = self.storage.load('adv_links', {'flag': False})
        else:
            self.links = self.storage.load('lnk')
        self.queue = self.create_queue()

    def store(self, data, *args):
        """Forward *data* and any extra arguments to the storage backend."""
        self.storage.store(data, *args)

    def create_queue(self):
        """Return a ``Queue`` pre-filled with every loaded link."""
        queue = Queue()
        for link in self.links:
            queue.put(link)
        return queue

    def crawl(self):
        """Worker loop: process links until the queue is drained.

        The queue is filled completely before any worker starts, so an empty
        queue means there is no work left and the worker returns. A failure
        while handling one link is reported and does not stop the worker; on
        the Mongo backend such a link is not passed to ``update_flag``.
        """
        # Function-local import: only this method needs the exception type.
        from queue import Empty

        while True:
            # A blocking get() would hang forever once another worker took
            # the last item, so never wait on an empty queue.
            try:
                link = self.queue.get_nowait()
            except Empty:
                return
            try:
                response = self.get_page(link['url'])
                data = self.parser.parse(response.text)
                print('data received')
                download_image.delay(data)
                self.store(data, data.get('post_id', 'no id!'))
                if isinstance(self.storage, MongoStorage):
                    self.storage.update_flag(link)
            except Exception as exc:
                # Worker boundary: report and move on to the next link.
                print('failed to process {!r}: {!r}'.format(link, exc))
            finally:
                # Always balance the get(); otherwise queue.join() in start()
                # would never return after a failure.
                self.queue.task_done()

    def start(self):
        """Run ten worker threads and wait until every link was handled."""
        for _ in range(10):
            thread = Thread(target=self.crawl)
            thread.start()
        self.queue.join()
def dc():
    """Worker loop: fetch crawl jobs over a socket and crawl the given users.

    Written for Python 2 (note the ``except Exception, e`` syntax). The loop
    runs while the module-level ``give_ups`` counter is positive; each pass
    takes a token from ``tokens``, reads one JSON job from a fresh socket and
    runs a ``UserCrawler`` for it. ``stop_heartbeat()`` is always called when
    the function exits.
    """
    def run_callbacks(callbacks):
        # Invoke every callable of the given iterable, in order.
        for callback in callbacks:
            callback()
    global give_ups
    try:
        create = create_cookie_file()
        # The cookie file is handed to the fetcher only when
        # create_cookie_file() returned a falsy value; otherwise a login is
        # performed below with the cookie file name.
        fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
        if create:
            fetcher.login(cookie_filename=cookie_file)
        while give_ups > 0:
            n = 0
            # Wait for a token, sleeping one second longer on every attempt;
            # leave the function if give_ups is exhausted in the meantime.
            while len(tokens) == 0:
                if give_ups > 0:
                    n += 1
                    time.sleep(n);
                else:
                    return
            token = tokens.pop()
            # cb is called on every path that finishes a job without starting
            # the crawler (presumably it hands the token back -- confirm
            # against callback()).
            cb = callback(token)
            soc = create_socket()
            try:
                data = json.loads(soc.recv(buf_size))
                if data == None:
                    # Null payload: pause, call cb and try again.
                    time.sleep(15)
                    cb()
                    continue
                elif len(data) == 0:
                    # Empty payload: use up one give-up.
                    # NOTE(review): cb is not called on this path -- confirm
                    # that this is intended.
                    give_ups -= 1
                    continue
                user = data['user']
                is_uid = data['is_uid']
                crawled = data.get('crawled', False)
                follow = data.get('follow', None)
                # monitor callback: the callable returned for the user is
                # invoked right away; the variant built with True is kept for
                # the callback tuples below.
                register_heartbeat(user)()
                register_rm_cb = register_heartbeat(user, True)
                # callbacks run on success and on error respectively
                success_callbacks = (register_rm_cb, reset_error_callback)
                error_callbacks = (error_callback, register_rm_cb)
                try:
                    crawler = UserCrawler(user, is_uid=is_uid, fetcher=fetcher,
                                          fetch_fans=follow is None,
                                          callbacks=cb,
                                          success_callbacks=success_callbacks,
                                          error_callbacks=error_callbacks)
                    # the user does not exist (or the uid is 'attention'):
                    # finish the job without crawling
                    if crawler.user_not_exist or crawler.uid == 'attention':
                        cb()
                        run_callbacks(success_callbacks)
                        continue
                    uid = crawler.uid
                    storage = MongoStorage(uid, follow, user=user)
                    if crawled or storage.crawled:
                        # Already crawled: finish the job and release the
                        # storage.
                        cb()
                        run_callbacks(success_callbacks)
                        storage.close()
                        continue
                    else:
                        crawler.set_storage(storage)
                        crawler.start()
                except Exception, e:
                    # Any failure: call cb, run the error callbacks and log
                    # the exception; the loop then continues.
                    cb()
                    run_callbacks(error_callbacks)
                    # raise e
                    logger.exception(e)
            finally:
                # The socket is closed on every path, including `continue`.
                soc.close()
    finally:
        # When the loop is over, stop the heartbeat
        stop_heartbeat()
def bind_app(name):
    """Bind to the instance called *name* and return the result as JSON.

    Args:
        name: Name used to look the instance up in the storage.

    Returns:
        ``(body, 201)`` where ``body`` is the JSON-encoded value returned by
        the manager's ``bind`` call.
    """
    found = MongoStorage().find_instance_by_name(name)
    manager = manager_by_instance(found)
    payload = json.dumps(manager.bind(found))
    return payload, 201
def initAutoComplate(self):
    """Pass ``self.addNewQuery`` to the storage's hot-query handler.

    ``MongoStorage.handleHotQuerys`` receives the callback (presumably it is
    invoked once per stored hot query -- confirm against the storage class).
    """
    storage = MongoStorage()
    try:
        storage.handleHotQuerys(self.addNewQuery)
    finally:
        # Close the storage even when the callback or the query raises.
        storage.close()
def bind(name):
    """Call the manager's ``bind`` for the instance called *name*.

    Returns:
        A ``(json_body, 201)`` tuple; ``json_body`` is the JSON dump of the
        bind result.
    """
    store = MongoStorage()
    target = store.find_instance_by_name(name)
    bind_result = manager_by_instance(target).bind(target)
    body = json.dumps(bind_result)
    return body, 201
def __set_storage():
    """Create the storage backend selected by ``STORAGE_TYPE``.

    Returns:
        A ``MongoStorage`` when ``STORAGE_TYPE`` equals ``'mongo'``, a
        ``FileStorage`` otherwise.
    """
    backend = MongoStorage if STORAGE_TYPE == 'mongo' else FileStorage
    return backend()
def __init__(self, cities=default_cities, link=base_link):
    """Remember the cities and link template and create the storage backend.

    Args:
        cities: Cities to work on; defaults to ``default_cities``.
        link: Link template; defaults to ``base_link``.
    """
    self.cities = cities
    self.link = link
    # 'mongo' selects MongoStorage; any other storage_type uses FileStorage.
    if storage_type == 'mongo':
        self.storage = MongoStorage('adv_links')
    else:
        self.storage = FileStorage('adv_links')