Ejemplo n.º 1
0
def add_instance():
    """Create an instance for the submitted plan and persist it.

    Reads ``plan`` and ``name`` from ``request.form``, asks the object
    returned by ``manager_by_plan_name`` to add the instance, and hands the
    result to ``MongoStorage.add_instance``.

    Returns:
        An empty body together with status code 201.
    """
    manager = manager_by_plan_name(request.form['plan'])
    created = manager.add_instance(request.form['name'])
    # The storage module is only imported once the instance has been created.
    from storage import MongoStorage
    MongoStorage().add_instance(created)
    return "", 201
Ejemplo n.º 2
0
def remove_instance(name):
    """Remove the instance called ``name`` from its manager and from storage.

    Args:
        name: Name used to look the instance up in ``MongoStorage``.

    Returns:
        An empty body together with status code 200.
    """
    from storage import MongoStorage
    store = MongoStorage()
    found = store.find_instance_by_name(name)
    # The manager is asked first; the stored record is dropped afterwards.
    manager = manager_by_instance(found)
    manager.remove_instance(found)
    store.remove_instance(found)
    return "", 200
Ejemplo n.º 3
0
def remove_instance(name):
    """Delete a named instance.

    The instance is fetched from ``MongoStorage`` by ``name``, removed through
    the object returned by ``manager_by_instance``, and then removed from the
    storage itself.

    Args:
        name: Name of the instance to delete.

    Returns:
        ``("", 200)``.
    """
    from storage import MongoStorage
    db = MongoStorage()
    target = db.find_instance_by_name(name)
    manager_by_instance(target).remove_instance(target)
    db.remove_instance(target)
    response = ("", 200)
    return response
Ejemplo n.º 4
0
def add_instance():
    """Add a new instance using the ``plan`` and ``name`` form fields.

    The instance is produced by ``manager_by_plan_name(plan).add_instance``
    and then saved with ``MongoStorage.add_instance``.

    Returns:
        ``("", 201)``.
    """
    plan_manager = manager_by_plan_name(request.form['plan'])
    new_instance = plan_manager.add_instance(request.form['name'])
    from storage import MongoStorage
    backend = MongoStorage()
    backend.add_instance(new_instance)
    reply = ("", 201)
    return reply
Ejemplo n.º 5
0
def status(name):
    """Report the result of the manager's ``is_ok()`` check for an instance.

    Args:
        name: Name used to look the instance up in ``MongoStorage``.

    Returns:
        ``(msg, 204)`` when ``is_ok()`` reports a truthy first value,
        otherwise ``(msg, 500)``; ``msg`` is the second value it returns.
    """
    from storage import MongoStorage
    found = MongoStorage().find_instance_by_name(name)
    passed, message = manager_by_instance(found).is_ok()
    return message, (204 if passed else 500)
Ejemplo n.º 6
0
def status(name):
    """Check an instance through its manager and map the result to a status.

    Args:
        name: Name of the instance to look up in ``MongoStorage``.

    Returns:
        The message returned by ``is_ok()`` paired with 204 when the check
        passed, or with 500 when it did not.
    """
    from storage import MongoStorage
    db = MongoStorage()
    target = db.find_instance_by_name(name)
    manager = manager_by_instance(target)
    check_passed, text = manager.is_ok()
    if not check_passed:
        return text, 500
    return text, 204
Ejemplo n.º 7
0
 def initExtendQuery(self):
     """Feed every synonym entry from storage into ``self.addPosting``.

     ``MongoStorage.handleSynonyms`` is given a handler that, for each item
     it receives, adds one ``Posting(word, tf)`` per entry of
     ``item['synonyms']`` under the key ``item['_id']``.

     The storage is closed in a ``finally`` clause so it is released even
     when the handler or the storage call raises.
     """
     def handler(item):
         # One posting per synonym, all filed under the item's ``_id``.
         for synonym in item['synonyms']:
             self.addPosting(item['_id'], Posting(synonym['word'], synonym['tf']))

     storage = MongoStorage()
     try:
         storage.handleSynonyms(handler)
     finally:
         storage.close()
Ejemplo n.º 8
0
 def saveInvertedIndex(self):
     """
         @describe: Save the regular posting lists and then the nickname
                    posting lists to persistent storage, in that order.
                    The storage is always closed, even if a save raises.
     """
     storage = MongoStorage()
     try:
         self.__saveInvertedIndex(self.hash_vocabulary, storage.savePostingList)
         self.__saveInvertedIndex(self.hash_nickname,
                                  storage.saveNickNamePostingList)
     finally:
         # Release the storage even when one of the saves fails.
         storage.close()
Ejemplo n.º 9
0
def add_instance():
    """Create and store an instance, requiring the ``plan`` form field.

    Returns:
        ``("plan is required", 400)`` when ``plan`` is missing or empty;
        otherwise ``("", 201)`` after the instance has been created through
        ``manager_by_plan_name`` and saved with ``MongoStorage``.
    """
    plan_name = request.form.get('plan')
    if plan_name:
        manager = manager_by_plan_name(plan_name)
        created = manager.add_instance(request.form['name'])
        from storage import MongoStorage
        MongoStorage().add_instance(created)
        return "", 201
    return "plan is required", 400
Ejemplo n.º 10
0
 def __init__(self):
     """Set up the parser, the storage backend, the links and the queue.

     ``storage_type == 'mongo'`` selects ``MongoStorage('adv_data')``;
     anything else selects ``FileStorage('adv_data')``. Links are loaded
     from ``'adv_links'`` filtered by ``{'flag': False}`` for the Mongo
     backend and from ``'lnk'`` otherwise.
     """
     self.parser = AdvertisementParser()
     if storage_type == 'mongo':
         self.storage = MongoStorage('adv_data')
     else:
         self.storage = FileStorage('adv_data')
     uses_mongo = isinstance(self.storage, MongoStorage)
     self.links = (self.storage.load('adv_links', {'flag': False})
                   if uses_mongo else self.storage.load('lnk'))
     self.queue = self.create_queue()
Ejemplo n.º 11
0
 def get_port_by_host(self, host):
     """Return the next port number to use on ``host``.

     Collects the ``port`` of every endpoint of every instance that
     ``MongoStorage.find_instances_by_host`` returns for ``host``.

     Args:
         host: Host whose instances are inspected.

     Returns:
         The highest endpoint port found plus one, or
         ``self.port_range_start`` when there are no instances or none of
         them has any endpoint.
     """
     storage = MongoStorage()
     instances = storage.find_instances_by_host(host)
     ports = [int(endpoint["port"])
              for instance in instances or ()
              for endpoint in instance.endpoints]
     # max() raises ValueError on an empty sequence, so instances without
     # endpoints must fall back to the start of the range as well.
     if not ports:
         return self.port_range_start
     return max(ports) + 1
Ejemplo n.º 12
0
 def loadInvertedIndex(self):
     """
         @describe: Load the regular posting lists and then the nickname
                    posting lists from persistent storage, in that order.
                    The storage is always closed, even if a load raises.
     """
     storage = MongoStorage()
     try:
         self.__loadInvertedIndex(self.hash_vocabulary,
                                  storage.handlePostingLists)
         self.__loadInvertedIndex(self.hash_nickname,
                                  storage.handleNickNamePostingLists)
     finally:
         # Release the storage even when one of the loads fails.
         storage.close()
Ejemplo n.º 13
0
def add_instance():
    """Validate the ``plan`` form field, then create and store an instance.

    Returns:
        ``("plan is required", 400)`` if ``plan`` is absent or empty,
        ``("", 201)`` once the new instance has been saved.
    """
    requested_plan = request.form.get("plan")
    if not requested_plan:
        return "plan is required", 400

    plan_manager = manager_by_plan_name(requested_plan)
    new_instance = plan_manager.add_instance(request.form["name"])

    from storage import MongoStorage

    backend = MongoStorage()
    backend.add_instance(new_instance)
    return "", 201
Ejemplo n.º 14
0
def get_book(site):
    """Return the platform data for ``site``, crawling it when needed.

    Args:
        site: Key checked against ``crawl_container``.

    Returns:
        ``None`` when ``site`` is not in ``crawl_container``. Otherwise the
        ``'platform'`` value of today's stored record when one exists and
        ``WEB_DEBUG == False``; failing that, the result of a fresh
        ``Crawl(site).run()``, which is also saved to storage.
    """
    store = MongoStorage()
    if site not in crawl_container:
        return None
    cached = store.get_today(site)
    if cached and WEB_DEBUG == False:
        return cached['platform']
    # No usable record (or debugging is on): crawl again and save the result.
    fresh = Crawl(site).run()
    store.save(site, fresh)
    return fresh
Ejemplo n.º 15
0
def get_book(site):
    """Fetch platform data for ``site`` from storage or by crawling.

    Args:
        site: Key that must be present in ``crawl_container`` for anything
            to be returned.

    Returns:
        The platform data, or ``None`` for a site outside
        ``crawl_container``.
    """
    ms = MongoStorage()
    result = None
    if site in crawl_container:
        record = ms.get_today(site)
        if not record or WEB_DEBUG != False:
            # Nothing stored for today, or debug mode forces a new crawl.
            crawler = Crawl(site)
            result = crawler.run()
            ms.save(site, result)
        else:
            result = record['platform']
    return result
Ejemplo n.º 16
0
 def initInvertedIndex(self):
     """
         @describe: Prepare the inverted index. When storage.loadFromDisk()
                    is truthy the previously saved index is loaded;
                    otherwise it is regenerated and then saved.
                    The storage is always closed, even if a step raises.
     """
     storage = MongoStorage()
     try:
         if storage.loadFromDisk():
             self.loadInvertedIndex()
         else:
             self.reGenerateInvertedIndex()
             self.saveInvertedIndex()
     finally:
         # Release the storage even when loading or regenerating fails.
         storage.close()
Ejemplo n.º 17
0
def unbind_unit(name):
    """Revoke a unit host's access to the instance called ``name``.

    Args:
        name: Name used to look the instance up in ``MongoStorage``.

    Returns:
        ``("unit-host is required", 400)`` when the ``unit-host`` form field
        is missing or empty, otherwise ``("", 200)``.
    """
    unit_host = request.form.get("unit-host")
    if unit_host:
        instance = MongoStorage().find_instance_by_name(name)
        # The manager lookup stays outside the try so its errors propagate.
        manager = manager_by_instance(instance)
        try:
            manager.revoke(instance, unit_host)
        except AttributeError:
            # An AttributeError from the revoke call is deliberately ignored.
            pass
        return "", 200
    return "unit-host is required", 400
Ejemplo n.º 18
0
def unbind_unit(name):
    """Ask the instance's manager to revoke access for the given unit host.

    Args:
        name: Name of the instance, resolved through ``MongoStorage``.

    Returns:
        A 400 response with ``"unit-host is required"`` if the form lacks a
        non-empty ``unit-host``; an empty 200 response otherwise.
    """
    host = request.form.get('unit-host')
    if not host:
        return "unit-host is required", 400

    db = MongoStorage()
    target = db.find_instance_by_name(name)
    target_manager = manager_by_instance(target)
    try:
        target_manager.revoke(target, host)
    except AttributeError:
        # Ignored on purpose; the response is 200 either way.
        pass
    reply = ("", 200)
    return reply
Ejemplo n.º 19
0
    def __tokenization(self):
        """
            @describe: Tokenization. Each document is tokenized, then its
                       tokens are normalized and added to the inverted
                       index. The storage is always closed, even if
                       handling a document raises.
        """
        storage = MongoStorage()

        def handler(weibo):
            """ Process a single weibo document. """
            tokens = jieba.cut_for_search(weibo['mt'])
            # Static score of the weibo: a*log(reposts) + b*log(comments).
            # Both counts are incremented by one before taking the log.
            static_grade = math.log(weibo['rc'] + 1) + math.log(weibo['cc'] +
                                                                1)
            # Normalize the generated tokens.
            self.__normalization(weibo['_id'], tokens, static_grade,
                                 weibo['ct'])
            # Add every nickname to the nickname posting list. Work on a
            # copy so the document's own 'nc' list is not modified.
            nicknames = list(weibo['nc'])
            nicknames.append(weibo['sn'])
            for nickname in nicknames:
                self.__addNickNamePosting(
                    nickname,
                    Posting(weibo['_id'], 1, static_grade, weibo['ct']))

        try:
            storage.handleWeibos(handler)
        finally:
            # Release the storage even when a document fails to process.
            storage.close()
Ejemplo n.º 20
0
class LinkCrawler(BaseCrawl):
    """Collects advertisement links city by city and stores them."""

    def __init__(self, cities=default_cities, link=base_link):
        """Remember the cities and link template, and pick the storage.

        ``storage_type == 'mongo'`` selects ``MongoStorage('adv_links')``;
        any other value selects ``FileStorage('adv_links')``.
        """
        self.cities = cities
        self.link = link
        if storage_type == 'mongo':
            self.storage = MongoStorage('adv_links')
        else:
            self.storage = FileStorage('adv_links')

    @staticmethod
    def get_link(html):
        """Return every ``<a class="hdrlnk">`` element found in ``html``."""
        parsed = BeautifulSoup(html, 'html.parser')
        return parsed.find_all('a', attrs={'class': 'hdrlnk'})

    def crawl_city(self, url_init):
        """Walk the result pages of one city and collect their links.

        Pages are requested at ``url_init`` followed by an offset that starts
        at 0 and grows by 120 per page; the walk ends at the first page that
        yields no links.

        Returns:
            A list of ``{'url': ..., 'flag': False}`` dicts.
        """
        collected = []
        offset = 0
        while True:
            page = self.get_page(url_init + str(offset))
            anchors = self.get_link(page.text)
            if not anchors:
                return collected
            for anchor in anchors:
                collected.append({'url': anchor.get('href'), 'flag': False})
            offset += 120

    def store(self, data, *args):
        """Forward ``data`` and any extra arguments to the storage backend."""
        self.storage.store(data, *args)

    def start(self):
        """Crawl every configured city and store all links in one call."""
        gathered = []
        for city in self.cities:
            city_links = self.crawl_city(self.link.format(city))
            print(f"{city} : {len(city_links)} advertisements")
            gathered += city_links
        self.store(gathered, 'adv_links')
Ejemplo n.º 21
0
    def initExtendQuery(self):
        """Register a posting for every synonym held in storage.

        The handler passed to ``MongoStorage.handleSynonyms`` adds, for each
        item, one ``Posting(word, tf)`` per entry of ``item['synonyms']``
        under ``item['_id']``.

        The storage is closed in a ``finally`` clause so that a failure
        while handling an item does not leave it open.
        """
        def handler(item):
            item_id = item['_id']
            for synonym in item['synonyms']:
                self.addPosting(item_id,
                                Posting(synonym['word'], synonym['tf']))

        storage = MongoStorage()
        try:
            storage.handleSynonyms(handler)
        finally:
            storage.close()
Ejemplo n.º 22
0
class DataCrawler(BaseCrawl):
    """Downloads and parses the advertisements behind the stored links."""

    def __init__(self):
        """Set up the parser, the storage backend, the links and the queue.

        ``storage_type == 'mongo'`` selects ``MongoStorage('adv_data')``;
        anything else selects ``FileStorage('adv_data')``. Links are loaded
        from ``'adv_links'`` filtered by ``{'flag': False}`` for the Mongo
        backend and from ``'lnk'`` otherwise.
        """
        self.parser = AdvertisementParser()
        self.storage = MongoStorage(
            'adv_data') if storage_type == 'mongo' else FileStorage('adv_data')
        if isinstance(self.storage, MongoStorage):
            self.links = self.storage.load('adv_links', {'flag': False})
        else:
            self.links = self.storage.load('lnk')
        self.queue = self.create_queue()

    def store(self, data, *args):
        """Forward ``data`` and any extra arguments to the storage backend."""
        self.storage.store(data, *args)

    def create_queue(self):
        """Return a new queue holding every link in ``self.links``."""
        queue = Queue()
        for link in self.links:
            queue.put(link)
        return queue

    def crawl(self):
        """Worker loop: process links until the queue is exhausted.

        Each link is fetched, parsed, passed to ``download_image.delay`` and
        stored; with the Mongo backend the link's flag is updated afterwards.

        The link is taken with ``get_nowait()`` so a worker that finds the
        queue empty exits instead of blocking forever, which happened when
        another thread took the last item first or when there were fewer
        links than workers. ``task_done()`` runs in a ``finally`` clause so
        that a failing link cannot leave ``queue.join()`` waiting forever;
        the exception itself still propagates.
        """
        from queue import Empty

        while True:
            try:
                link = self.queue.get_nowait()
            except Empty:
                break

            try:
                response = self.get_page(link['url'])
                data = self.parser.parse(response.text)
                print('data received')
                download_image.delay(data)

                self.store(data, data.get('post_id', 'no id!'))
                if isinstance(self.storage, MongoStorage):
                    self.storage.update_flag(link)
            finally:
                self.queue.task_done()

    def start(self):
        """Start ten worker threads and wait until every link is processed."""
        for _ in range(10):
            thread = Thread(target=self.crawl)
            thread.start()
        self.queue.join()
Ejemplo n.º 23
0
def dc():
    """Worker loop that crawls users received over a socket.

    Logs in through ``CnFetcher`` (creating the cookie file on first use),
    then, while the module-level ``give_ups`` counter is positive, pops a
    token, reads one JSON message from a fresh socket and crawls the user it
    names with ``UserCrawler``. ``stop_heartbeat()`` is always called when
    the function exits.

    NOTE(review): ``except Exception, e`` below is Python 2 syntax.
    """
    def run_callbacks(callbacks):
        # The loop variable shadows the module-level ``callback`` only
        # inside this helper.
        for callback in callbacks:
            callback()
    
    global give_ups
    
    try:
        create = create_cookie_file()
        # The cookie file is passed to the fetcher only when it was not
        # just created; otherwise an explicit login follows.
        fetcher = CnFetcher(account, pwd, cookie_file if not create else None)
        if create:
            fetcher.login(cookie_filename=cookie_file)
        while give_ups > 0:
            n = 0
            # Wait for a token, sleeping one second longer on each retry;
            # give up entirely once give_ups is no longer positive.
            while len(tokens) == 0:
                if give_ups > 0:
                    n += 1
                    time.sleep(n);
                else:
                    return
            
            token = tokens.pop()
            # cb is invoked on every path below that does not start a crawl,
            # except the empty-payload path.
            # presumably it hands the token back; verify against callback().
            cb = callback(token)
            
            soc = create_socket()
            try:
                data = json.loads(soc.recv(buf_size))
                if data == None:
                    # JSON null payload: wait 15 seconds, call cb, try again.
                    time.sleep(15)
                    cb()
                    continue
                elif len(data) == 0:
                    # Empty payload: count one give-up (cb is not called).
                    give_ups -= 1
                    continue
                
                user = data['user']
                is_uid = data['is_uid']
                crawled = data.get('crawled', False)
                follow = data.get('follow', None)
                
                # monitor callback: the callable returned for this user is
                # invoked immediately; a second one, created with True as
                # the extra argument, is kept for the callback tuples below.
                register_heartbeat(user)()
                register_rm_cb = register_heartbeat(user, True)
                
                # success and error callbacks (note the differing order of
                # register_rm_cb in the two tuples)
                success_callbacks = (register_rm_cb, reset_error_callback)
                error_callbacks = (error_callback, register_rm_cb)
                
                try:
                    # fetch_fans is enabled only when the message carried no
                    # 'follow' value.
                    crawler = UserCrawler(user, is_uid=is_uid, fetcher=fetcher, 
                                          fetch_fans=follow is None, 
                                          callbacks=cb, 
                                          success_callbacks=success_callbacks,
                                          error_callbacks=error_callbacks)
                    # the user does not exist (or the uid is 'attention'):
                    # treat as done and run the success callbacks
                    if crawler.user_not_exist or crawler.uid == 'attention':
                        cb()
                        run_callbacks(success_callbacks)
                        continue
                    
                    uid = crawler.uid
                    storage = MongoStorage(uid, follow, user=user)
                    
                    # Already crawled according to the message or to the
                    # storage: finish without crawling and close the storage.
                    if crawled or storage.crawled: 
                        cb()
                        run_callbacks(success_callbacks)
                        storage.close()
                        continue
                    else:
                        # The storage is handed to the crawler and is not
                        # closed in this function on this path.
                        crawler.set_storage(storage)
                        crawler.start()
                except Exception, e:
                    # Any failure: call cb, run the error callbacks and log
                    # the exception instead of re-raising it.
                    cb()
                    run_callbacks(error_callbacks)
                    # raise e
                    logger.exception(e)
            finally:
                # The socket is closed after every iteration, including the
                # ``continue`` paths above.
                soc.close()
    finally:
        # When run over, call stop heartbeat
        stop_heartbeat()
Ejemplo n.º 24
0
def bind_app(name):
    """Bind the instance called ``name`` and return the result as JSON.

    Args:
        name: Name used to look the instance up in ``MongoStorage``.

    Returns:
        The JSON-encoded value returned by the manager's ``bind`` call,
        together with status code 201.
    """
    found = MongoStorage().find_instance_by_name(name)
    manager = manager_by_instance(found)
    payload = json.dumps(manager.bind(found))
    return payload, 201
Ejemplo n.º 25
0
 def initAutoComplate(self):
     """Pass ``self.addNewQuery`` to ``MongoStorage.handleHotQuerys``.

     The storage is closed in a ``finally`` clause so it is released even
     when the storage call or ``addNewQuery`` raises.
     """
     storage = MongoStorage()
     try:
         storage.handleHotQuerys(self.addNewQuery)
     finally:
         storage.close()
Ejemplo n.º 26
0
 def initAutoComplate(self):
     """Hand ``self.addNewQuery`` to the storage's hot-query handler.

     ``MongoStorage.handleHotQuerys`` receives ``self.addNewQuery`` as its
     callback. Closing happens in ``finally`` so the storage is released
     even if that call fails.
     """
     storage = MongoStorage()
     try:
         storage.handleHotQuerys(self.addNewQuery)
     finally:
         # Always release the storage, including on errors.
         storage.close()
Ejemplo n.º 27
0
def bind(name):
    """Bind the named instance through its manager.

    Args:
        name: Name of the instance, resolved through ``MongoStorage``.

    Returns:
        ``(json_text, 201)``, where ``json_text`` is ``json.dumps`` applied
        to whatever the manager's ``bind`` call returned.
    """
    db = MongoStorage()
    target = db.find_instance_by_name(name)
    outcome = manager_by_instance(target).bind(target)
    body = json.dumps(outcome)
    return body, 201
Ejemplo n.º 28
0
 def __set_storage():
     """Return the storage backend selected by ``STORAGE_TYPE``.

     ``'mongo'`` yields a new ``MongoStorage``; any other value yields a
     new ``FileStorage``.
     """
     return MongoStorage() if STORAGE_TYPE == 'mongo' else FileStorage()
Ejemplo n.º 29
0
 def __init__(self, cities=default_cities, link=base_link):
     """Remember the cities and link template, and pick the storage.

     Args:
         cities: Stored as ``self.cities``; defaults to ``default_cities``.
         link: Stored as ``self.link``; defaults to ``base_link``.

     ``storage_type == 'mongo'`` selects ``MongoStorage('adv_links')``;
     any other value selects ``FileStorage('adv_links')``.
     """
     self.cities = cities
     self.link = link
     if storage_type == 'mongo':
         backend = MongoStorage('adv_links')
     else:
         backend = FileStorage('adv_links')
     self.storage = backend