Exemple #1
0
    def prepare_first_pages(host):
        """ Prepare link on first page in MongoDB. Add root url if urls for this host not exists.  """
        pid = Registry().get('pData')['id']

        coll = Registry().get('mongo').spider_urls
        coll.drop()

        Urls = UrlsModel()
        urls = Urls.list_by_host_name_for_spider(pid, host)
        if not len(urls):
            Registry().get('logger').log("Spider: Root URL was added automaticaly")
            Urls.add(
                pid, HostsModel().get_id_by_name(pid, host), '/', who_add='spider'
            )
            urls = Urls.list_by_host_name_for_spider(pid, host)

        for url in urls:
            url = urlparse(url['url'])
            data = {
                'hash': md5(str(url.path + url.query)),
                'path': url.path,
                'query': url.query,
                'time': 0,
                'code':0,
                'checked': 0,
                'getted' : 0,
                'referer': '',
                'size': 0,
                'founder': 'spider'
            }

            coll.insert(data)

        coll.create_index([('hash', 1)], unique=True, dropDups=True)
        coll.create_index([('checked', 1)])
Exemple #2
0
    def prepare_first_pages(host):
        """ Prepare link on first page in MongoDB. Add root url if urls for this host not exists.  """
        pid = Registry().get('pData')['id']

        coll = Registry().get('mongo').spider_urls
        coll.drop()

        Urls = UrlsModel()
        urls = Urls.list_by_host_name_for_spider(pid, host)
        if not len(urls):
            Registry().get('logger').log(
                "Spider: Root URL was added automaticaly")
            Urls.add(pid,
                     HostsModel().get_id_by_name(pid, host),
                     '/',
                     who_add='spider')
            urls = Urls.list_by_host_name_for_spider(pid, host)

        for url in urls:
            url = urlparse(url['url'])
            data = {
                'hash': md5(str(url.path + url.query)),
                'path': url.path,
                'query': url.query,
                'time': 0,
                'code': 0,
                'checked': 0,
                'getted': 0,
                'referer': '',
                'size': 0,
                'founder': 'spider'
            }

            coll.insert(data)

        coll.create_index([('hash', 1)], unique=True, dropDups=True)
        coll.create_index([('checked', 1)])
Exemple #3
0
class MongoJob(WSJob):
    """ Common class for jobs works with MongoDB """
    unique = True
    collection = None
    select_limit = 50
    skip_blank_rows = True
    counter = 0
    collection_name = None

    def __init__(self, maxsize=0):
        WSJob.__init__(self, maxsize)
        self.collection = Registry().get('mongo')[self.collection_name]

    def build_row(self, _str):
        """ Common build row method for MongoDB """
        return {
            "name": _str.strip(),
            "checked": 0,
            "getted": 0
        }

    def qsize(self):
        """ Size of queue """
        return self.collection.find({"checked": 0}).count()

    def set_unique(self, unique=True):
        """ Enable remove dups in queue """
        self.unique = unique

    def set_skip_blank_rows(self, value=True):
        """ If True - we will skip blank rows then fill queue from dict or file """
        self.skip_blank_rows = value

    def task_done(self, name):
        """ Mark current row as done """
        self.counter += 1
        self.collection.update({'name': str(unicode(name)), "getted": 1}, {"$set": {"checked": 1}})
        WSJob.task_done(self)

    def get(self, block=False, timeout=None):
        """ Get next item from queue """
        if self.empty() or self.qsize() < 50:
            self.load_data()

        if self.empty():
            raise Queue.Empty

        return WSJob.get(self, block, timeout)

    def load_data(self):
        """ Load data into queue from MongoDB """
        data = self.collection.find(
            {"checked": 0, "getted": 0},
            limit=int(Registry().get('config')['main']['mongo_data_load_per_once'])
        )

        for row in data:
            self.put(row['name'])
            self.collection.update({"name": row['name']}, {"$set": {"getted": 1}})

        return True

    def load_dict(self, dict_for_load, drop=True):
        """ Fill collection from dict """
        if drop:
            self.collection.drop()

        counter = 0
        last = "START OF FILE"

        for line in dict_for_load:
            try:
                line = line.strip()
                unicode(line)
                self.collection.insert(self.build_row(line))
            except UnicodeDecodeError:
                _str = " UNICODE ERROR: In file '{0}' skip word '{1}', after word '{2}' !".format(file, line, last)
                if Registry().isset('logger'):
                    Registry().get('logger').log(_str)
                else:
                    print _str

                continue

            counter += 1
            last = line

        self.load_data()

        return counter

    def load_dom(self, dom):
        """ Fill queue from DictOfMask """
        self.collection.drop()
        while True:
            word = dom.get()
            if word is None:
                break
            self.collection.insert(self.build_row(word))
        self.collection.create_index('name', drop_dups=True, unique=self.unique)

        self.load_data()
        return self.collection.count()

    def load_file(self, _file):
        """ Fill queue from text file """
        self.collection.drop()

        fh = open(_file)

        last = "START OF FILE"
        while True:
            line = fh.readline()
            if not line:
                break
            if not line.strip() and self.skip_blank_rows:
                continue

            try:
                line = line.strip()
                unicode(line)
                self.collection.insert(self.build_row(line))
            except UnicodeDecodeError:
                _str = " UNICODE ERROR: In file '{0}' skip word '{1}', after word '{2}' !".format(_file, line, last)
                if Registry().isset('logger'):
                    Registry().get('logger').log(_str)
                else:
                    print _str
                continue

            last = line

        fh.close()

        self.collection.create_index('name', drop_dups=True, unique=self.unique)

        self.load_data()

        return self.collection.count()

    # 2 метода ниже взяты с
    # http://stackoverflow.com/questions/1581895/how-check-if-a-task-is-already-in-python-queue
    # Рецепт для уникальных задачь в очереди
    def _init(self, maxsize):
        WSJob._init(self, maxsize)
        if self.unique:
            self.all_items = set()

    def _put(self, item):
        if self.unique:
            if item not in self.all_items:
                WSJob._put(self, item)
                self.all_items.add(item)
            else:
                _str = "WARNING: try to add not unique item `{0}`".format(item)

                if Registry().isset('logger'):
                    #Registry().get('logger').log(_str)
                    pass
                else:
                    #print _str
                    pass
        else:
            WSJob._put(self, item)
Exemple #4
0
class MongoJob(WSJob):
    """ Common class for jobs works with MongoDB """
    unique = True
    collection = None
    select_limit = 50
    skip_blank_rows = True
    counter = 0
    collection_name = None

    def __init__(self, maxsize=0):
        WSJob.__init__(self, maxsize)
        self.collection = Registry().get('mongo')[self.collection_name]

    def build_row(self, _str):
        """ Common build row method for MongoDB """
        return {
            "name": _str.strip(),
            "checked": 0,
            "getted": 0
        }

    def qsize(self):
        """ Size of queue """
        return self.collection.find({"checked": 0}).count()

    def set_unique(self, unique=True):
        """ Enable remove dups in queue """
        self.unique = unique

    def set_skip_blank_rows(self, value=True):
        """ If True - we will skip blank rows then fill queue from dict or file """
        self.skip_blank_rows = value

    def task_done(self, name):
        """ Mark current row as done """
        self.counter += 1
        self.collection.update({'name': str(unicode(name)), "getted": 1}, {"$set": {"checked": 1}})
        WSJob.task_done(self)

    def get(self, block=False, timeout=None):
        """ Get next item from queue """
        if self.empty() or self.qsize() < 50:
            self.load_data()

        if self.empty():
            raise Queue.Empty

        return WSJob.get(self, block, timeout)

    def load_data(self):
        """ Load data into queue from MongoDB """
        data = self.collection.find(
            {"checked": 0, "getted": 0},
            limit=int(Registry().get('config')['main']['mongo_data_load_per_once'])
        )

        for row in data:
            self.put(row['name'])
            self.collection.update({"name": row['name']}, {"$set": {"getted": 1}})

        return True

    def load_dict(self, dict_for_load, drop=True):
        """ Fill collection from dict """
        if drop:
            self.collection.drop()

        counter = 0
        last = "START OF FILE"

        for line in dict_for_load:
            try:
                line = line.strip()
                unicode(line)
                self.collection.insert(self.build_row(line))
            except UnicodeDecodeError:
                _str = " UNICODE ERROR: In file '{0}' skip word '{1}', after word '{2}' !".format(file, line, last)
                if Registry().isset('logger'):
                    Registry().get('logger').log(_str)
                else:
                    print _str

                continue

            counter += 1
            last = line

        self.load_data()

        return counter

    def load_dom(self, dom):
        """ Fill queue from DictOfMask """
        self.collection.drop()
        while True:
            word = dom.get()
            if word is None:
                break
            self.collection.insert(self.build_row(word))
        self.collection.create_index('name', drop_dups=True, unique=self.unique)

        self.load_data()
        return self.collection.count()

    def load_file(self, _file):
        """ Fill queue from text file """
        self.collection.drop()

        fh = open(_file)

        last = "START OF FILE"
        while True:
            line = fh.readline()
            if not line:
                break
            if not line.strip() and self.skip_blank_rows:
                continue

            try:
                line = line.strip()
                unicode(line)
                self.collection.insert(self.build_row(line))
            except UnicodeDecodeError:
                _str = " UNICODE ERROR: In file '{0}' skip word '{1}', after word '{2}' !".format(_file, line, last)
                if Registry().isset('logger'):
                    Registry().get('logger').log(_str)
                else:
                    print _str
                continue

            last = line

        fh.close()

        self.collection.create_index('name', drop_dups=True, unique=self.unique)

        self.load_data()

        return self.collection.count()

    # 2 метода ниже взяты с
    # http://stackoverflow.com/questions/1581895/how-check-if-a-task-is-already-in-python-queue
    # Рецепт для уникальных задачь в очереди
    def _init(self, maxsize):
        WSJob._init(self, maxsize)
        if self.unique:
            self.all_items = set()

    def _put(self, item):
        if self.unique:
            if item not in self.all_items:
                WSJob._put(self, item)
                self.all_items.add(item)
            else:
                _str = "WARNING: try to add not unique item `{0}`".format(item)

                if Registry().isset('logger'):
                    #Registry().get('logger').log(_str)
                    pass
                else:
                    #print _str
                    pass
        else:
            WSJob._put(self, item)