コード例 #1
0
class TreeTestCase(unittest.TestCase):
    """Tests for ``Tree`` storage and parent lookup, backed by a Redis client.

    The database is flushed once before the suite and again after every test.
    """

    @classmethod
    def setUpClass(cls):
        # Start the whole suite from an empty database.
        Redis().flushdb()

    def setUp(self):
        self.redis_client = Redis()
        self.tree = Tree(self.redis_client)

    def tearDown(self):
        self.redis_client.flushdb()

    def _check_node(self, node, position, level, data, parent):
        """Assert that ``node`` carries exactly the four given field values."""
        self.assertEqual(node.position, position)
        self.assertEqual(node.level, level)
        self.assertEqual(node.data, data)
        self.assertEqual(node.parent, parent)

    def test_set_get(self):
        """A node written with ``_set`` is read back unchanged by ``get_node``."""
        position = 'some_node'
        level = cns.NODE_TOPIC_LEVEL
        data = {'name': 'topic name', 'description': '\tописание топика\n'}
        parent = 'root'
        self.tree._set(Node(position, data, parent, level))

        stored = self.tree.get_node(position)
        self._check_node(stored, position, level, data, parent)

    def test_get_parents(self):
        """``get_parents`` returns the node itself first, then each ancestor."""
        root = Node(position='root')
        subcategory = Node(
            position='subcategory position',
            level=cns.NODE_SUBCATEGORY_LEVEL,
            data={
                'name': 'subcategory name',
                'description': '\tописание топика\n'
            },
            parent=root.position,
        )
        topic = Node(
            position='topic position',
            level=cns.NODE_TOPIC_LEVEL,
            data={
                'name': 'topic name',
                'author': 'Auth1'
            },
            parent=subcategory.position,
        )
        self.tree.add_nodes((root, subcategory, topic))

        nodes = self.tree.get_parents(topic.position)
        # Index explicitly so that a too-short result still fails the test.
        for index, expected in enumerate((topic, subcategory, root)):
            self._check_node(nodes[index], expected.position, expected.level,
                             expected.data, expected.parent)
コード例 #2
0
class Worker(Process):
    """Process that parses queued pages and stores the extracted posts.

    Each loop iteration pops a data key from a Redis queue, parses the stored
    page with BeautifulSoup, registers discovered topics and cards in the
    ``Tree``, enqueues URLs that were not seen before, and buffers post
    documents that are written to MongoDB in batches.
    """

    # Returned by ``get_data_and_base_url`` when the blocking pop gave nothing.
    YET_NO_DATA = (None, None)

    def __init__(self, options, stop_flag, *args, **kwargs):
        """Remember connection settings; connections are opened in ``run``.

        :param options: object exposing ``use_lxml``, ``mongo_host``,
            ``mongo_port``, ``redis_host`` and ``redis_port``.
        :param stop_flag: object whose ``value`` attribute becomes truthy
            when the worker has to stop.
        :param args: forwarded to ``Process.__init__``.
        :param kwargs: forwarded to ``Process.__init__``.
        """
        super().__init__(*args, **kwargs)

        self.stop_flag = stop_flag
        self.use_lxml = options.use_lxml
        self.mongo_host = options.mongo_host
        self.mongo_port = options.mongo_port
        self.redis_host = options.redis_host
        self.redis_port = options.redis_port
        logger.info('Mongo connection {}:{}. Redis connection {}:{}'.format(
            options.mongo_host, options.mongo_port,
            options.redis_host, options.redis_port))

    def run(self):
        """Main loop: fetch, parse and store pages until ``stop_flag`` is set.

        Errors raised while handling one page are logged and the loop goes on
        with the next page. Documents still buffered when the loop ends are
        flushed before returning.
        """
        logger.info('Started')
        # SIGINT is ignored in this process; SIGTERM keeps its default action.
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)

        # must place here to be in forked process memory
        mc = MongoClient(self.mongo_host, self.mongo_port)
        db = mc[cns.MONGO_DB]
        self.collection = db[cns.MONGO_COLLECTION]

        self.rc = Redis(self.redis_host, self.redis_port)
        self.tree = Tree(self.rc)

        # Accumulated across iterations so that inserts can be batched.
        self.documents = []
        while not self.stop_flag.value:
            try:
                start_time = time.time()

                data, base_url = self.get_data_and_base_url()
                if (data, base_url) == self.YET_NO_DATA:
                    logger.info(
                        'Continue after waiting data getting. Blocking is expired'
                    )
                    continue
                elif not data:
                    logger.error('No data')
                    continue
                soup = BeautifulSoup(
                    data.decode(), 'lxml' if self.use_lxml else 'html.parser')
                self.parse_posts_fill_documents(soup, base_url)
                self.parse_topics(soup, base_url)
                self.parse_cards(soup, base_url)
                self.save_documents(base_url)

                end_time = time.time()
                logger.debug('Job duration: {}'.format(end_time - start_time))
            except Exception:
                # One broken page must not stop the worker; log and move on.
                logger.exception('Work error')

        self.save_documents(finish=True)

        logger.info('Stopped')

    def get_data_and_base_url(self):
        """Pop the next data key and return ``(data, base_url)``.

        Blocks on ``cns.DATA_QUEUE_KEY`` for up to ``cns.BLOCKING_TIMEOUT``.
        The stored value is read and deleted in one pipeline.

        :return: ``YET_NO_DATA`` when nothing was popped, otherwise the stored
            value and the URL derived from its key.
        """
        queuekey_datakey = self.rc.brpop(cns.DATA_QUEUE_KEY,
                                         cns.BLOCKING_TIMEOUT)
        if not queuekey_datakey:
            return self.YET_NO_DATA

        data_key = queuekey_datakey[1].decode()
        with self.rc.pipeline() as pipeline:
            pipeline.get(data_key)
            pipeline.delete(data_key)
            # The second result (reply of ``delete``) is not needed.
            data, _ = pipeline.execute()
        base_url = self._extract_base_url(data_key)
        logger.info('Data is gotten')
        return data, base_url

    def parse_cards(self, soup, base_url):
        """Register every ``ForumCard`` element as a subcategory node.

        The absolute URL of each card is used as node position and is also
        offered to the URL queue.
        """
        child_nodes = []
        new_urls = []
        for card_source in soup.find_all(class_='ForumCard'):
            href = card_source['href']
            card = {}
            card['url'] = urljoin(base_url, href)
            # Heading and description are optional on a card.
            name = card_source.find(class_='ForumCard-heading')
            if name:
                card['name'] = name.get_text()
            description = card_source.find(class_='ForumCard-description')
            if description:
                card['description'] = description.get_text()

            n = Node(position=card['url'],
                     data=card,
                     level=cns.NODE_SUBCATEGORY_LEVEL,
                     parent=base_url)
            child_nodes.append(n)
            new_urls.append(card['url'])

        self.add_child_nodes(child_nodes)
        self.add_new_urls(new_urls)

    def parse_topics(self, soup, base_url):
        """Register every ``ForumTopic`` element as a topic node.

        The link to the next page of the topic list, if present, is enqueued
        first; then each topic URL is offered to the URL queue.
        """
        self._add_paginated_topics_link(soup, base_url)

        child_nodes = []
        new_urls = []
        for topic_source in soup.find_all(class_='ForumTopic'):
            topic = {}
            topic['url'] = urljoin(base_url, topic_source['href'])
            topic['name'] = topic_source.find(
                class_='ForumTopic-heading').get_text()
            topic['author'] = topic_source.find(
                class_='ForumTopic-author').get_text()
            topic['replies'] = topic_source.find(
                class_='ForumTopic-replies').get_text()

            n = Node(position=topic['url'],
                     data=topic,
                     level=cns.NODE_TOPIC_LEVEL,
                     parent=base_url)
            child_nodes.append(n)
            new_urls.append(topic['url'])

        self.add_child_nodes(child_nodes)
        self.add_new_urls(new_urls)

    def parse_posts_fill_documents(self, soup, base_url):
        """Turn every ``TopicPost`` element into a buffered document.

        Each document holds the post fields under ``'post'`` plus the data of
        the topic and subcategory nodes found among the parents of
        ``base_url``. Links to the other pages of the topic are enqueued.
        """
        self._add_paginated_posts_links(soup, base_url)

        for post_source in soup.find_all(class_='TopicPost'):
            document = defaultdict(dict)
            document['post']['id'] = post_source['id']
            document['post']['user'] = post_source.find(
                class_='Author-name').get_text()
            document['post']['created'] = post_source.find(
                class_='TopicPost-timestamp')['data-tooltip-content']
            document['post']['rank'] = post_source.find(
                class_='TopicPost-rank').get_text()
            document['post']['text'] = post_source.find(
                class_='TopicPost-bodyContent').get_text()
            # NOTE(review): the parents are looked up again for every post of
            # the same page; could probably be fetched once - confirm.
            for node in self.tree.get_parents(base_url):
                if node.level == cns.NODE_TOPIC_LEVEL:
                    document['topic'] = node.data
                elif node.level == cns.NODE_SUBCATEGORY_LEVEL:
                    document['subcategory'] = node.data
            self.documents.append(document)

    def add_child_nodes(self, nodes):
        """Store ``nodes`` in the tree."""
        self.tree.add_nodes(nodes)

    def add_new_urls(self, urls):
        """Push URLs onto the URL queue in one batch, skipping known ones.

        URLs are recorded in the ``cns.PARSED_URLS_KEY`` set; only URLs that
        were not yet members of that set are pushed onto
        ``cns.URL_QUEUE_KEY``.

        :param urls: a list or tuple of URLs, or a single URL.
        """
        # ``isinstance`` takes the candidate types as ONE tuple argument.
        if not isinstance(urls, (tuple, list)):
            urls = [urls]
        if not urls:
            return

        # The context manager resets the pipeline even when a command fails.
        with self.rc.pipeline() as pipeline:
            for url in urls:
                pipeline.sismember(cns.PARSED_URLS_KEY, url)
            members = pipeline.execute()

            urls = [
                url for url, ismember in zip(urls, members) if not ismember
            ]
            if urls:
                pipeline.sadd(cns.PARSED_URLS_KEY, *urls)
                pipeline.lpush(cns.URL_QUEUE_KEY, *urls)
                pipeline.execute()
                logger.debug('Put new urls: {}'.format(urls))

    def save_documents(self,
                       base_url=None,
                       finish=False,
                       batch_size=cns.INSERT_BATCH_SIZE):
        """Write the buffered documents to MongoDB when a batch is ready.

        :param base_url: URL of the page just handled; used only in the
            warning emitted when the buffer is empty.
        :param finish: when true, write whatever is buffered regardless of
            ``batch_size``.
        :param batch_size: minimal number of buffered documents that triggers
            a write.
        """
        if not self.documents:
            logger.warning('No documents for url {}'.format(base_url))
            return

        if len(self.documents) >= batch_size:
            message = 'Data is written'
        elif finish:
            message = 'Finished step. Data is written'
        else:
            # Batch is not full yet; keep collecting.
            return

        self.collection.insert_many(self.documents)
        # Clear in both cases so that nothing can be inserted twice.
        self.documents.clear()
        logger.info(message)

    def _extract_base_url(self, key):
        """Return ``key`` with every ``cns.DATA_KEY_PREFIX`` occurrence removed."""
        return key.replace(cns.DATA_KEY_PREFIX, '')

    def _add_paginated_posts_links(self, soup, base_url):
        """Enqueue the ``?page=N`` URL of every page of the current topic.

        The page count is read from the last ordinal pagination button in the
        topic header; nothing is enqueued when there is no such button.
        """
        new_urls = []
        pagination = soup.select(
            '.Topic-pagination--header .Pagination-button--ordinal')
        if pagination:
            last_page = int(pagination[-1]['data-page-number'])
            new_urls.extend(
                urljoin(base_url, '?page=%d' % n)
                for n in range(1, last_page + 1))
        self.add_new_urls(new_urls)

    def _add_paginated_topics_link(self, soup, base_url):
        """Enqueue the "next page" link of a topic list, when there is one."""
        next_buttons = soup.select('.Pagination-button--next')
        if not next_buttons:
            # No next button on this page; it must not abort the page parsing.
            return
        href = next_buttons[0].get('href')
        if not href:
            return
        self.add_new_urls(urljoin(base_url, href))
コード例 #3
0
class B_and_B():
    """
    Solve an RSSP problem with a branch and bound algorithm.
    The problem has to be received as linear equations; the equations are
    handed to ``Equations`` objects, which (per the original author's notes)
    solve them with cplex LP.
    """
    def __init__(self,
                 obj,
                 ub,
                 lb,
                 ctype,
                 colnames,
                 rhs,
                 rownames,
                 sense,
                 rows,
                 cols,
                 vals,
                 x_names,
                 LB=0,
                 UB=float("inf"),
                 use_SP=True):
        """
        register the data shared by all equations and fill the queue with
        the first node(s).
        obj, ub, lb, ctype, colnames, rownames, sense: forwarded unchanged to
            Equations.init_global_data
        rhs, rows, cols, vals: forwarded unchanged to every created Equations
        x_names: list of string, names of the Xi,m,r,l variables
        LB: lower bound, the search stops once LB is not below UB
        UB: upper bound, updated whenever a better integer solution is found
        use_SP: bool, if True start from one sub problem (SP) per combination
            of modes, otherwise from the single full problem
        """
        # will be used after adding parallel run option
        self.UB_lock = Lock()
        self.best_equation = None
        # these parameters will be the same for all the equations
        Equations.init_global_data(obj, ub, lb, ctype, colnames, rownames,
                                   sense, len(x_names))
        # tree = heap, used to save all the nodes that are still not opened
        self.tree = Tree()
        self.UB = UB
        self.LB = LB
        self.use_SP = use_SP
        self.SP_len = 0
        # the B&B algorithm can be solved using SPs
        if use_SP:
            self.__create_SPs(1, rhs, rows, cols, vals, x_names)
            self.SP_len = len(self.tree.queue)
            print("|SPs| =", len(self.tree.queue))

        else:
            equation = Equations(cols, rows, vals, rhs, x_names, {}, {})
            self.__init_equation(equation, file_name="problem.lp")

    def __create_SPs(self, op, rhs, rows, cols, vals, x_names, needed_x=None):
        """
        recursive function, for every SP take all operations and choose one
        mode for each of them. all SPs must be different.
        op: int, the operation whose mode is selected at this level
        rhs, rows, cols, vals: copied into every created Equations
        x_names: list of string, names of all the Xi,m,r,l variables
        needed_x: list of string or None, variables of the modes selected so
            far; all other variables are fixed to zero in the created SP
        return: None
        """
        if needed_x is None:
            needed_x = []

        # the trailing comma closes the field, so mode 1 does not also
        # match mode 10, 11, ...
        op_prefix = "X" + str(op) + ","
        mode = 1
        sub = [s for s in x_names if op_prefix + str(mode) + "," in s]
        # select mode for each operation
        while sub:
            # a new list is built, the list of the caller is left untouched
            self.__create_SPs(op + 1, rhs, rows, cols, vals, x_names,
                              needed_x + sub)
            mode += 1
            sub = [s for s in x_names if op_prefix + str(mode) + "," in s]

        # if a mode was selected for all operations, create the equations
        if not any(op_prefix in s for s in x_names):
            needed = set(needed_x)
            equation = Equations(
                cols[:], rows[:], vals[:], rhs[:], x_names[:], {},
                {elem: 0
                 for elem in x_names if elem not in needed})
            self.__init_equation(equation)

    def __update_UB(self, equation):
        """
        check the equation solution and if it's better than or equal to the
        UB, update the UB and save the equation as the best one.
        the UB is checked under the lock to avoid conflicts; the lock is
        released even when reading the solution fails.
        equation: Equation, an equation with integer solution
        return: None
        """
        with self.UB_lock:
            solution = equation.solution
            # NOTE(review): a solution equal to 0 is falsy and is skipped
            # here - confirm that this is intended.
            if solution and solution <= self.UB:
                print("found UB that is eqauls to %10f" % solution)
                self.UB = solution
                self.best_equation = equation

    def __try_bound(self):
        """
        take nodes from the queue and drop every node whose solution is worse
        than the UB, or equal to the UB once an integer solution was already
        found. stop when a node survives, when the queue is empty or when LB
        is no longer below UB.
        return: Node that may still improve the UB, or None
        """
        next_node = self.tree.get_queue_head()
        while next_node and self.LB < self.UB:  # while the queue not empty
            # if the node is worse than the UB, take another node
            if next_node.get_solution() > self.UB:
                next_node = self.tree.get_queue_head(
                )  # take another node from the queue

            # if the node equals to the UB and the UB isn't the predicted UB
            # (we already have node that gave integer solution that equals to the UB)
            elif next_node.get_solution() == self.UB and self.best_equation:
                next_node = self.tree.get_queue_head(
                )  # take another node from the queue

            else:
                return next_node

        # return None if the queue is empty and all the tree was bound
        return None

    def __init_equation(self, equation, depth=0, file_name=None):
        """
        solve the equation; an integer solution is offered as new UB, any
        other solution that is not worse than the UB is added to the queue.
        equation: Equation, cplex equation
        depth: int, next node depth
        file_name: string, where to save the cplex solution, or None
        return: None
        """
        # solve LP using cplex
        solution = equation.solve_milp(file_name)
        # if the solution is integer, check the UB
        if solution and equation.integer_solution:
            self.__update_UB(equation)

        # if the solution better or equals to the UB, add it to the queue
        elif solution and solution <= self.UB:
            self.tree.add_nodes(equation, depth)

    def create_node(self, node, col_dict):
        """
        create new equation from copies of the father data, solve it and add
        it to the B&B queue
        node: Node, father node
        col_dict: dict, a dict of parameters name and the selected value for them
        return: None
        """
        eq = node.equation
        equation = Equations(eq.cols[:], eq.rows[:], eq.vals[:], eq.rhs[:],
                             eq.cols_to_remove[:], eq.choices.copy(), col_dict)
        self.__init_equation(equation, node.depth + 1)

    def set_x_to_one(self, node, x_one):
        """
        take all the not set Xi,m,r,l from the node and set the chosen Xi,m,r,l to
        value of one and set all blocked (by the equations) Xi,m,r,l to zero.
        all the choices will be saved in a dictionary that will contain Xi,m,r,l : chosen value
        node: Node, from that node we will create new node
        x_one: string, the chosen Xi,m,r,l
        return None
        """
        choices = {}
        choices[x_one] = 1
        # skip the leading letter, the rest is "i,m,r,l"
        i, m, r, l = x_one[1:].split(",")
        # if Xi,m,r,l = 1
        for x in node.equation.cols_to_remove:
            other_i, other_m, other_r, other_l = x[1:].split(",")
            # Xj,n,t,k = 0 | j = i, n = m, t = r and k != l
            if i == other_i and m == other_m and r == other_r and l != other_l:
                choices[x] = 0

            # Xj,n,t,k = 0 | j != i, n = or != m, t = r and k = l
            elif i != other_i and r == other_r and l == other_l:
                choices[x] = 0

            # Xj,n,t,k = 0 | j = i, n != m, t = or != r and k = or != l
            elif i == other_i and m != other_m:
                choices[x] = 0

        self.create_node(node, choices)

    def zero_one_initialize(self, node):
        """
        branch on the first not set Xi,m,r,l of the node: one son with it
        equal to zero and one son with it equal to one.
        node: Node, the node to branch from
        return: None
        """
        first_x = node.equation.cols_to_remove[0]
        # son with Xi,m,r,l = 0
        self.create_node(node, {first_x: 0})
        # son with Xi,m,r,l = 1
        self.set_x_to_one(node, first_x)

    def choice_resource(self, node):
        """
        the B&B tree can be set by labels, for Xi,m,r,l set all l's options, from 1 to |R_l|
        node: Node, the node with all equations
        return: None
        """
        # step over X and the l not need in the split
        i, m, r, _ = node.equation.cols_to_remove[0][1:].split(",")
        zero_choices = {}
        for x in node.equation.cols_to_remove:
            # step over X and the other_l not need in the split
            other_i, other_m, other_r, _ = x[1:].split(",")
            if i == other_i and m == other_m and r == other_r:
                self.set_x_to_one(node, x)
                zero_choices[x] = 0

        # if SP not selected there is option that this mode m is not selected
        # so we will try set l to be zero
        # in SP, we already selected the mode and its can't be zero
        if not self.use_SP:
            self.create_node(node, zero_choices)

    def solve_algorithem(self,
                         init_resource_labels=False,
                         disable_prints=True,
                         cplex_auto_solution=False):
        """
        run the branch and bound algorithm to find the best solution for the equation.
        nodes are taken from the queue and branched until no node is left.
        when the algorithm ends, the best equation is solved one more time
        to get all the chosen values for the Xi,m,r,l.
        init_resource_labels: bool, branch with choice_resource when True,
            otherwise with zero_one_initialize
        disable_prints: forwarded to cplex_solution of the best equation
        cplex_auto_solution: bool, when False the number of nodes is taken
            from the tree instead of from cplex_solution
        return: tuple of (choices, number of nodes, max queue size, number of
            SPs, best solution, Equations.MIP_infeasible), or
            (None, 0, 0, 0, 0, True) when no integer solution is available
        """
        if init_resource_labels:
            initialize_x_function = self.choice_resource

        else:
            initialize_x_function = self.zero_one_initialize

        next_node = self.tree.get_queue_head()
        # run while the node not None which mean that the algorithm not end
        while next_node:
            # TODO check if this condition is necessary
            if next_node.equation.cols_to_remove:
                initialize_x_function(next_node)
            # check if we can do bound on the tree and take next node from the queue
            next_node = self.__try_bound()

        no_solution = (None, 0, 0, 0, 0, True)
        # no integer solution was ever found
        if self.best_equation is None:
            print("cann't find integer solution")
            return no_solution

        try:
            choices, nodes = self.best_equation.cplex_solution(disable_prints)
            if not cplex_auto_solution:
                nodes = self.tree.num_of_nodes

            return choices, nodes, self.tree.max_queue_size, self.SP_len, self.best_equation.solution, Equations.MIP_infeasible

        except Exception:
            # best effort: report "no solution" instead of failing, but let
            # KeyboardInterrupt / SystemExit pass through
            print("cann't find integer solution")
            return no_solution