class TreeTestCase(unittest.TestCase):
    """Store/load tests for ``Tree`` backed by a default-configured ``Redis`` client.

    The database is flushed once before the class runs and again after
    every test, so each test starts from an empty keyspace.
    """

    @classmethod
    def setUpClass(cls):
        # Wipe whatever earlier runs left behind.
        Redis().flushdb()

    def setUp(self):
        self.redis_client = Redis()
        self.tree = Tree(self.redis_client)

    def tearDown(self):
        self.redis_client.flushdb()

    def _assert_node_fields(self, node, position, level, data, parent):
        """Compare the four persisted attributes of ``node`` with expectations."""
        self.assertEqual(node.position, position)
        self.assertEqual(node.level, level)
        self.assertEqual(node.data, data)
        self.assertEqual(node.parent, parent)

    def test_set_get(self):
        """A node written with ``_set`` is read back unchanged by ``get_node``."""
        expected_position = 'some_node'
        expected_level = cns.NODE_TOPIC_LEVEL
        expected_data = {
            'name': 'topic name',
            'description': '\tописание топика\n',
        }
        expected_parent = 'root'

        self.tree._set(Node(expected_position, expected_data,
                            expected_parent, expected_level))
        stored = self.tree.get_node(expected_position)

        self._assert_node_fields(stored, expected_position, expected_level,
                                 expected_data, expected_parent)

    def test_get_parents(self):
        """``get_parents`` yields the node itself, then each ancestor up to root."""
        root = Node(position='root')
        subcategory = Node(
            position='subcategory position',
            level=cns.NODE_SUBCATEGORY_LEVEL,
            data={
                'name': 'subcategory name',
                'description': '\tописание топика\n',
            },
            parent=root.position,
        )
        topic = Node(
            position='topic position',
            level=cns.NODE_TOPIC_LEVEL,
            data={'name': 'topic name', 'author': 'Auth1'},
            parent=subcategory.position,
        )
        self.tree.add_nodes((root, subcategory, topic))

        nodes = self.tree.get_parents(topic.position)

        # Expected order: the requested node first, the root last.
        for index, expected in enumerate((topic, subcategory, root)):
            self._assert_node_fields(nodes[index], expected.position,
                                     expected.level, expected.data,
                                     expected.parent)
class Worker(Process):
    """Process that parses fetched pages and stores the extracted posts.

    Each iteration of :meth:`run` pops one data key from the Redis list
    ``cns.DATA_QUEUE_KEY``, parses the HTML stored under that key, registers
    discovered cards/topics as tree nodes, pushes newly seen URLs to
    ``cns.URL_QUEUE_KEY`` and buffers post documents that are written to
    MongoDB in batches.
    """

    # Returned by get_data_and_base_url() when the blocking pop timed out.
    YET_NO_DATA = (None, None)

    def __init__(self, options, stop_flag, *args, **kwargs):
        """Remember connection settings; connections are opened in run().

        options: object exposing ``use_lxml``, ``mongo_host``, ``mongo_port``,
            ``redis_host`` and ``redis_port`` attributes.
        stop_flag: object whose ``value`` attribute becomes truthy when the
            worker has to stop.
        """
        super().__init__(*args, **kwargs)
        self.stop_flag = stop_flag
        self.use_lxml = options.use_lxml
        self.mongo_host = options.mongo_host
        self.mongo_port = options.mongo_port
        self.redis_host = options.redis_host
        self.redis_port = options.redis_port
        logger.info(
            'Mongo connection {}:{}. Redis connection {}:{}'.format(
                options.mongo_host, options.mongo_port,
                options.redis_host, options.redis_port))

    def run(self):
        """Main loop: process queued pages until ``stop_flag.value`` is set."""
        logger.info('Started')
        signal.signal(signal.SIGINT, signal.SIG_IGN)
        signal.signal(signal.SIGTERM, signal.SIG_DFL)

        # must place here to be in forked process memory
        mc = MongoClient(self.mongo_host, self.mongo_port)
        db = mc[cns.MONGO_DB]
        self.collection = db[cns.MONGO_COLLECTION]
        self.rc = Redis(self.redis_host, self.redis_port)
        self.tree = Tree(self.rc)
        # Collected during the whole life cycle so inserts can be batched.
        self.documents = []

        while not self.stop_flag.value:
            try:
                start_time = time.time()
                data, base_url = self.get_data_and_base_url()
                if (data, base_url) == self.YET_NO_DATA:
                    logger.info(
                        'Continue after waiting data getting. '
                        'Blocking is expired')
                    continue
                elif not data:
                    logger.error('No data')
                    continue

                soup = BeautifulSoup(
                    data.decode(),
                    'lxml' if self.use_lxml else 'html.parser')
                self.parse_posts_fill_documents(soup, base_url)
                self.parse_topics(soup, base_url)
                self.parse_cards(soup, base_url)
                self.save_documents(base_url)
                end_time = time.time()
                logger.debug('Job duration: {}'.format(end_time - start_time))
            except Exception:
                # One broken page must not kill the worker; a bare ``except``
                # would additionally swallow SystemExit/KeyboardInterrupt.
                logger.exception('Work error')

        self.save_documents(finish=True)
        logger.info('Stopped')

    def get_data_and_base_url(self):
        """Pop the next data key and return ``(data, base_url)``.

        Returns ``YET_NO_DATA`` when the blocking pop times out. ``data`` is
        whatever Redis holds under the popped key (the key is deleted in the
        same pipeline), so it may be ``None`` if the key no longer exists.
        """
        queuekey_datakey = self.rc.brpop(cns.DATA_QUEUE_KEY,
                                         cns.BLOCKING_TIMEOUT)
        if not queuekey_datakey:
            return self.YET_NO_DATA

        data_key = queuekey_datakey[1].decode()
        with self.rc.pipeline() as pipeline:
            pipeline.get(data_key)
            pipeline.delete(data_key)
            data, del_count = pipeline.execute()

        base_url = self._extract_base_url(data_key)
        logger.info('Data is gotten')
        return data, base_url

    def parse_cards(self, soup, base_url):
        """Register every ``ForumCard`` element as a subcategory node and URL."""
        child_nodes = []
        new_urls = []
        for card_source in soup.find_all(class_='ForumCard'):
            href = card_source['href']
            card = {}
            card['url'] = urljoin(base_url, href)

            name = card_source.find(class_='ForumCard-heading')
            if name:
                card['name'] = name.get_text()
            description = card_source.find(class_='ForumCard-description')
            if description:
                card['description'] = description.get_text()

            n = Node(position=card['url'], data=card,
                     level=cns.NODE_SUBCATEGORY_LEVEL, parent=base_url)
            child_nodes.append(n)
            new_urls.append(card['url'])

        self.add_child_nodes(child_nodes)
        self.add_new_urls(new_urls)

    def parse_topics(self, soup, base_url):
        """Register every ``ForumTopic`` element as a topic node and URL."""
        self._add_paginated_topics_link(soup, base_url)

        child_nodes = []
        new_urls = []
        for topic_source in soup.find_all(class_='ForumTopic'):
            topic = {}
            topic['url'] = urljoin(base_url, topic_source['href'])
            topic['name'] = topic_source.find(
                class_='ForumTopic-heading').get_text()
            topic['author'] = topic_source.find(
                class_='ForumTopic-author').get_text()
            topic['replies'] = topic_source.find(
                class_='ForumTopic-replies').get_text()

            n = Node(position=topic['url'], data=topic,
                     level=cns.NODE_TOPIC_LEVEL, parent=base_url)
            child_nodes.append(n)
            new_urls.append(topic['url'])

        self.add_child_nodes(child_nodes)
        self.add_new_urls(new_urls)

    def parse_posts_fill_documents(self, soup, base_url):
        """Append one document per ``TopicPost`` element to ``self.documents``.

        Every document has a ``post`` section and, when the tree knows the
        ancestors of ``base_url``, ``topic`` and ``subcategory`` sections.
        """
        self._add_paginated_posts_links(soup, base_url)

        post_sources = soup.find_all(class_='TopicPost')
        if not post_sources:
            return

        # The ancestry depends only on base_url, so it is resolved once per
        # page instead of once per post. The resulting data dicts are shared
        # by all documents built from this page.
        context = {}
        for node in self.tree.get_parents(base_url):
            if node.level == cns.NODE_TOPIC_LEVEL:
                context['topic'] = node.data
            elif node.level == cns.NODE_SUBCATEGORY_LEVEL:
                context['subcategory'] = node.data

        for post_source in post_sources:
            document = defaultdict(dict)
            post = document['post']
            post['id'] = post_source['id']
            post['user'] = post_source.find(class_='Author-name').get_text()
            post['created'] = post_source.find(
                class_='TopicPost-timestamp')['data-tooltip-content']
            post['rank'] = post_source.find(
                class_='TopicPost-rank').get_text()
            post['text'] = post_source.find(
                class_='TopicPost-bodyContent').get_text()
            document.update(context)
            self.documents.append(document)

    def add_child_nodes(self, nodes):
        """Store ``nodes`` in the tree."""
        self.tree.add_nodes(nodes)

    def add_new_urls(self, urls):
        """Push links to the queue in one batch.

        Links are recorded in the set of already requested URLs
        (``cns.PARSED_URLS_KEY``); only links that were not yet in that set
        are pushed to ``cns.URL_QUEUE_KEY``.

        urls: a single URL or a list/tuple of URLs.
        """
        # isinstance() takes the candidate types as ONE tuple argument.
        if not isinstance(urls, (tuple, list)):
            urls = [urls]

        # Drop duplicates inside the batch (keeping order) so the same link
        # is not queued twice by a single call.
        seen = set()
        unique_urls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique_urls.append(url)
        if not unique_urls:
            return

        # The context manager resets the pipeline even if a command raises.
        with self.rc.pipeline() as pipeline:
            for url in unique_urls:
                pipeline.sismember(cns.PARSED_URLS_KEY, url)
            members = pipeline.execute()

            urls = [url for url, ismember in zip(unique_urls, members)
                    if not ismember]
            if urls:
                pipeline.sadd(cns.PARSED_URLS_KEY, *urls)
                pipeline.lpush(cns.URL_QUEUE_KEY, *urls)
                pipeline.execute()
                logger.debug('Put new urls: {}'.format(urls))

    def save_documents(self, base_url=None, finish=False,
                       batch_size=cns.INSERT_BATCH_SIZE):
        """Flush buffered documents to MongoDB.

        The buffer is written when it holds at least ``batch_size`` documents
        or when ``finish`` is true; it is emptied after every write. An empty
        buffer only produces a warning mentioning ``base_url``.
        """
        if not self.documents:
            logger.warning('No documents for url {}'.format(base_url))
        elif len(self.documents) >= batch_size:
            self.collection.insert_many(self.documents)
            self.documents.clear()
            logger.info('Data is written')
        elif finish:
            self.collection.insert_many(self.documents)
            # Clear here as well so a later call cannot insert them again.
            self.documents.clear()
            logger.info('Finished step. Data is written')

    def _extract_base_url(self, key):
        """Return ``key`` with every ``cns.DATA_KEY_PREFIX`` occurrence removed."""
        return key.replace(cns.DATA_KEY_PREFIX, '')

    def _add_paginated_posts_links(self, soup, base_url):
        """Queue ``?page=N`` links for every page of the current topic."""
        new_urls = []
        pagination = soup.select(
            '.Topic-pagination--header .Pagination-button--ordinal')
        if pagination:
            last_page = int(pagination[-1]['data-page-number'])
            new_urls.extend(
                urljoin(base_url, '?page=%d' % n)
                for n in range(1, last_page + 1))
        self.add_new_urls(new_urls)

    def _add_paginated_topics_link(self, soup, base_url):
        """Queue the "next page" link of a topic listing, if the page has one."""
        next_buttons = soup.select('.Pagination-button--next')
        if not next_buttons:
            # Pages without a "next" button are normal (last or only page).
            return
        href = next_buttons[0].get('href')
        if not href:
            return
        self.add_new_urls(urljoin(base_url, href))
class B_and_B():
    """
    Solve an RSSP problem with a branch and bound algorithm.

    The problem is received as linear equations; every node of the B&B tree
    wraps an ``Equations`` object that is solved through ``solve_milp``.
    Variable names are expected to look like ``X<i>,<m>,<r>,<l>``
    (operation, mode, resource, label).
    """

    def __init__(self, obj, ub, lb, ctype, colnames, rhs, rownames, sense,
                 rows, cols, vals, x_names, LB=0, UB=float("inf"),
                 use_SP=True):
        """
        Create the initial node(s) of the B&B tree.

        LB, UB: initial lower/upper bound of the objective value
        use_SP: when True, one sub problem (SP) is created for every
                combination of operation modes, otherwise a single root
                equation is created and written to "problem.lp"
        All other parameters are forwarded to ``Equations``.
        """
        # will be used after adding parallel run option
        self.UB_lock = Lock()
        self.best_equation = None
        # these parameters will be the same for all the equations
        Equations.init_global_data(obj, ub, lb, ctype, colnames, rownames,
                                   sense, len(x_names))
        # tree = heap, used to save all still not opened nodes
        self.tree = Tree()
        self.UB = UB
        self.LB = LB
        self.use_SP = use_SP
        self.SP_len = 0
        # the B&B algorithm can be solved using SPs
        if use_SP:
            self.__create_SPs(1, rhs, rows, cols, vals, x_names)
            self.SP_len = len(self.tree.queue)
            print("|SPs| =", len(self.tree.queue))
        else:
            equation = Equations(cols, rows, vals, rhs, x_names, {}, {})
            self.__init_equation(equation, file_name="problem.lp")

    @staticmethod
    def __names_with_prefix(x_names, prefix):
        """Return the names from x_names that start with prefix."""
        return [name for name in x_names if name.startswith(prefix)]

    def __create_SPs(self, op, rhs, rows, cols, vals, x_names, needed_x=None):
        """
        Recursive function: for every SP take all operations and choose one
        mode for each of them. All SPs are different.

        op: int, the operation whose mode is selected at this level
        needed_x: list of variable names kept by the modes chosen so far
                  (None means "nothing chosen yet")
        return: None
        """
        if needed_x is None:
            needed_x = []

        # select a mode for the operation; modes are numbered from 1
        # The trailing comma makes the match exact, so mode 1 does not also
        # pick up the variables of mode 10, 11, ...
        mode = 1
        sub = self.__names_with_prefix(x_names, "X%d,%d," % (op, mode))
        while sub:
            self.__create_SPs(op + 1, rhs, rows, cols, vals, x_names,
                              needed_x + sub)
            mode += 1
            sub = self.__names_with_prefix(x_names, "X%d,%d," % (op, mode))

        # no variable belongs to this operation: a mode was selected for
        # every operation, so create the equations
        if not self.__names_with_prefix(x_names, "X%d," % op):
            selected = set(needed_x)
            equation = Equations(
                cols[:], rows[:], vals[:], rhs[:], x_names[:], {},
                {elem: 0 for elem in x_names if elem not in selected})
            self.__init_equation(equation)

    def __update_UB(self, equation):
        """
        Check the equation solution and, if it is better than or equal to
        the UB, update the UB and remember the equation as the best one.
        The check is done while holding the UB lock to avoid conflicts.

        equation: Equation, an equation with integer solution
        return: None
        """
        # "with" releases the lock even if reading the solution raises
        with self.UB_lock:
            solution = equation.solution
            # NOTE(review): a falsy solution (None, but also 0) is ignored
            if solution and solution <= self.UB:
                print("found UB that is eqauls to %10f" % solution)
                self.UB = solution
                self.best_equation = equation

    def __try_bound(self):
        """
        Take nodes from the queue and drop those that cannot improve the UB:
        a node is dropped when its solution is worse than the UB, or equal
        to the UB while an integer solution was already found. Stops when
        the queue is empty or when LB is no longer smaller than UB.

        return: Node whose solution may still improve the UB, or None
        """
        next_node = self.tree.get_queue_head()
        while next_node and self.LB < self.UB:
            solution = next_node.get_solution()
            worse_than_ub = solution > self.UB
            # equal to a UB that is already backed by an integer solution
            # (and not only by the predicted UB)
            equal_to_found_ub = solution == self.UB and self.best_equation
            if worse_than_ub or equal_to_found_ub:
                # take another node from the queue
                next_node = self.tree.get_queue_head()
            else:
                return next_node
        # the queue is empty or the bounds met
        return None

    def __init_equation(self, equation, depth=0, file_name=None):
        """
        Solve the equation and either update the UB (integer solution) or
        add the equation to the queue as a new node.

        equation: Equation, cplex equation
        depth: int, next node depth
        file_name: string, where to save the cplex solution, or None
        return: None
        """
        # solve LP using cplex
        solution = equation.solve_milp(file_name)
        # if the solution is integer, check the UB
        if solution and equation.integer_solution:
            self.__update_UB(equation)
        # if the solution is better than or equal to the UB, queue it
        elif solution and solution <= self.UB:
            self.tree.add_nodes(equation, depth)

    def create_node(self, node, col_dict):
        """
        Create a new equation from the father node, solve it and add it to
        the B&B queue.

        node: Node, father node
        col_dict: dict, parameter names and the values selected for them
        return: None
        """
        eq = node.equation
        equation = Equations(eq.cols[:], eq.rows[:], eq.vals[:], eq.rhs[:],
                             eq.cols_to_remove[:], eq.choices.copy(),
                             col_dict)
        self.__init_equation(equation, node.depth + 1)

    def set_x_to_one(self, node, x_one):
        """
        Set the chosen Xi,m,r,l to one and every not yet set Xi,m,r,l that
        is blocked by this choice to zero, then create a son node with these
        choices.

        node: Node, from that node we will create a new node
        x_one: string, the chosen Xi,m,r,l
        return: None
        """
        choices = {x_one: 1}
        i, m, r, l = x_one[1:].split(",")
        for x in node.equation.cols_to_remove:
            other_i, other_m, other_r, other_l = x[1:].split(",")
            same_op = i == other_i
            # same operation, mode and resource but another label
            if same_op and m == other_m and r == other_r and l != other_l:
                choices[x] = 0
            # another operation on the same resource and label
            elif not same_op and r == other_r and l == other_l:
                choices[x] = 0
            # same operation in another mode
            elif same_op and m != other_m:
                choices[x] = 0
        self.create_node(node, choices)

    def zero_one_initialize(self, node):
        """
        Branch on the first not yet set Xi,m,r,l: create one son where it
        equals zero and one son where it equals one.

        node: Node, the node to branch from
        return: None
        """
        first_x = node.equation.cols_to_remove[0]
        # son with Xi,m,r,l = 0
        self.create_node(node, {first_x: 0})
        # son with Xi,m,r,l = 1
        self.set_x_to_one(node, first_x)

    def choice_resource(self, node):
        """
        Branch by labels: for the first not yet set Xi,m,r,l create one son
        per variable that shares its i, m and r (i.e. per label l).

        node: Node, the node with all equations
        return: None
        """
        # skip the leading "X"; the label l is not needed for the match
        i, m, r, _ = node.equation.cols_to_remove[0][1:].split(",")
        zero_choices = {}
        for x in node.equation.cols_to_remove:
            other_i, other_m, other_r, _ = x[1:].split(",")
            if i == other_i and m == other_m and r == other_r:
                self.set_x_to_one(node, x)
                zero_choices[x] = 0
        # without SPs the mode m may be not selected at all, so also try
        # the son where every label is zero; with SPs the mode was already
        # selected and it can't be zero
        if not self.use_SP:
            self.create_node(node, zero_choices)

    def solve_algorithem(self, init_resource_labels=False,
                         disable_prints=True, cplex_auto_solution=False):
        """
        Run the branch and bound algorithm to find the best solution.

        Nodes are taken from the queue and branched until the queue is empty
        or the bounds meet; then the best equation is solved one more time
        to get all the chosen values for the Xi,m,r,l.

        init_resource_labels: branch with choice_resource instead of
                              zero_one_initialize
        disable_prints: forwarded to the best equation's cplex_solution
        cplex_auto_solution: when False the reported node count is the
                             number of nodes of the B&B tree
        return: tuple (choices, number of nodes, max queue size, number of
                SPs, best solution, Equations.MIP_infeasible), or
                (None, 0, 0, 0, 0, True) when no integer solution was found
        """
        if init_resource_labels:
            initialize_x_function = self.choice_resource
        else:
            initialize_x_function = self.zero_one_initialize

        next_node = self.tree.get_queue_head()
        # run while the node is not None, i.e. the algorithm has not ended
        while next_node:
            # TODO check if this condition is necessary
            if next_node.equation.cols_to_remove:
                initialize_x_function(next_node)
            # try to bound the tree and take the next node from the queue
            next_node = self.__try_bound()

        try:
            choices, nodes = self.best_equation.cplex_solution(disable_prints)
            if not cplex_auto_solution:
                nodes = self.tree.num_of_nodes
            return (choices, nodes, self.tree.max_queue_size, self.SP_len,
                    self.best_equation.solution, Equations.MIP_infeasible)
        except Exception:
            # best_equation is None when no integer solution was found;
            # Exception (not a bare except) keeps Ctrl+C working
            print("cann't find integer solution")
            return None, 0, 0, 0, 0, True