Example #1
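Both excerpts are methods of the same scraper class and rely on these module-level imports, which the originals do not show. Treating Pool as a thread pool is an assumption, inferred from the shared instance state mutated inside get_case (a process pool would not share it); LogLevel, CourtCase, and FaultEntity are project classes assumed to be defined elsewhere.

import base64
import re
import time
import xml.etree.ElementTree as ET
from multiprocessing.pool import ThreadPool as Pool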
    def get_case(self, thread_info):
        # Unpack the parameters
        queue_pos = thread_info["index"]
        row = thread_info["row"]
        start_date = thread_info["start_date"]
        end_date = thread_info["end_date"]
        page_number = thread_info["page_number"]

        case_id = CourtCase.extract_case_id(row)

        # Use queue_pos to space out the worker threads. Assuming each thread takes roughly
        # the same time, stagger the first round of workers with incremental sleep delays.
        # queue_pos is 0-based, so the first thread is not delayed
        if queue_pos < self.threads:
            time.sleep(queue_pos / 10.0)  # Space out threads by 100ms

        # Fetch the document text from the server and create a CourtCase on success,
        # or return the case ID so the caller can record the failure
        try:
            doc_filename = CourtCase.extract_filename(row)
            doc_text = CourtCase.get_document_text(doc_filename.replace(".doc", ".txt"), self.timeout)
            extended_info = self.get_verdict_extended_info(case_id, doc_filename)
        except Exception as e:
            self.log_message(LogLevel.ERROR, f"Error fetching verdict for case {case_id}: {e}")
            return case_id

        # Progress counter shared across worker threads; adequate for a rough
        # progress indicator, though += is not guaranteed to be atomic
        self.pool_progress += 1
        return CourtCase(row, extended_info, doc_text)
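
For illustration, here is a minimal, self-contained sketch of the staggered-start technique used in get_case. The pool size and job count are made up; the real code derives the spacing from queue_pos / 10.0 and self.threads.

import time
from multiprocessing.pool import ThreadPool

THREADS = 4  # stand-in for self.threads

def staggered_job(queue_pos):
    # Only the first round of workers sleeps; queue_pos is 0-based,
    # so the very first job starts immediately
    if queue_pos < THREADS:
        time.sleep(queue_pos / 10.0)  # 100 ms between thread start times
    return "job %d started at %.2f" % (queue_pos, time.monotonic())

with ThreadPool(THREADS) as pool:
    for line in pool.map(staggered_job, range(8)):
        print(line)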
Example #2
    def handle_result_page(self, soup, start_date, end_date, page_number,
                           specific_verdicts=None) -> "tuple[list, FaultEntity | None]":
        """Decodes the ASP.NET __VIEWSTATE hidden field, extracts the embedded XML
        search results, and builds a CourtCase instance for each verdict found"""
        return_list = []

        # Extract and decode the XML search results from the VIEWSTATE
        view_state = base64.b64decode(
            soup.find(id="__VIEWSTATE")["value"]
        ).decode("utf-8", "ignore")
        results_match = re.search(r"<Results>[\s\S]+?</Results>", view_state)
        if results_match is None:
            raise ValueError("No <Results> block found in the decoded VIEWSTATE")
        data_xml = results_match.group(0)

        # Parse the XML; each child Element (one per search result) is later
        # passed to CourtCase's constructor
        data_tree = ET.fromstring(data_xml)

        # If specific_verdicts was passed, filter data_tree only to the requested verdicts
        if specific_verdicts is not None:
            data_tree = [d for d in data_tree if CourtCase.extract_case_id(d) in specific_verdicts]

        # Number each result row; get_case uses the index to stagger the first
        # round of worker threads instead of firing all requests at once
        data_tree_numbered = []
        for i, row in enumerate(data_tree):
            data_tree_numbered.append({
                "index": i,
                "row": row,
                "start_date": start_date.strftime("%d/%m/%Y"),
                "end_date": end_date.strftime("%d/%m/%Y"),
                "page_number": page_number
            })

        # Initialize a thread pool and execute the jobs concurrently
        thread_pool = Pool(self.threads)
        tasks = [thread_pool.apply_async(self.get_case, (x,), callback=self.print_status_line)
                 for x in data_tree_numbered]
        tasks_results = [task.get() for task in tasks]
        thread_pool.close()
        thread_pool.join()

        failed_verdicts = [v for v in tasks_results if isinstance(v, str)]
        success_verdicts = [v for v in tasks_results if isinstance(v, CourtCase)]

        if failed_verdicts:
            return success_verdicts, FaultEntity((start_date, end_date), page_number, failed_verdicts)
        return success_verdicts, None
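
A self-contained sketch of the VIEWSTATE extraction pipeline from handle_result_page, runnable without the scraper class. The fake_viewstate payload is fabricated for illustration; a real page embeds the base64 blob in the __VIEWSTATE hidden input, with the XML surrounded by binary serializer data.

import base64
import re
import xml.etree.ElementTree as ET

# Fabricated payload standing in for a real __VIEWSTATE value
fake_viewstate = base64.b64encode(
    b"...serializer noise...<Results><Case id='123'/><Case id='456'/></Results>..."
).decode("ascii")

view_state = base64.b64decode(fake_viewstate).decode("utf-8", "ignore")
match = re.search(r"<Results>[\s\S]+?</Results>", view_state)
if match is None:
    raise ValueError("No <Results> block found in the decoded VIEWSTATE")

data_tree = ET.fromstring(match.group(0))
print([case.get("id") for case in data_tree])  # ['123', '456']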