Example 1

import re
from urllib.parse import urlencode

from bs4 import BeautifulSoup
from dateutil.parser import parse as date_parse
from selenium import webdriver
from selenium.common.exceptions import (NoSuchElementException,
                                        TimeoutException)
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Names referenced but not defined in this excerpt (_logger, crawler,
# COMPANY_DOCUMENTS_URL, COMPANIES_LISTING_URL, CrawlersRegistry,
# BOVESPA_CRAWLER, BovespaCompany, SITUATION_CANCELLED, SITUATION_GRANTED,
# extract_ENET_files_from_page) are assumed to come from the surrounding
# crawler package; a sketch of setup_cassandra_object_mapper() follows the
# crawl() function below.


def crawl(crawler_param, options):
    # We need to set up the Cassandra Object Mapper to work with
    # multiprocessing. If we don't, worker processes will block when they
    # interact with the object mapper module.
    setup_cassandra_object_mapper()

    _logger.debug("Calling crawl method: {0}".format(getattr(crawler,
                                                             "crawl")))

    return crawler.crawl(crawler_param, options)
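
# Each worker calls setup_cassandra_object_mapper() before touching any model
# because cqlengine sessions are not fork-safe. The helper's implementation is
# not part of this excerpt; the function below is only a minimal sketch of
# what such a helper typically does, assuming the cassandra-driver cqlengine
# backend. The host list and keyspace name are illustrative placeholders, not
# the project's real configuration.
from cassandra.cqlengine import connection


def setup_cassandra_object_mapper(hosts=("127.0.0.1",), keyspace="davinci"):
    # Register a fresh session in the current process so that cqlengine
    # models can be queried after a fork; hosts/keyspace are placeholders.
    connection.setup(list(hosts), keyspace, retry_connect=True)
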
def obtain_company_files(ccvm, phantomjs_path, doc_type, from_date=None):
    """
    This function is responsible for get the relation of files to be
    processed for the company and start its download

    This function is being throttle allowing 20 downloads per minute
    """

    # We need to set up the Cassandra Object Mapper to work with
    # multiprocessing. If we don't, worker processes will block when they
    # interact with the object mapper module.
    setup_cassandra_object_mapper()

    files = []
    driver = None

    try:
        driver = webdriver.PhantomJS(executable_path=phantomjs_path)

        encoded_args = urlencode({
            'CCVM': ccvm,
            'TipoDoc': 'C',
            'QtLinks': "1000"
        })
        url = COMPANY_DOCUMENTS_URL.format(encoded_args)

        # Navigate to the url and wait until the page has finished loading.
        # We detect that the page is loaded by looking for an element with
        # name = "AIR" in the page.
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.NAME, 'AIR')))
        except TimeoutException:
            WebDriverWait(driver, 10).until(EC.title_contains("CBLCNET -"))
            _logger.warning(
                "There is no documents page for company {ccvm} "
                "and {doc_type}. Showing 'Error de Aplicacao'".format(
                    ccvm=ccvm, doc_type=doc_type))
            return files

        # Once the page is ready, we can select the doc_type from the list
        # of available documentation and navigate to the results page:
        # find the link for the requested doc_type and click it.
        element = driver.find_element_by_link_text(doc_type)
        element.click()

        # Wait until the page is loaded
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(
                    (By.XPATH, "//form[@name='AIR']/table/*")))
        except TimeoutException:
            WebDriverWait(driver, 10).until(EC.title_contains("CBLCNET -"))
            _logger.warning(
                "There is no documents page for company {ccvm} "
                "and {doc_type}. Showing 'Error de Aplicacao'".format(
                    ccvm=ccvm, doc_type=doc_type))
            return files

        bs = BeautifulSoup(driver.page_source, "html.parser")

        # We extract the references to all the ENET files.
        # The ENET files are ZIP files that contain textual documents we
        # can parse and extract information from.
        files = extract_ENET_files_from_page(ccvm, driver, bs, doc_type,
                                             from_date)

        return files
    except NoSuchElementException:
        _logger.warning(
            "The company {ccvm} does not have {doc_type} documents".format(
                ccvm=ccvm, doc_type=doc_type))
        return []
    finally:
        _logger.debug("Finishing to crawl company "
                      "[{ccvm} - {doc_type}] files: [{num_files}]".format(
                          ccvm=ccvm, doc_type=doc_type, num_files=len(files)))
        if driver:
            _logger.debug("Closing the phantomjs driver for company "
                          "[{ccvm} - {doc_type}]".format(ccvm=ccvm,
                                                         doc_type=doc_type))
            driver.quit()
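
# The docstring of obtain_company_files() mentions a 20-downloads-per-minute
# throttle. The decorator that applies it is not shown in this excerpt; one
# common way to express such a limit, assuming the third-party "ratelimit"
# package, is the hypothetical wrapper below.
from ratelimit import limits, sleep_and_retry


@sleep_and_retry
@limits(calls=20, period=60)
def obtain_company_files_throttled(*args, **kwargs):
    # Sleep until a slot is free, then delegate to the real function, so
    # that at most 20 calls start in any 60-second window.
    return obtain_company_files(*args, **kwargs)
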
def update_listed_companies(letter, options):
    """
    :param letter:
    :param driver:
    :return:
    """

    # We need to set up the Cassandra Object Mapper to work with
    # multiprocessing. If we don't, worker processes will block when they
    # interact with the object mapper module.
    setup_cassandra_object_mapper()

    driver = None
    try:
        driver = CrawlersRegistry().get_crawler(
            BOVESPA_CRAWLER).get_web_driver(**options)

        _logger.debug("Crawling companies listing for letter: {}".
                      format(letter))

        companies = []

        url = COMPANIES_LISTING_URL.format(letter)
        _logger.debug("Crawling url: {}".format(url))

        # Navigate to the url and wait until the page is completely
        # loaded. We detect that the page is loaded by looking for the
        # presence of the table with id = "dlCiasCdCVM".
        driver.get(url)
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, 'dlCiasCdCVM')))
        except TimeoutException:
            WebDriverWait(driver, 10).until(
                EC.text_to_be_present_in_element(
                    (By.ID, 'lblMsg'),
                    "Nenhuma companhia foi encontrada com o critério de"
                    " busca especificado."))

            return companies

        bs = BeautifulSoup(driver.page_source, "html.parser")

        companies_table = bs.find("table", attrs={"id": "dlCiasCdCVM"})
        companies_rows = companies_table.findChildren(["tr"])

        # The first row is the header
        _logger.debug("Processing companies for letter [{0}]."
                      " Companies: {1}".
                      format(letter, len(companies_rows) - 1))

        for row in companies_rows[1:]:
            cells = row.findChildren('td')

            ccvm_code = cells[3].find("a").getText().strip()

            _logger.debug(
                "Check existence of Bovespa company: {}".format(ccvm_code))
            if not BovespaCompany.objects.filter(ccvm=ccvm_code).exists():
                _logger.debug(
                    "Bovespa company not found: {}".format(ccvm_code))

                values = {
                    "ccvm": ccvm_code,
                    "company_name": cells[1].find("a").getText().strip(),
                    "cnpj": cells[0].find("a").getText().strip(),
                    "company_type": cells[2].find("a").getText(),
                }

                situation_text = cells[4].find("a").getText().strip()
                situation_date = re.search(
                    r".*(\d{2}/\d{2}/\d{4})", situation_text)[1]
                situation_date = date_parse(situation_date)

                if "cancelado" in situation_text.lower():
                    values.update({
                        "situation": SITUATION_CANCELLED,
                        "canceled_date": situation_date
                    })
                elif "concedido" in situation_text.lower():
                    values.update({
                        "situation": SITUATION_GRANTED,
                        "granted_date": situation_date
                    })

                _logger.debug("Create bovespa company: {}".format(values))
                companies.append(BovespaCompany.create(**values))
            else:
                _logger.debug(
                    "Get bovespa company from DB: {}".format(ccvm_code))
                companies.append(BovespaCompany.objects.get(ccvm=ccvm_code))

        return [(company.ccvm, company.company_name,
                 company.cnpj, company.situation) for company in companies]
    except Exception:
        _logger.exception(
            "Unable to get, or crawl if it doesn't exist,"
            " the list of listed companies")
        raise
    finally:
        _logger.debug(
            "Finished crawling listed companies for letter {}".format(letter))
        if driver:
            _logger.debug(
                "Closing the Selenium Driver for letter {}".format(letter))
            driver.quit()
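
# Because every entry point sets up the Cassandra object mapper itself, these
# functions can be fanned out to worker processes. Below is a minimal usage
# sketch, assuming a standard multiprocessing pool; the empty options dict is
# a placeholder for the project's real crawler configuration.
from multiprocessing import Pool
from string import ascii_uppercase

if __name__ == "__main__":
    options = {}  # placeholder: real options come from the crawler config
    with Pool(processes=4) as pool:
        # One task per initial letter; each worker registers its own
        # cqlengine session via setup_cassandra_object_mapper() on entry.
        results = pool.starmap(
            update_listed_companies,
            [(letter, options) for letter in ascii_uppercase])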