def scroll_quinto_andar_page(x_request_id: str, div_number_row: int, driver) -> None:
    """
    Move the pointer onto a listing block so the page scrolls down to it.

    Parameters:
        x_request_id: Unique id, forwarded to the error handler.
        div_number_row: Row number of the block; the div looked up is the one
            at position ``div_number_row + 2`` inside the listing container.
        driver: Browser driver instance used to locate the element.
    Returns:
        None
    """
    try:
        target_block = driver.find_element_by_xpath(
            f"/html/body/div[1]/main/section[2]/div[2]/div/div[1]/div[{div_number_row+2}]"
        )
        sleep(number=3)
        if not target_block:
            return
        # Build and fire the pointer move in a single chained action.
        ActionChains(driver).move_to_element(target_block).perform()
    except (ElementClickInterceptedException, AttributeError) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def scraper_flow(x_request_id: str, driver: any):
    """
    Run one complete QuintoAndar scraping pass and log its start and end.

    Parameters:
        x_request_id: Unique id attached to every log and error report.
        driver: Browser driver instance handed on to the scraper logic.
    Returns:
        None. WebDriverException and ElementNotInteractableException are
        reported through error_handler instead of being raised.
    """
    try:
        timeout_start = time.time()
        send_log(
            x_request_id=x_request_id,
            message=f"Initiating the flow of scraper. Time: {timeout_start}",
        )
        # Starting grid position and scrape limit come from the shared
        # quinto_andar configuration mapping.
        initial_state = {
            "div_number_row": quinto_andar["div_number_row_initiator"],
            "div_number_column": quinto_andar["div_number_column_initiator"],
            "limit_scraper": quinto_andar["limit_scraper"],
        }
        recursive_scraper_logic(
            x_request_id=x_request_id,
            timeout_start=timeout_start,
            driver=driver,
            **initial_state,
        )
        sleep(10)
        send_log(x_request_id=x_request_id, message="Finished the flow of scraper.")
    except (WebDriverException, ElementNotInteractableException) as exception:
        error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred on scraper_flow",
            exception=exception,
        )
def resident_localization_data(x_request_id: str, driver) -> list:
    """
    Read the address line of a residence page and split it on commas.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        list: The pieces of the address text split on ",". None when the
        text is empty, or when the lookup fails and the exception
        (AttributeError / NoSuchElementException) is passed to error_handler.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for address of residence...",
    )
    sleep(number=7)
    try:
        address_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[2]/div[2]/p"
        )
        if not address_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about address...",
        )
        address_text = address_element.text
        if not address_text:
            return None
        return address_text.split(",")
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def residence_size(x_request_id: str, driver) -> int:
    """
    Extract the residence size as an integer from the residence page.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        int: The first run of digits in the element text, or 0 when
        verification_string_has_digit reports that the text has no digit.
        Nothing is returned explicitly when the lookup fails and the
        exception is passed to error_handler.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for the number of bedrooms...",
    )
    sleep(number=2)
    try:
        size_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[1]/div/div/span"
        )
        if not size_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about bedrooms...",
        )
        size_text = size_element.text
        # Ask the helper first; the digits are only parsed when it says yes.
        if not verification_string_has_digit(x_request_id=x_request_id, text=size_text):
            return 0
        return int(re.findall(r"\d+", size_text)[0])
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_residence_id(x_request_id: str, driver: any) -> int:
    """
    Extract the numeric residence id from the page's breadcrumb link.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        int: The first run of digits in the link text, or 0 when
        verification_string_has_digit reports that the text has no digit.
        Nothing is returned explicitly when the lookup fails and the
        exception is passed to error_handler.
    """
    send_log(x_request_id=x_request_id, message="Searching for the residence id...")
    sleep(number=2)
    try:
        id_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/nav/ol/li[5]/a"
        )
        if not id_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found id of residence...",
        )
        id_text = id_element.text
        # Ask the helper first; the digits are only parsed when it says yes.
        if not verification_string_has_digit(x_request_id=x_request_id, text=id_text):
            return 0
        return int(re.findall(r"\d+", id_text)[0])
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_furniture_flag(x_request_id: str, driver) -> bool:
    """
    Tell whether the residence page describes the residence as furnished.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        bool: False when the lower-cased element text contains "sem",
        True otherwise. Nothing is returned explicitly when the lookup
        fails and the exception is passed to error_handler.
    """
    send_log(x_request_id=x_request_id, message="Searching for a furniture flag...")
    sleep(number=2)
    try:
        furniture_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[7]/div/div/span"
        )
        if not furniture_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about furniture in the residence...",
        )
        furniture_text = furniture_element.text.lower()
        return "sem" not in furniture_text
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def number_of_rooms(x_request_id: str, driver) -> int:
    """
    Extract the number of rooms from the residence page.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        int: The first run of digits in the element text, or 0 when
        verification_string_has_digit reports that the text has no digit.
        Nothing is returned explicitly when the lookup fails and the
        exception is passed to error_handler.
    """
    send_log(x_request_id=x_request_id, message="Searching for number of rooms...")
    sleep(number=2)
    try:
        rooms_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[2]/div/div"
        )
        if not rooms_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about number of rooms...",
        )
        rooms_text = rooms_element.text
        # Ask the helper first; the digits are only parsed when it says yes.
        if not verification_string_has_digit(x_request_id=x_request_id, text=rooms_text):
            return 0
        return int(re.findall(r"\d+", rooms_text)[0])
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def pet_flag(x_request_id: str, driver) -> bool:
    """
    Tell whether the residence page indicates that pets are accepted.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        bool: False when the element text contains "Não" or "Nao",
        True otherwise. Nothing is returned explicitly when the lookup
        fails and the exception is passed to error_handler.
    """
    send_log(x_request_id=x_request_id, message="Searching for pet flag...")
    sleep(number=2)
    try:
        pet_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[6]/div/div/span"
        )
        if not pet_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about pet flag...",
        )
        pet_flag_text = pet_element.text
        send_log(
            x_request_id=x_request_id,
            message=f"Pet flag informatio is {pet_flag_text}",
        )
        # Pets are allowed only when neither spelling of "no" shows up.
        return "Não" not in pet_flag_text and "Nao" not in pet_flag_text
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_metro_flag(x_request_id: str, driver) -> bool:
    """
    Tell whether the residence page indicates a metro station nearby.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        bool: False when the element text contains "Não" or "Nao",
        True otherwise. Nothing is returned explicitly when the lookup
        fails and the exception is passed to error_handler.
    """
    send_log(x_request_id=x_request_id, message="Searching for subway flag...")
    sleep(number=2)
    try:
        metro_flag_data = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[8]/div/div/span"
        )
        if metro_flag_data:
            send_log(
                x_request_id=x_request_id,
                message="Found information about subway...",
            )
            metro_flag_text = metro_flag_data.text
            # Use a membership test rather than bool(str.find(...)): find()
            # returns -1 (truthy) on a miss and 0 (falsy) only for a match at
            # index 0, so a "Não" anywhere but the start was reported as True.
            # The unaccented spelling is checked too, matching pet_flag.
            return not ("Não" in metro_flag_text or "Nao" in metro_flag_text)
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_type_residence(x_request_id: str, driver) -> str:
    """
    Classify the residence as a house or an apartment from the page title.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        str: "house" when the lower-cased heading text contains "casa",
        "apartment" otherwise. Nothing is returned explicitly when the
        lookup fails and the exception is passed to error_handler.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for the type of residence...",
    )
    sleep(number=2)
    try:
        heading_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[2]/div[1]/h1"
        )
        if not heading_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found the type of residence...",
        )
        heading_text = heading_element.text.lower()
        return "house" if "casa" in heading_text else "apartment"
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_rent_values(x_request_id: str, driver) -> dict:
    """
    Collect the rent breakdown listed on the residence page.

    Parameters:
        x_request_id: Unique id attached to logs and error reports.
        driver: Browser driver instance showing the residence page.
    Returns:
        dict: Keys "rent_without_taxes", "condominium_tax", "house_tax",
        "fire_insurance", "service_tax" and "total_rent_value". When exactly
        six numbers are found in the list text, each key holds the matching
        number as a string, in page order. Otherwise every key keeps its
        placeholder value, the ``int`` type object. Nothing is returned
        explicitly when the lookup fails and the exception is passed to
        error_handler.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for a values of the rent...",
    )
    try:
        sleep(number=2)
        rent_list_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[2]/section/div/ul"
        )
        if not rent_list_element:
            return None
        send_log(x_request_id=x_request_id, message="Found the values...")
        field_names = (
            "rent_without_taxes",
            "condominium_tax",
            "house_tax",
            "fire_insurance",
            "service_tax",
            "total_rent_value",
        )
        # Every field starts out holding the ``int`` type as a placeholder.
        rent_values_dict = dict.fromkeys(field_names, int)
        raw_text = rent_list_element.text
        if raw_text:
            # "Incluso" entries are counted as a zero amount before parsing.
            amounts = re.findall(
                r"(?<![.,])\d+[,.]{0,1}\d*", raw_text.replace("Incluso", "0")
            )
            # Only fill the fields when all six amounts were found.
            if len(amounts) == 6:
                rent_values_dict.update(zip(field_names, amounts))
        return rent_values_dict
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def main(driver: any, queue: any) -> None:
    """
    Poll the queue forever and run the scraper flow for each message.

    Parameters:
        driver: Browser driver instance passed to the message handler.
        queue: Queue object passed to receive_messages.
    Returns:
        None. The polling loop only ends when an exception escapes it; an
        AttributeError is passed to error_handler before returning.
    """
    try:
        while True:
            messages = receive_messages(queue=queue, max_number=1, wait_time=0)
            if len(messages) != 0:
                for message in messages:
                    x_request_id = request_handler(message=message.body)
                    send_log(
                        x_request_id=x_request_id,
                        message="Receive message going to start scraper flow...",
                    )
                    consumer_message_handler(
                        message=message.body,
                        x_request_id=x_request_id,
                        driver=driver,
                    )
                    # Delete the message only after the handler has returned.
                    delete_message(x_request_id=x_request_id, message=message)
                continue

            # Empty poll: wait 30 minutes, then run the empty-queue routine.
            send_log(
                x_request_id="",
                message="QUEUE with 0 messages, going to send default event in 30 minutes...",
            )
            sleep(number=1800)
            dealing_with_empty_queue(queue=queue)
    except AttributeError as exception:
        error_handler(exception=exception)
def recursive_scraper_logic(
    x_request_id: str,
    div_number_row: int,
    div_number_column: int,
    limit_scraper: int,
    timeout_start,
    driver,
):
    """
    Scrape listing blocks one after another until the time budget runs out.

    Each pass waits, looks up the link of the block at the current
    row/column, and, when a link is found, opens it in a new tab, collects
    the residence data, stores it, and returns to the main window. The
    row/column position is then advanced through recursive_column_row_logic
    and the scrape counter is incremented.

    Parameters:
        x_request_id: Unique id attached to every log.
        div_number_row: Row number of the block to scrape first.
        div_number_column: Column number of the block to scrape first.
        limit_scraper: Scrape counter handed to recursive_column_row_logic;
            incremented by one after every pass.
        timeout_start: ``time.time()`` value taken when the scraper began.
        driver: Browser driver instance.
    Notes:
        The earlier version called itself at the end of every pass, and its
        15-minute guard appeared to be disabled, so the call stack grew with
        no base case until RecursionError. This version loops instead and
        enforces the guard.
    Returns:
        None, once 15 minutes have elapsed since ``timeout_start``.
    """
    import time  # local import so the block does not rely on file-level imports

    timeout = 900  # seconds: the scraper runs for 15 minutes
    while True:
        sleep(15)
        if time.time() >= timeout_start + timeout:
            return

        link = get_link_of_resident_block(
            x_request_id=x_request_id,
            div_number_row=div_number_row,
            div_number_column=div_number_column,
            driver=driver,
        )
        quinto_andar_data = QuintoAndarSchema()
        if link:
            # Remember the listing window so it can be restored afterwards.
            main_window = save_window_opener(x_request_id=x_request_id, driver=driver)
            open_new_tab(x_request_id=x_request_id, link=link)
            event_switch_right_window(x_request_id=x_request_id, driver=driver)
            event_switch_to_tab_window(main_window=main_window, driver=driver)
            sleep(8)
            send_log(
                x_request_id=x_request_id,
                message="Initiation of collection of data...",
            )
            resident_data = get_resident_block_data(
                x_request_id=x_request_id,
                quinto_andar_data=quinto_andar_data,
                driver=driver,
            )
            creation_residence_data(x_request_id=x_request_id, residence_data=resident_data)
            close_current_tab(driver=driver, main_window=main_window)
            send_log(x_request_id=x_request_id, message="Return to main screen...")
            sleep(1)
            driver.switch_to_window(main_window)

        # Move on to the next block position.
        div_number_row, div_number_column = recursive_column_row_logic(
            x_request_id=x_request_id,
            div_number_column=div_number_column,
            div_number_row=div_number_row,
            limit_scraper=limit_scraper,
            driver=driver,
        )
        send_log(
            x_request_id=x_request_id,
            message=f"Data of residence is: {quinto_andar_data}",
        )
        limit_scraper += 1
def test_sleep_function_should_return_none():
    """Check that calling the sleep helper with number=1 returns None."""
    outcome = sleep(number=1)
    assert outcome is None