def number_of_rooms(x_request_id: str, driver) -> int:
    """
    Function responsible for return number of rooms.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        int: first group of decimal digits found in the text of the
        rooms element, or 0 when the text has no usable number.
        Nothing is returned explicitly (None) when the lookup fails
        and error_handler gives control back.
    """
    send_log(x_request_id=x_request_id, message="Searching for number of rooms...")
    sleep(number=2)
    try:
        number_rooms_data = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[2]/div/div"
        )
        if number_rooms_data:
            send_log(
                x_request_id=x_request_id,
                message="Found information about number of rooms...",
            )
            number_rooms = number_rooms_data.text
            has_digit = verification_string_has_digit(
                x_request_id=x_request_id, text=number_rooms
            )
            # Guard the regex result as well: a character can satisfy a digit
            # check (str.isdigit accepts "²") without matching r"\d", and
            # indexing an empty findall result would raise IndexError.
            digit_groups = re.findall(r"\d+", number_rooms) if has_digit else []
            return int(digit_groups[0]) if digit_groups else 0
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def scraper_flow(x_request_id: str, driver: any):
    """
    Function responsible for deal with flow logic of QuintoAndar scraper.
    Parameters:
        x_request_id: Unique id, forwarded to logs and to the scraper logic.
        driver: Google Chrome instance
    Returns:
        void
    Notes:
        The start time is captured once and handed to
        recursive_scraper_logic together with the initial row, column
        and limit read from the quinto_andar settings.
    """
    try:
        started_at = time.time()
        send_log(
            x_request_id=x_request_id,
            message=f"Initiating the flow of scraper. Time: {started_at}",
        )

        # Starting position and limit come from the quinto_andar settings.
        first_row = quinto_andar["div_number_row_initiator"]
        first_column = quinto_andar["div_number_column_initiator"]
        scraper_limit = quinto_andar["limit_scraper"]

        recursive_scraper_logic(
            x_request_id=x_request_id,
            div_number_row=first_row,
            div_number_column=first_column,
            limit_scraper=scraper_limit,
            timeout_start=started_at,
            driver=driver,
        )

        sleep(10)
        send_log(x_request_id=x_request_id, message="Finished the flow of scraper.")
    except (WebDriverException, ElementNotInteractableException) as flow_error:
        error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred on scraper_flow",
            exception=flow_error,
        )
def get_type_residence(x_request_id: str, driver) -> str:
    """
    Function responsible for return type of residence.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        str: "house" when the page heading contains "casa"
        (case-insensitive), otherwise "apartment". Nothing is returned
        explicitly (None) when the heading lookup fails and
        error_handler gives control back.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for the type of residence...",
    )
    sleep(number=2)
    try:
        heading = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[2]/div[1]/h1"
        )
        if not heading:
            return None

        send_log(
            x_request_id=x_request_id,
            message="Found the type of residence...",
        )
        heading_text = heading.text.lower()
        return "house" if "casa" in heading_text else "apartment"
    except (AttributeError, NoSuchElementException) as lookup_error:
        error_handler(x_request_id=x_request_id, exception=lookup_error)
def executor(x_request_id: str, consumer: str, properties: any, driver: any) -> any:
    """
    Function responsible for execute a specific consumer that is declared
    in consumers object
    Parameters:
        x_request_id: str
        consumer: str, key looked up in consumers_object
        properties: any, forwarded untouched to the consumer
        driver: any, forwarded untouched to the consumer
    Returns:
        Whatever the selected consumer returns. When the key is not
        registered, the "default" consumer is executed instead.
    """
    if consumer not in consumers_object:
        send_log(
            x_request_id=x_request_id,
            message="Consumer object not found going to execute default...",
        )
        fallback = consumers_object.get("default")
        return fallback(x_request_id=x_request_id, properties=properties, driver=driver)

    send_log(
        x_request_id=x_request_id,
        message=f"Going to execute consumer {consumer}... With follow properties {properties}",
    )
    selected = consumers_object.get(consumer)
    return selected(x_request_id=x_request_id, properties=properties, driver=driver)
def get_link_of_resident_block(x_request_id, div_number_row: int,
                               div_number_column: int,
                               driver) -> classmethod:
    """
    Function responsible for get link of one of blocks in QuintoAndar homepage.
    Parameters:
        x_request_id: UniqueId
        div_number_row: Number of the block in row in the page
        div_number_column: Number of the block in column in page
        driver: Google Chrome instance
    Returns:
        The anchor element located for the given row and column, or
        None when the located element is falsy. Nothing is returned
        explicitly (None) when the lookup fails and error_handler gives
        control back.
    """
    send_log(
        x_request_id=x_request_id,
        message=(
            "Getting link of a respective residence base on row "
            f"{div_number_row} and column {div_number_column}..."
        ),
    )
    try:
        # Row and column select one block of the listing grid.
        block_xpath = (
            "/html/body/div[1]/main/section[2]/div[2]/div"
            f"/div[1]/div[{div_number_row}]/div[{div_number_column}]/div/a"
        )
        anchor = driver.find_element_by_xpath(block_xpath)
        if anchor:
            return anchor
        return None
    except (AttributeError, NoSuchElementException) as lookup_error:
        error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred get_link_of_resident_block",
            exception=lookup_error,
        )
def get_furniture_flag(x_request_id: str, driver) -> bool:
    """
    Function responsible for get flag that represent if the resident
    already have furniture.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        bool: False when the lower-cased element text contains "sem",
        True otherwise. Nothing is returned explicitly (None) when the
        lookup fails and error_handler gives control back.
    """
    send_log(x_request_id=x_request_id, message="Searching for a furniture flag...")
    sleep(number=2)
    try:
        furniture_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[7]/div/div/span"
        )
        if not furniture_element:
            return None

        send_log(
            x_request_id=x_request_id,
            message="Found information about furniture in the residence...",
        )
        furniture_label = furniture_element.text.lower()
        return "sem" not in furniture_label
    except (AttributeError, NoSuchElementException) as lookup_error:
        error_handler(x_request_id=x_request_id, exception=lookup_error)
def create_residence_features(
    x_request_id: str,
    residence_id: int,
    residence_feature_key: str,
    residence_feature_value: any,
) -> None:
    """
    Function responsible for create residence feature data.
    Parameters:
        x_request_id: str
        residence_id: int, stored under "ResidenceId"
        residence_feature_key: str, stored under "key"
        residence_feature_value: any, stored under "value" as str(...)
    Returns:
        None
    """
    tables = TableNameSchema()
    feature_row = dict(
        ResidenceId=residence_id,
        key=residence_feature_key,
        value=str(residence_feature_value),
    )
    residence_feature = create(
        x_request_id=x_request_id,
        data=feature_row,
        table_name=tables.residence_features,
    )
    send_log(
        x_request_id=x_request_id,
        message=f"Inserted in database the follow residence values {residence_feature}...",
    )
def resident_localization_data(x_request_id: str, driver) -> list:
    """
    Function responsible for get all information about localization of
    specific residence.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        list: the address text split on ",", or None when the element
        or its text is empty. Nothing is returned explicitly (None)
        when the lookup fails and error_handler gives control back.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for address of residence...",
    )
    sleep(number=7)
    try:
        address_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[2]/div[2]/p"
        )
        if not address_element:
            return None

        send_log(
            x_request_id=x_request_id,
            message="Found information about address...",
        )
        address_text = address_element.text
        if address_text:
            return address_text.split(",")
        return None
    except (AttributeError, NoSuchElementException) as lookup_error:
        error_handler(x_request_id=x_request_id, exception=lookup_error)
def get_metro_flag(x_request_id: str, driver) -> bool:
    """
    Function responsible for identify if has metro close to the residence.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        bool: False when the element text contains "Não" anywhere,
        True otherwise. Nothing is returned explicitly (None) when the
        lookup fails and error_handler gives control back.
    """
    send_log(x_request_id=x_request_id, message="Searching for subway flag...")
    sleep(number=2)
    try:
        metro_flag_data = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[8]/div/div/span"
        )
        if metro_flag_data:
            send_log(
                x_request_id=x_request_id,
                message="Found information about subway...",
            )
            metro_flag_text = metro_flag_data.text
            # str.find returns -1 when the word is absent and its index
            # otherwise, so the result must be compared with -1; using it
            # directly as a boolean is only False for a match at index 0.
            return metro_flag_text.find("Não") == -1
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_residence_id(x_request_id: str, driver: any) -> int:
    """
    Function responsible for return id of residence.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        int: first group of decimal digits found in the text of the
        located navigation link, or 0 when the text has no usable
        number. Nothing is returned explicitly (None) when the lookup
        fails and error_handler gives control back.
    """
    send_log(x_request_id=x_request_id, message="Searching for the residence id...")
    sleep(number=2)
    try:
        residence_id = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/nav/ol/li[5]/a")
        if residence_id:
            send_log(
                x_request_id=x_request_id,
                message="Found id of residence...",
            )
            residence_id_text = residence_id.text
            has_digit = verification_string_has_digit(
                x_request_id=x_request_id, text=residence_id_text
            )
            # Guard the regex result as well: a character can satisfy a digit
            # check (str.isdigit accepts "²") without matching r"\d", and
            # indexing an empty findall result would raise IndexError.
            digit_groups = re.findall(r"\d+", residence_id_text) if has_digit else []
            return int(digit_groups[0]) if digit_groups else 0
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def residence_size(x_request_id: str, driver) -> int:
    """
    Function responsible for return the size of the residence.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        int: first group of decimal digits found in the text of the
        size element, or 0 when the text has no usable number.
        Nothing is returned explicitly (None) when the lookup fails
        and error_handler gives control back.
    """
    # The log messages name the size; this function does not read bedrooms.
    send_log(
        x_request_id=x_request_id,
        message="Searching for the size of the residence...",
    )
    sleep(number=2)
    try:
        size_residence_data = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[1]/div/div/span"
        )
        if size_residence_data:
            send_log(
                x_request_id=x_request_id,
                message="Found information about the size of the residence...",
            )
            size_residence = size_residence_data.text
            has_digit = verification_string_has_digit(
                x_request_id=x_request_id, text=size_residence
            )
            # Guard the regex result as well: a character can satisfy a digit
            # check (str.isdigit accepts "²", common in area units) without
            # matching r"\d", and indexing an empty findall result would
            # raise IndexError.
            digit_groups = re.findall(r"\d+", size_residence) if has_digit else []
            return int(digit_groups[0]) if digit_groups else 0
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def creation_residence_data(
    x_request_id: str, residence_data: QuintoAndarSchema
) -> None:
    """
    Function responsible for create all data from residence.
    Parameters:
        x_request_id: str
        residence_data: QuintoAndarSchema
    Returns:
        None on success; on a handled exception, whatever
        error_handler returns.
    Notes:
        Creation order matters: the address id feeds the residence,
        and the residence id feeds both the values and the features.
        Function deal_with_feature is responsible for receive each
        different features and create respectively to each residence.
    """
    try:
        send_log(
            message=f"Going to create the follow data {residence_data}",
            x_request_id=x_request_id,
        )
        residence_address_id = create_residence_address(
            x_request_id=x_request_id, residence_data=residence_data
        )
        residence_id = create_residence(
            x_request_id=x_request_id,
            residence_address_id=residence_address_id,
            residence_data=residence_data,
        )
        create_residence_values(
            x_request_id=x_request_id,
            residence_id=residence_id,
            residence_data=residence_data,
        )
        features = {
            "petFlag": residence_data.pet_flag,
            "metroFlag": residence_data.metro_flag,
            "furnitureFlag": residence_data.furniture_flag,
        }
        deal_with_feature(
            features=features,
            x_request_id=x_request_id,
            residence_id=residence_id,
        )
    except (SyntaxError, AttributeError, AssertionError) as exception:
        # Forward the request id so the failure can be tied to its request.
        return error_handler(
            x_request_id=x_request_id,
            exception=exception,
            _msg="Exception occurred in create_residence_flow",
        )
def save_window_opener(x_request_id: str, driver: any) -> any:
    """
    Function responsible for save main window.
    Parameters:
        x_request_id: str
        driver: Google chrome instance
    Returns:
        The value of driver.current_window_handle, read after the log
        entry is written.
    """
    send_log(message="Saving main screen...", x_request_id=x_request_id)
    main_window_handle = driver.current_window_handle
    return main_window_handle
def event_switch_right_window(x_request_id: str, driver) -> None:
    """
    Function responsible for switch to the right window.
    Parameters:
        x_request_id: str
        driver: Google Chrome instance
    Returns:
        void
    Notes:
        Sends CONTROL + TAB to the page body element.
    """
    send_log(message="Changing to the next tab...", x_request_id=x_request_id)
    page_body = driver.find_element_by_tag_name("body")
    next_tab_shortcut = Keys.CONTROL + Keys.TAB
    page_body.send_keys(next_tab_shortcut)
def finish_session(x_request_id: str, driver) -> None:
    """
    Function responsible for finish a respective session.
    Parameters:
        x_request_id: Unique id.
        driver: Google Chrome instance.
    Returns:
        void
    Notes:
        Logs first, then calls driver.quit().
    """
    send_log(message="Finishing session of scraper...", x_request_id=x_request_id)
    driver.quit()
def get_rent_values(x_request_id: str, driver) -> dict:
    """
    Function responsible for get all values about the rent of the residence.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        dict: keys rent_without_taxes, condominium_tax, house_tax,
        fire_insurance, service_tax and total_rent_value. Values are
        the number strings found in the rent list, in page order, when
        exactly six are found; otherwise every value is "0".
        Nothing is returned explicitly (None) when the lookup fails
        and error_handler gives control back.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for a values of the rent...",
    )
    try:
        sleep(number=2)
        rent_values_data = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[2]/section/div/ul")
        if rent_values_data:
            send_log(x_request_id=x_request_id, message="Found the values...")
            rent_keys = (
                "rent_without_taxes",
                "condominium_tax",
                "house_tax",
                "fire_insurance",
                "service_tax",
                "total_rent_value",
            )
            # Default to "0" strings (same type as the parsed amounts and as
            # the "Incluso" replacement) so numeric conversion of an
            # unfilled field does not fail on a placeholder.
            rent_values_dict = dict.fromkeys(rent_keys, "0")
            rent_values = rent_values_data.text
            if rent_values:
                rent_values = rent_values.replace("Incluso", "0")
                amounts = re.findall(r"(?<![.,])\d+[,.]{0,1}\d*", rent_values)
                # Going to get values in case of find 6 numbers in array.
                # NOTE(review): amounts keep the page's separators (e.g.
                # "1.500"); confirm how consumers convert them to numbers.
                if len(amounts) == len(rent_keys):
                    rent_values_dict = dict(zip(rent_keys, amounts))
            return rent_values_dict
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def validate_message_data(x_request_id: str, message: any) -> dict:
    """
    Function responsible for validate message data.
    Parameters:
        x_request_id: id unique
        message: JSON text that is decoded with json.loads
    Returns:
        The decoded object when its "events" entry is truthy,
        otherwise None.
    """
    send_log(x_request_id=x_request_id, message="Validating message...")
    decoded = json.loads(message)
    if decoded.get("events"):
        return decoded
    return None
def verification_string_has_digit(x_request_id: str, text: str) -> bool:
    """
    Function responsible for verify if text has an digit or not.
    Parameters:
        x_request_id: unique id
        text: sentence that going to be verified
    Returns:
        bool: True when at least one character of text is a decimal
        digit; False when there is none or text is None.
    """
    send_log(x_request_id=x_request_id, message="Verification if text has digit...")
    if text is None:
        return False
    # str.isdecimal accepts the same characters as the regex class \d that
    # is used to extract the number afterwards; str.isdigit would also
    # accept characters such as "²", which \d cannot extract.
    return any(map(str.isdecimal, text))
def create_residence_values(
    x_request_id: str, residence_id: int, residence_data: QuintoAndarSchema
) -> None:
    """
    Function responsible for create residence values.
    Parameters:
        x_request_id: str
        residence_id: int, stored under "ResidenceId"
        residence_data: QuintoAndarSchema, source of the rent amounts
    Returns:
        None on success; on a handled exception, whatever
        error_handler returns.
    Notes:
        Every rent amount is converted with float() before insertion.
    """
    try:
        tables = TableNameSchema()

        # (column name, attribute of residence_data) in insertion order.
        column_sources = (
            ("price", "rent_price_without_tax"),
            ("condominiumTax", "condominium_tax"),
            ("houseTax", "house_tax"),
            ("fireInsurence", "fire_insurance"),
            ("serviceTax", "service_tax"),
            ("totalRentPrice", "total_rent_price"),
        )
        row = {"ResidenceId": residence_id}
        for column, attribute in column_sources:
            row[column] = float(getattr(residence_data, attribute))

        residence_values = create(
            x_request_id=x_request_id,
            data=row,
            table_name=tables.residence_values,
        )
        send_log(
            x_request_id=x_request_id,
            message=f"Inserted in database the follow residence values {residence_values}...",
        )
    except (
        TimeoutError,
        SyntaxError,
        IndexError,
        AttributeError,
    ) as creation_error:
        return error_handler(
            x_request_id=x_request_id,
            exception=creation_error,
            _msg="Exception occurred in create_residence_value",
        )
def open_new_tab(x_request_id: str, link) -> None:
    """
    Function responsible for open new tab base on link
    Parameters:
        x_request_id: str
        link: element that receives the CONTROL + RETURN key combination
    Returns:
        void
    """
    send_log(
        message="Opening new tab for the link the was get...",
        x_request_id=x_request_id,
    )
    new_tab_shortcut = Keys.CONTROL + Keys.RETURN
    link.send_keys(new_tab_shortcut)
def send_message(
    x_request_id: str,
    queue,
    message_body,
    message_attributes=None,
    thread_number: int = 0,
) -> None:
    """
    Send a message to an Amazon SQS queue.
    Parameters:
        x_request_id: unique id
        queue: The queue to receive the messages.
        message_body: The messages to send to the queue. These are
                      simplified to contain only the message body
                      and attributes.
        message_attributes: any; an empty dict is used when falsy
        thread_number: int represent the number of thread of queue.
                       these is important to make QUEUE work in thread
    Returns:
        None. The SQS response is not returned; failures are reported
        through error_handler.
    """
    if not message_attributes:
        message_attributes = {}
    try:
        queue.send_message(
            MessageBody=message_body,
            MessageAttributes=message_attributes,
            MessageDeduplicationId=f"wmh_scraper_{random_number(10000)}",
            MessageGroupId=f"wmh_scraper_{thread_number}",
        )
        # The body is decoded only to make the log readable. The message
        # has already been sent at this point, so a body that is not valid
        # JSON must not raise: fall back to logging it as received.
        try:
            logged_body = json.loads(message_body)
        except ValueError:
            logged_body = message_body
        send_log(
            message=f"Sending the follow msg to SQS QUEUE {logged_body}",
            x_request_id=x_request_id,
        )
    except (ClientError, TypeError) as exception:
        error_handler(
            x_request_id=x_request_id,
            _msg=f"Send message failed: {message_body}",
            exception=exception,
        )
def open_page(x_request_id: str, driver, link: str) -> None:
    """Function responsible for open a web page, base on specific Link.
    Parameters:
        x_request_id: Unique id.
        driver: Google Chrome instance.
        link: URL of WebPage. Nothing happens when it is falsy.
    Returns:
        void
    """
    if not link:
        return

    driver.get(link)
    send_log(
        message=f"Page was open with follow link {link}...",
        x_request_id=x_request_id,
    )
def delete_message(x_request_id: str, message) -> None:
    """
    Delete an message from a queue.
    Parameters:
        x_request_id: Unique id str
        message: The message to delete; its delete() method is called.
    Returns:
        None
    Notes:
        ClientError and AttributeError raised while deleting or logging
        are passed to error_handler.
    """
    try:
        message.delete()
        send_log(
            message="Message have been deleted with success.",
            x_request_id=x_request_id,
        )
    except (ClientError, AttributeError) as delete_error:
        error_handler(x_request_id=x_request_id, exception=delete_error)
def pet_flag(x_request_id: str, driver) -> bool:
    """
    Function responsible for flag if the residence can have pet or not.
    Parameters:
        x_request_id: unique id
        driver: google chrome instance
    Returns:
        bool: False when the element text contains "Não" or "Nao",
        True otherwise. Nothing is returned explicitly (None) when the
        lookup fails and error_handler gives control back.
    """
    send_log(x_request_id=x_request_id, message="Searching for pet flag...")
    sleep(number=2)
    try:
        pet_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[6]/div/div/span"
        )
        if not pet_element:
            return None

        send_log(
            x_request_id=x_request_id,
            message="Found information about pet flag...",
        )
        pet_flag_text = pet_element.text
        send_log(
            x_request_id=x_request_id,
            message=f"Pet flag informatio is {pet_flag_text}",
        )
        # Pets are allowed only when neither spelling of the negation appears.
        return "Não" not in pet_flag_text and "Nao" not in pet_flag_text
    except (AttributeError, NoSuchElementException) as lookup_error:
        error_handler(x_request_id=x_request_id, exception=lookup_error)
def api_integration(x_request_id: str, url: str, token: str, body: dict) -> json:
    """
    Function responsible for send request based on path and data body.
    Parameters:
        x_request_id: str
        url: str, target of the POST request
        token: str, sent as the Authorization header
        body: dict, serialized with json.dumps as the request data
    Returns:
        The object returned by requests.post; on a timeout, whatever
        error_handler returns.
    """
    headers = {"Content-Type": "application/json", "Authorization": token}
    try:
        send_log(
            x_request_id=x_request_id,
            message=f"Sending request to follow path: {url} with follow data: {body}",
        )
        response = requests.post(
            url=url,
            data=json.dumps(body),
            headers=headers,
            timeout=25,
        )
        send_log(
            x_request_id=x_request_id,
            message=f"Request finish with status: {response.status_code}",
        )
        return response
    except (
        requests.exceptions.Timeout,
        requests.exceptions.ReadTimeout,
    ) as timeout_error:
        return error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred in api service.",
            exception=timeout_error,
        )
def main(driver: any, queue: any) -> None:
    """
    Consumer responsible for receive messages from SQS Queue
    Parameters:
        driver: any, forwarded to consumer_message_handler
        queue: any, polled with receive_messages
    Returns:
        None
    Notes:
        Polls in an endless loop. An AttributeError raised inside the
        loop is passed to error_handler and ends the loop.
    """
    try:
        while True:
            messages = receive_messages(queue=queue, max_number=1, wait_time=0)

            if len(messages) == 0:
                # Nothing to consume: wait, then handle the empty queue.
                send_log(
                    x_request_id="",
                    message="QUEUE with 0 messages, going to send default event in 30 minutes...",
                )
                sleep(number=1800)
                dealing_with_empty_queue(queue=queue)
                continue

            for message in messages:
                raw_body = message.body
                x_request_id = request_handler(message=raw_body)
                send_log(
                    x_request_id=x_request_id,
                    message="Receive message going to start scraper flow...",
                )
                consumer_message_handler(
                    message=message.body,
                    x_request_id=x_request_id,
                    driver=driver,
                )
                delete_message(x_request_id=x_request_id, message=message)
    except AttributeError as loop_error:
        error_handler(exception=loop_error)
def recursive_scraper_logic(
    x_request_id: str,
    div_number_row: int,
    div_number_column: int,
    limit_scraper: int,
    timeout_start,
    driver,
):
    """
    Function responsible for deal with recursive scraper logic.
    Parameters:
        x_request_id: Unique id.
        div_number_row: Number of the block in row in the page
        div_number_column: Number of the block in column in page
        limit_scraper: Number responsible for define the limit of scraper to the page
        timeout_start: The time that scraper begin (compared against
            time.time(), so it is expected to come from time.time())
        driver: Google Chrome instance
    Notes:
        The function define the number of scraper that happens in
        the page until event of scroll happens. And the logic start again.
        Each call handles at most one residence block and then calls
        itself with the next row/column, until 900 seconds have passed
        since timeout_start.
        NOTE(review): the statement nesting below (what belongs to
        "if link:") should be confirmed against version control.
    Returns:
        void
    """
    # Time budget, in seconds, counted from timeout_start.
    timeout = 900
    # Pause before every step, including the last call that only checks the clock.
    sleep(15)
    # Scraper will happen for 15 minutes
    #
    if time.time() < timeout_start + timeout:
        link = get_link_of_resident_block(
            x_request_id=x_request_id,
            div_number_row=div_number_row,
            div_number_column=div_number_column,
            driver=driver,
        )
        # Fresh schema object per block; it is handed to
        # get_resident_block_data and logged further down.
        quinto_andar_data = QuintoAndarSchema()
        if link:
            # Remember the listing window so it can be restored afterwards.
            main_window = save_window_opener(x_request_id=x_request_id, driver=driver)
            open_new_tab(x_request_id=x_request_id, link=link)
            event_switch_right_window(x_request_id=x_request_id, driver=driver)
            event_switch_to_tab_window(main_window=main_window, driver=driver)
            sleep(8)
            send_log(
                x_request_id=x_request_id,
                message="Initiation of collection of data...",
            )
            resident_data = get_resident_block_data(
                x_request_id=x_request_id,
                quinto_andar_data=quinto_andar_data,
                driver=driver,
            )
            creation_residence_data(x_request_id=x_request_id, residence_data=resident_data)
            close_current_tab(driver=driver, main_window=main_window)
            send_log(x_request_id=x_request_id, message="Return to main screen...")
            sleep(1)
            driver.switch_to_window(main_window)
        # Position of the next block to visit.
        div_number_row, div_number_column = recursive_column_row_logic(
            x_request_id=x_request_id,
            div_number_column=div_number_column,
            div_number_row=div_number_row,
            limit_scraper=limit_scraper,
            driver=driver,
        )
        send_log(
            x_request_id=x_request_id,
            message=f"Data of residence is: {quinto_andar_data}",
        )

        # The limit grows by one per step before the next recursive call.
        limit_scraper += 1
        recursive_scraper_logic(
            x_request_id=x_request_id,
            div_number_row=div_number_row,
            div_number_column=div_number_column,
            limit_scraper=limit_scraper,
            timeout_start=timeout_start,
            driver=driver,
        )