def start_driver():
    """Launch a Firefox instance for Trulia and close the extra extension tab.

    Returns:
        The ready WebDriver on success.  On failure to juggle windows,
        quits the browser and hands control to restart() (no return).

    Relies on module-level globals: trulia, geckodriver_path,
    adblock_path, uBlock_path.
    """
    print("Starting Driver")
    driver = start_firefox(trulia, geckodriver_path, adblock_path, uBlock_path)
    sleep(5)  # give the browser and extensions time to open their tabs
    try:
        # Extension installation opens a second tab; close it and put
        # focus back on the main window before returning the driver.
        driver.switch_to_window(driver.window_handles[1])
        driver.close()
        driver.switch_to_window(driver.window_handles[0])
        return driver
    except Exception:  # was a bare except: don't trap KeyboardInterrupt/SystemExit
        print("Switching window failed??")
        driver.quit()
        # NOTE(review): `debug` and `start` are not defined in this scope --
        # presumably module globals set by the caller; confirm, otherwise
        # this line raises NameError instead of restarting.
        restart("logfile", debug, start)
def open_page(url):
    """Load *url* in the module-global `driver` and classify the result.

    Args:
        url (str): Trulia page URL to open.

    Returns:
        int: 0 when a Trulia page loaded, 1 on a Trulia 404.  When the
        title suggests we are being bot-blocked, the browser is quit and
        restart() is invoked.
    """
    driver.delete_all_cookies()
    driver.get(url)
    print(driver.title)
    sleep(3)  # let dynamic content settle before inspecting the title
    if "Real Estate, " in driver.title or "Not Found" in driver.title:
        print("404 in trulia")
        return 1
    elif "Trulia" in driver.title:
        print("Successfully loaded URL")
        return 0
    else:
        # Any other title is treated as a bot-block interstitial.
        print("Being blocked from accessing Trulia. Restarting...")
        driver.quit()
        # NOTE(review): `debug` and `start` are not defined in this scope --
        # presumably module globals; confirm.
        restart("logfile", debug, start)
        return 1
def update_ejscreen(idx, debug_mode):
    """Crawl EPA EJSCREEN pollution data for the address at row *idx*.

    Opens the EJSCREEN mobile mapper in a new tab of the module-global
    `driver`, extracts the pollution report into a dict, and writes it
    out via write_ejscreen_to_file().

    Args:
        idx (int): Row index whose address (via get_address) is queried.
        debug_mode (bool): When True, kill stray browser processes and
            re-raise on failure; otherwise restart the crawl at *idx*.
    """
    print("Crawling ejscreen")
    d = {}
    driver.execute_script("window.open('https://ejscreen.epa.gov/mapper/mobile/', 'new_tab')")
    sleep(5)  # wait for the new tab to finish loading
    driver.switch_to_window(driver.window_handles[1])
    address = get_address(idx)
    try:
        handle_ejscreen_input(driver, address)
        sleep(3)
        extract_pollution_from_report(driver, d)
    except Exception:  # was a bare except: don't trap KeyboardInterrupt/SystemExit
        if debug_mode:
            driver.quit()
            # Make sure no orphaned browser processes survive the crash.
            for proc in psutil.process_iter():
                # Single name() call per process instead of two.
                if proc.name() in ("firefox", "geckodriver"):
                    proc.kill()
            raise
        else:
            print("Cannot extract pollution. Restarting")
            driver.quit()
            restart("logfile", debug_mode, idx)
    write_ejscreen_to_file(idx, d)
def main(input_file, output_file, start, end, crawler_log, geckodriver_path, debug_mode, adblock_path, uBlock_path):
    """Query Trulia rental-listing URLs for each address in *input_file*.

    For rows [start, end) of the "full" address column, looks up the
    "rent" URL via query() and appends [url1, url2, url3] rows to
    *output_file*, logging each completed row index to *crawler_log*.
    Every 60 rows /tmp is reaped and the crawler is restarted.

    NOTE(review): a second `main` defined later in this module shadows
    this one -- confirm which entry point is intended.
    """
    urls = []
    df = pd.read_csv(input_file)
    driver = start_firefox(trulia, geckodriver_path, adblock_path, uBlock_path)
    sleep(5)
    try:
        # Close the extension-installation tab and refocus the main window.
        driver.switch_to_window(driver.window_handles[1])
        driver.close()
        driver.switch_to_window(driver.window_handles[0])
    except Exception:  # was a bare except
        print("switching window failed??")
        driver.quit()
        restart(crawler_log, debug_mode, start)
    i = int(start)
    count = 0
    for address in df["full"][int(start):int(end)]:
        if count == 60:
            # Periodic hygiene: clear stale /tmp files, then restart.
            os.system("sudo tmpreaper -m 1h /tmp")
            # BUG FIX: restart() was called with no arguments although every
            # other call site passes (log, debug_mode, index); resume at the
            # current row instead of raising TypeError.
            restart(crawler_log, debug_mode, i)
        try:
            print(i, address)
            # url1 = query(driver, "buy", address)
            url1 = ""
            url2 = query(driver, "rent", address)
            url3 = ""
            # url3 = query(driver, "sold", address)
            urls.append([url1, url2, url3])
            # NOTE(review): "ab" (binary append) with csv.writer is a
            # Python 2 idiom; under Python 3 it raises TypeError -- confirm
            # the target interpreter before switching to
            # open(..., "a", newline="").
            with open(output_file, "ab") as log:
                filewriter = csv.writer(log, delimiter=',', quoting=csv.QUOTE_MINIMAL)
                filewriter.writerow([url1, url2, url3])
            with open(crawler_log, "ab") as log:
                filewriter = csv.writer(log, delimiter=',', quoting=csv.QUOTE_MINIMAL)
                filewriter.writerow([i])
            i += 1
            count += 1
        except Exception:  # was a bare except
            if debug_mode:
                driver.quit()
                # Kill orphaned browser processes before surfacing the error.
                for proc in psutil.process_iter():
                    if proc.name() in ("firefox", "geckodriver"):
                        proc.kill()
                raise
            else:
                driver.quit()
                restart(crawler_log, debug_mode, start)
    driver.quit()
def main(crawl_type, input_file, output_file, start, end, crawler_log, geckodriver_path, repair, debug_mode, adblock_path, uBlock_path):
    """Main function to do the crawling

    Crawls Trulia rental pages for rows [start, end) of *input_file*, then
    looks up EPA EJSCREEN pollution data for each listing's address, saving
    results via save_rental() and logging finished indices to *crawler_log*.

    NOTE(review): this definition shadows the earlier `main` in this module
    -- confirm which entry point is intended.

    Args:
        crawl_type (List of String): default ["U"]. Can add ["A", "L"]
        input_file (String): Name of the input file
        output_file (String): Name of the output file
        start (int): Starting index of the crawling
        end (int): Ending index of the crawling
        crawler_log (String): Name of the log
        geckodriver_path (String): Path to the geckodriver
        repair (Bool): Whether this crawling is repair mode or not
        debug_mode (Bool): Whether this crawling is debug mode or not
        adblock_path (String): Path to the adblock
        uBlock_path (String): Path to the uBlock
    """
    driver = start_firefox(trulia, geckodriver_path, adblock_path, uBlock_path)
    sleep(5)  # give the browser and extensions time to open their tabs
    try:
        # Extension installation opens a second tab; close it and refocus
        # the main window.
        driver.switch_to_window(driver.window_handles[1])
        driver.close()
        driver.switch_to_window(driver.window_handles[0])
    except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit
        print("switching window failed??")
        driver.quit()
        restart(crawler_log, debug_mode, start)
    df = pd.read_csv(input_file)
    urls = df["URL"]
    # Optional columns, only present for the corresponding crawl types.
    if "L" in crawl_type:
        location = df["LatLon"]
    if "A" in crawl_type:
        address_col = df["Address"]
    if repair:
        # Repair mode edits the input frame in place; force string dtype so
        # mixed/NaN columns round-trip through to_csv unchanged.
        df['Sqft'] = df['Sqft'].astype(str)
        df['Type'] = df['Type'].astype(str)
        df["Address"] = df["Address"].astype(str)
        df["City"] = df["City"].astype(str)
        df["State"] = df["State"].astype(str)
        df["Zip_Code"] = df["Zip_Code"].astype(str)
        df["Year"] = df["Year"].astype(str)
        df["Days_on_Trulia"] = df["Days_on_Trulia"].astype(str)
        df["Bedroom_min"] = df['Bedroom_min'].astype(str)
        df["Bedroom_max"] = df['Bedroom_max'].astype(str)
        df["Bathroom_min"] = df['Bathroom_min'].astype(str)
        df["Bathroom_max"] = df['Bathroom_max'].astype(str)
        df["Phone_Number"] = df['Phone_Number'].astype(str)
        df["URL"] = df["URL"].astype(str)
    try:
        for i in range(int(start), int(end)):
            print(i)
            print(urls[i])
            driver.delete_all_cookies()
            d = {}  # per-listing scratch dict filled by the extractors
            crawled_trulia = True
            driver.get(urls[i])
            print(driver.title)
            sleep(3)  # let dynamic content settle before inspecting the title
            if "Real Estate, " in driver.title or "Not Found" in driver.title:
                print("404 in trulia")
                crawled_trulia = False
            elif "Trulia" in driver.title:
                print("Start crawling")
                try:
                    # Extractor variant depends on mode: repair ("R"),
                    # address-driven ("A"), or plain URL crawl ("U").
                    if repair:
                        flag = extract_rental(driver, d, "R", address_col[i], df, i)
                    elif "A" in crawl_type:
                        flag = extract_rental(driver, d, "A", address_col[i], index=i)
                    else:
                        flag = extract_rental(driver, d, "U")
                except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit
                    if debug_mode:
                        driver.quit()
                        # Kill orphaned browser processes before re-raising.
                        for proc in psutil.process_iter():
                            if proc.name() == "firefox" or proc.name() == "geckodriver":
                                proc.kill()
                        raise
                    else:
                        driver.quit()
                        print("Reached EXCEPT after extract_rental")
                        restart(crawler_log, debug_mode, start)
                # NOTE(review): if extract_rental raised and restart()
                # returned, `flag` is unbound here -- verify restart()
                # never returns.
                if flag == False:
                    crawled_trulia = False
            elif "this page" in driver.title.lower():
                print("Being blocked from accessing Trulia. Restarting...")
                driver.quit()
                restart(crawler_log, debug_mode, start)
            else:
                crawled_trulia = False
                # Fall back to the page title for the address.
                address = driver.title.split(" - ")[0]
                print("Trulia is not available. Continuing")
            print("Trulia crawling done. Crawling ejscreen now")
            if repair:
                # Repair mode only rewrites the input file; skip ejscreen.
                df.to_csv(input_file, index=False)
                # NOTE(review): "ab" with csv.writer is a Python 2 idiom;
                # raises TypeError on Python 3 -- confirm interpreter.
                with open(crawler_log, "ab") as log:
                    filewriter = csv.writer(log, delimiter=',', quoting=csv.QUOTE_MINIMAL)
                    filewriter.writerow([i])
                print("Repair done. going Next...")
                sleep(random.randint(10, 40))  # jitter to look less bot-like
                continue
            # Pick the address string to feed EJSCREEN, by crawl type.
            if "L" in crawl_type:
                address = location[i]
            elif "A" in crawl_type:
                address = address_col[i]
            elif "A" not in crawl_type:
                if crawled_trulia == False:
                    # Derive address from the title; strip unit numbers
                    # ("#...") or trailing "For ..." marketing text.
                    address = driver.title.split(" - ")[0]
                    if address.find("#") != -1:
                        address = address[:address.find("#")]
                    else:
                        address = address[:address.find("For")]
                else:
                    # Build "street, city, state zip" from extracted fields.
                    if d["address"].find('#') != -1:
                        add = d["address"][:d["address"].find('#')]
                    else:
                        add = d["address"]
                    address = add + ", " + d["city"] + ", " + d[
                        "state"] + " " + d["zip code"]
            if crawled_trulia == False and "Real Estate, " in driver.title:
                address = "NA"
            driver.execute_script(
                "window.open('https://ejscreen.epa.gov/mapper/mobile/', 'new_tab')"
            )
            sleep(5)  # wait for the EJSCREEN tab to finish loading
            driver.switch_to_window(driver.window_handles[1])
            # if (len(address) < 10):
            # save_data(d, urls[i], output_file, crawl_type)
            # with open(crawler_log, "ab") as log:
            # filewriter = csv.writer(log, delimiter = ',', quoting = csv.QUOTE_MINIMAL)
            # filewriter.writerow([i])
            # continue
            try:
                handle_ejscreen_input(driver, address)
                sleep(5)
                extract_pollution_from_report(driver, d)
                #print("Skipping ejscreen")
            except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit
                if debug_mode:
                    driver.quit()
                    for proc in psutil.process_iter():
                        if proc.name() == "firefox" or proc.name(
                        ) == "geckodriver":
                            proc.kill()
                    raise
                else:
                    print("cannot extract pollution. Restarting")
                    driver.quit()
                    restart(crawler_log, debug_mode, start)
            save_rental(d, urls[i], output_file)
            with open(crawler_log, "ab") as log:
                filewriter = csv.writer(log, delimiter=',', quoting=csv.QUOTE_MINIMAL)
                filewriter.writerow([i])
            # Close the EJSCREEN tab and return to the Trulia window.
            driver.close()
            driver.switch_to_window(driver.window_handles[0])
            sleep(random.randint(10, 40))  # jitter to look less bot-like
    except:  # NOTE(review): bare except also traps KeyboardInterrupt/SystemExit
        if debug_mode:
            driver.quit()
            for proc in psutil.process_iter():
                if proc.name() == "firefox" or proc.name() == "geckodriver":
                    proc.kill()
            raise
        else:
            driver.quit()
            restart(crawler_log, debug_mode, start)
    driver.quit()