class GmapParse:
    def __init__(self, debug: bool = False):
        self.scraper = GoogleMapsScraper(debug=debug)

    def parseStore(self, company, n_reviews: int = 10):
        if company in STORES:
            print(f"[*] Scraping '{company}'...")
            output_folder = Path(sc.check_folder("output"))
            for idx, store_url in enumerate(STORES[company]):
                print(f"[*] Scraping now from '{store_url}'!")
                print(f"{idx + 1} out of {len(STORES[company])} links...")
                error = self.scraper.sort_by_date(store_url)
                if error == 0:
                    n = 0
                    file = output_folder / f"gbusiness_{company}_{datetime.utcnow().strftime('%Y-%m-%d-T%H-%M-%SZ')}.jl"
                    while n < n_reviews:
                        reviews = self.scraper.enrich_reviews(
                            self.scraper.get_reviews(n), store_url, company)
                        with file.open("a", encoding="utf-8") as js:
                            for r in reviews:
                                js.write(json.dumps(r) + "\n")
                        if len(reviews) == 0:
                            n += 100
                        else:
                            n += len(reviews)
                else:
                    print(f"[*] Could not scrape link '{store_url}'. "
                          "Link will be stored @error.log for further retries.")
                    with Path("error.log").open("a", encoding="utf-8") as js:
                        js.write(str(store_url) + "\n")
        else:
            raise Exception(f"Company '{company}' not found!")
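# Minimal usage sketch of the class above. "some_company" is a placeholder and
# is assumed to be a key present in the STORES mapping used by parseStore; all
# other names come from the class itself.
if __name__ == "__main__":
    gmap = GmapParse(debug=True)
    gmap.parseStore("some_company", n_reviews=50)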
def scrape_gm_reviews(self):
    # set connection to DB
    collection = self.client[DB_NAME][COLLECTION_NAME]

    # init scraper and incrementally add new reviews
    # TODO: pass logger as parameter to log into one single file?
    with GoogleMapsScraper() as scraper:
        for url in self.urls:
            try:
                error = scraper.sort_by_date(url)
                if error == 0:
                    stop = False
                    offset = 0
                    n_new_reviews = 0
                    while not stop:
                        rlist = scraper.get_reviews(offset)
                        if len(rlist) == 0:
                            # no more reviews returned: avoid looping forever
                            break
                        for r in rlist:
                            # calculate review date and compare to input min_date_review
                            r['timestamp'] = self.__parse_relative_date(r['relative_date'])
                            stop = self.__stop(r, collection)
                            if not stop:
                                collection.insert_one(r)
                                n_new_reviews += 1
                            else:
                                break
                        offset += len(rlist)

                    # log total number of new reviews
                    self.logger.info('{} : {} new reviews'.format(url, n_new_reviews))
                else:
                    self.logger.warning('Sorting reviews failed for {}'.format(url))
            except Exception as e:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                self.logger.error('{}: {}, {}, {}'.format(url, exc_type, fname, exc_tb.tb_lineno))
def submit(request):
    global file
    MyLoginForm = LoginForm(request.POST)
    if request.method == "POST":
        if MyLoginForm.is_valid():
            number = MyLoginForm.cleaned_data['number']
            url = MyLoginForm.cleaned_data['url']
            file = MyLoginForm.cleaned_data['file']

            with GoogleMapsScraper() as scraper:
                error = scraper.sort_by_date(url)
                if error == 0:  # sorting by date succeeded
                    # store reviews in CSV file
                    n = 0
                    rev = []
                    while n < number:
                        reviews = scraper.get_reviews(n)
                        if len(reviews) == 0:
                            # no more reviews available
                            break
                        rev.extend(reviews)
                        n += len(reviews)
                    df = pd.DataFrame(rev)
                    df.to_csv('polls/mydata/gm_reviews.csv', index=False)

    return HttpResponseRedirect(reverse('polls:save_file'))
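# A sketch of the form the submit() view above expects. The field names
# (number, url, file) come from the cleaned_data lookups; the field types are
# assumptions, not taken from the original project.
from django import forms

class LoginForm(forms.Form):
    number = forms.IntegerField()   # how many reviews to fetch
    url = forms.URLField()          # Google Maps place URL to scrape
    file = forms.CharField()        # output file name / label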
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Google Maps reviews scraper.')
    parser.add_argument('--N', type=int, default=100,
                        help='Number of reviews to scrape')
    parser.add_argument('--i', type=str, default='urls.txt',
                        help='target URLs file')
    parser.add_argument('--place', dest='place', action='store_true',
                        help='Scrape place metadata')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Run scraper using browser graphical interface')
    parser.add_argument('--source', dest='source', action='store_true',
                        help='Add source url to CSV file (for multiple urls in a single file)')
    parser.set_defaults(place=False, debug=False, source=False)
    args = parser.parse_args()

    # store reviews in CSV file
    writer = csv_writer(args.source)

    with GoogleMapsScraper(debug=args.debug) as scraper:
        with open(args.i, 'r') as urls_file:
            for url in urls_file:
                if args.place:
                    print(scraper.get_account(url))
                else:
                    error = scraper.sort_by_date(url)
                    if error == 0:
                        # error = scraper.open_web(url)
                        # if error == 0:
                        n = 0
                        m = 500
                        o = 0
                        while n < args.N:
                            ...  # snippet truncated here in the original
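# The while loop above is cut off in the original snippet. Judging from the
# other scripts in this section, the body fetches a page of reviews, writes one
# CSV row per review and advances the scroll offset; the helper below is only a
# sketch of that pattern (its name and signature are illustrative, and it
# assumes csv_writer() returns a csv.writer-style object).
def collect_reviews(scraper, writer, url, n_max, add_source=False):
    n = 0
    while n < n_max:
        reviews = scraper.get_reviews(n)
        if len(reviews) == 0:          # no more reviews returned
            break
        for r in reviews:
            row_data = list(r.values())
            if add_source:             # mirrors the --source flag
                row_data.append(url.strip())
            writer.writerow(row_data)
        n += len(reviews)              # advance the scroll offset

# Example invocation of the script above (the filename "googlemaps.py" is a
# placeholder; the flags are the ones defined by the argparse setup):
#   python googlemaps.py --N 50 --i urls.txt --source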
parser = argparse.ArgumentParser(description='Google Maps reviews scraper.')
parser.add_argument('--N', type=int, default=100,
                    help='Number of reviews to scrape')
parser.add_argument('--i', type=str, default='urls.txt',
                    help='target URLs file')
parser.add_argument('--place', dest='place', action='store_true',
                    help='Scrape place metadata')
parser.set_defaults(place=False)
args = parser.parse_args()

with GoogleMapsScraper() as scraper:
    with open(args.i, 'r') as urls_file:
        for url in urls_file:
            if args.place:
                print(scraper.get_account(url))
            else:
                error = scraper.sort_by_date(url)
                if error == 0:
                    # store reviews in CSV file
                    writer = csv_writer()
                    n = 0
                    while n < args.N:
                        reviews = scraper.get_reviews(n)
                        # (snippet truncated here in the original)
async def main():
    # conn = await asyncpg.connect(database="propadvisor_db", user="******",
    #                              password="******", host="localhost", port="5432")
    # query = "SELECT project_id, project_name, google_review_link from project_details where google_review_link != '' "
    # rows = await conn.fetch(query)
    # for row in rows:
    #     project_id = row[0]
    #     project_name = row[1]
    #     google_review_link = row[2]
    #     print("************************************************************************************************\n")
    #     print("project_id : ", project_id)
    #     print("project_name : ", project_name)
    #     print("google_review_link :", google_review_link)

    parser = argparse.ArgumentParser(description='Google Maps reviews scraper.')
    parser.add_argument('--N', type=int, default=100,
                        help='Number of reviews to scrape')
    parser.add_argument('--place', dest='place', action='store_true',
                        help='Scrape place metadata')
    parser.add_argument('--debug', dest='debug', action='store_true',
                        help='Run scraper using browser graphical interface')
    parser.add_argument('--source', dest='source', action='store_true',
                        help='Add source url to CSV file (for multiple urls in a single file)')
    parser.set_defaults(place=False, debug=False, source=False)
    args = parser.parse_args()

    # urls_file = [google_review_link]
    # urls_file = ['https://www.google.com/maps/place/Purva+Whitehall/@12.9170661,77.6700152,17z/data=!3m1!4b1!4m10!1m2!2m1!1sbangalore+apartments!3m6!1s0x3bae13724a6e7a53:0x64600b55324bd850!8m2!3d12.9170609!4d77.6722039!9m1!1b1']
    urls_file = [
        'https://www.google.com/maps/place/Bharat+Skyvistas+Bluez/@19.2235109,72.8630312,17z/data=!3m1!4b1!4m7!3m6!1s0x0:0x57cd3dc06e975425!8m2!3d19.2235109!4d72.8652199!9m1!1b1'
    ]
    # urls_file = ['https://www.google.com/maps/place/Alcove+Service+Apartments/@12.9285776,77.629614,17z/data=!4m13!1m2!2m1!1sapartment+near+bangalore,+karnataka!3m9!1s0x3bae1460b308194d:0x6c455e62e871d6b0!5m2!4m1!1i2!8m2!3d12.9285724!4d77.6318027!9m1!1b1']

    with GoogleMapsScraper(debug=args.debug) as scraper:
        for url in urls_file:
            print("\n************************************************************************************************\n")
            # print(url)
            if args.place:
                print(scraper.get_account(url))
            else:
                error = scraper.sort_by_date(url)
                if error == 0:
                    count_review = scraper.get_count_reviews()
                    print("Number Of Google Review: ", count_review)
                    print("\n")
                    n = 0
                    row_count = 1
                    try:
                        while n < int(count_review):
                            reviews = scraper.get_reviews(n)
                            for r in reviews:
                                row_data = list(r.values())
                                user_review = row_data[0].replace("'", "''")
                                # print("Project Name: ", project_name)
                                print("Row Number: ", row_count)
                                print("Review:", user_review)
                                print("Review Date:", row_data[1])
                                print("Review Rating:", row_data[3])
                                print("Reviewer Name:", row_data[4])
                                # query = "SELECT project_id from google_reviews where project_id = '" + str(project_id) + "' \
                                #     AND reviewer_name = '" + str(row_data[4]) + "' AND review_date='" + str(row_data[1]) + "' \
                                #     AND rating='" + str(row_data[3]) + "' AND review='" + user_review + "'"
                                # records = await conn.fetch(query)
                                # row_affected = len(records)
                                # if row_affected == 0:
                                #     insert_query = "INSERT INTO google_reviews(project_id, reviewer_name, review_date, \
                                #         rating, review) VALUES ('" + str(project_id) + "', '" + str(row_data[4]) + "', '" + str(row_data[1]) + "', \
                                #         '" + str(row_data[3]) + "', '" + user_review + "')"
                                #     await conn.execute(insert_query)
                                #     print("One Row Inserted...")
                                # else:
                                #     print("Information Already Exist...")
                                print("\n")
                                row_count = row_count + 1
                            n += len(reviews)
                    except Exception:
                        print("*********************************************")
                        print("Cannot Fetch reviews of project")
                        print("*********************************************")
                        print("\n")
async def main():
    try:
        shutil.rmtree('./data')
    except Exception:
        pass

    config_var = path.abspath(path.join(__file__, "../../Config_db/db.json"))
    with open(config_var, "r") as file_output:
        db_params = json.load(file_output)
    # print(db_params['database'])
    # print(db_params['user'])
    # print(db_params['host'])
    # print(db_params['port'])

    conn = await asyncpg.connect(database=db_params['database'],
                                 user=db_params['user'],
                                 password=db_params['password'],
                                 host=db_params['host'],
                                 port=db_params['port'])

    query = "SELECT project_id, project_name, google_review_link from project_details where google_review_link IS NOT NULL AND google_review_link != ''"
    # query = "SELECT project_id, project_name, google_review_link from project_details where project_id = '106'"
    rows = await conn.fetch(query)

    for row in rows:
        project_id = row[0]
        project_name = row[1]
        google_review_link = row[2]
        print("************************************************************************************************\n")
        print("project_id : ", project_id)
        print("project_name : ", project_name)
        print("google_review_link :", google_review_link)

        parser = argparse.ArgumentParser(description='Google Maps reviews scraper.')
        parser.add_argument('--N', type=int, default=100,
                            help='Number of reviews to scrape')
        parser.add_argument('--place', dest='place', action='store_true',
                            help='Scrape place metadata')
        parser.add_argument('--debug', dest='debug', action='store_true',
                            help='Run scraper using browser graphical interface')
        parser.add_argument('--source', dest='source', action='store_true',
                            help='Add source url to CSV file (for multiple urls in a single file)')
        parser.set_defaults(place=False, debug=False, source=False)
        args = parser.parse_args()

        urls_file = [google_review_link]
        # urls_file = ['https://www.google.com/maps/place/Golden+Blessings/@18.6092482,73.7593236,17z/data=!3m1!4b1!4m21!1m13!4m12!1m4!2m2!1d80.9539467!2d26.7547724!4e1!1m6!1m2!1s0x3bc2b9773c37683b:0xa4dc7ad3f6cd879a!2sgolden+blessings!2m2!1d73.7615123!2d18.6092431!3m6!1s0x3bc2b9773c37683b:0xa4dc7ad3f6cd879a!8m2!3d18.6092431!4d73.7615123!9m1!1b1']

        with GoogleMapsScraper(debug=args.debug) as scraper:
            for url in urls_file:
                print("\n************************************************************************************************\n")
                # print(url)
                if args.place:
                    print(scraper.get_account(url))
                else:
                    error = scraper.sort_by_date(url)
                    if error == 0:
                        count_review = scraper.get_count_reviews()
                        print("Number Of Google Review: ", count_review)
                        print("\n")
                        n = 0
                        row_count = 1
                        try:
                            while n < int(count_review):
                                reviews = scraper.get_reviews(n)
                                for r in reviews:
                                    row_data = list(r.values())
                                    user_review = row_data[0].replace("'", "''")
                                    print("Project Name: ", project_name)
                                    print("Row Number: ", row_count)
                                    print("Review:", user_review)
                                    print("Review Date:", row_data[1])
                                    print("Review Rating:", row_data[3])
                                    print("Reviewer Name:", row_data[4])

                                    from datetime import datetime
                                    now = datetime.now()
                                    print("now", now)
                                    row_data[1] = convert_google_review_date(row_data[1])

                                    query = ("SELECT project_id FROM project_review where project_id = '" + str(project_id) + "' "
                                             "AND reviewer_id = '0' AND reviewer_name = '" + str(row_data[4]) + "' "
                                             "AND reviewer_type = 'google_reviewer' "
                                             "AND overall_rating = '" + str(int(row_data[3])) + "' "
                                             "AND review = '" + user_review + "' "
                                             "AND review_date='" + str(row_data[1]) + "'")
                                    records = await conn.fetch(query)
                                    row_affected = len(records)
                                    print("row_affected", row_affected)

                                    if row_affected == 0:
                                        insert_query = ("INSERT INTO public.project_review(project_id, reviewer_id, reviewer_name, reviewer_type, "
                                                        "location_rating, amenities_rating, layout_planning_rating, overall_rating, customer_service_rating, "
                                                        "vfm, review_title, review, review_date, status, email_varified, review_time) "
                                                        "VALUES ('" + str(project_id) + "', '0', '" + str(row_data[4]) + "', 'google_reviewer', "
                                                        "'0', '0', '0', '" + str(int(row_data[3])) + "', '0', '0', "
                                                        "'', '" + user_review + "', '" + str(row_data[1]) + "', 'approved', 'true', '" + str(now) + "')")
                                        await conn.execute(insert_query)
                                        print("----------One Row Inserted...")
                                    else:
                                        print("----------Information Already Exist...")

                                    print("\n")
                                    row_count = row_count + 1
                                n += len(reviews)
                        except Exception:
                            print("*********************************************")
                            print("Cannot Fetch reviews of project", project_name)
                            print("*********************************************")
                            print("\n")
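# The two queries above are assembled by string concatenation and rely on the
# manual "'" -> "''" escaping of the review text. The helper below is only a
# sketch (not part of the original script) of the same duplicate check using an
# asyncpg parameterized query; the bind-parameter values must match the actual
# column types of project_review, which are not known here, so the str()/int()
# casts are assumptions.
async def review_already_stored(conn, project_id, reviewer_name, rating, review_text, review_date):
    records = await conn.fetch(
        "SELECT project_id FROM project_review "
        "WHERE project_id = $1 AND reviewer_id = '0' AND reviewer_name = $2 "
        "AND reviewer_type = 'google_reviewer' AND overall_rating = $3 "
        "AND review = $4 AND review_date = $5",
        str(project_id), str(reviewer_name), str(int(rating)),
        review_text, str(review_date))
    return len(records) > 0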