Example no. 1
class GmapParse:
    def __init__(self, debug: bool = False):
        self.scraper = GoogleMapsScraper(debug=debug)

    def parseStore(self, company, n_reviews: int = 10):
        if company in STORES:
            print(f"[*] Scraping '{company}'...")
            output_folder = Path(sc.check_folder("output"))
            for idx, store_url in enumerate(STORES[company]):
                print(f"[*] Scraping now from '{store_url}'!")
                print(f"{idx+1} out of {len(STORES[company])} links...")
                error = self.scraper.sort_by_date(store_url)
                if error == 0:
                    n = 0
                    file = output_folder / f"gbusiness_{company}_{datetime.utcnow().strftime('%Y-%m-%d-T%H-%M-%SZ')}.jl"
                    while n < n_reviews:
                        reviews = self.scraper.enrich_reviews(
                            self.scraper.get_reviews(n), store_url, company)
                        with file.open("a", encoding="utf-8") as js:
                            for r in reviews:
                                js.write(json.dumps(r) + "\n")
                        if len(reviews) == 0:
                            n += 100
                        else:
                            n += len(reviews)
                else:
                    print(
                        f"[*] Could not scrape link '{store_url}'. Link will be stored @error.log for further retries."
                    )
                    with Path("error.log").open("a", encoding="utf-8") as js:
                        js.write(str(store_url) + "\n")
        else:
            raise Exception(f"Company '{company}' not found!")
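A minimal sketch of how the class above might be driven, assuming a STORES mapping of company names to lists of Google Maps place URLs; the company name and URLs below are placeholders, not values from the original project:

# hypothetical STORES mapping; the real project defines its own URLs elsewhere
STORES = {
    "acme": [
        "https://www.google.com/maps/place/<store-1>",
        "https://www.google.com/maps/place/<store-2>",
    ],
}

if __name__ == "__main__":
    gm = GmapParse(debug=False)
    # fetch roughly 50 of the newest reviews for every "acme" store URL
    gm.parseStore("acme", n_reviews=50)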
Example no. 2
    def scrape_gm_reviews(self):
        # set connection to DB
        collection = self.client[DB_NAME][COLLECTION_NAME]

        # init scraper and incremental add reviews
        # TO DO: pass logger as parameter to log into one single file?
        with GoogleMapsScraper() as scraper:
            for url in self.urls:
                try:
                    error = scraper.sort_by_date(url)
                    if error == 0:
                        stop = False
                        offset = 0
                        n_new_reviews = 0
                        while not stop:
                            rlist = scraper.get_reviews(offset)
                            # an empty batch means there is nothing left to fetch;
                            # without this guard the while-loop would never terminate
                            if not rlist:
                                break
                            for r in rlist:
                                # calculate review date and compare to input min_date_review
                                r['timestamp'] = self.__parse_relative_date(
                                    r['relative_date'])
                                stop = self.__stop(r, collection)
                                if not stop:
                                    collection.insert_one(r)
                                    n_new_reviews += 1
                                else:
                                    break
                            offset += len(rlist)

                        # log total number
                        self.logger.info('{} : {} new reviews'.format(
                            url, n_new_reviews))
                    else:
                        self.logger.warning(
                            'Sorting reviews failed for {}'.format(url))

                except Exception as e:
                    exc_type, exc_obj, exc_tb = sys.exc_info()
                    fname = os.path.split(
                        exc_tb.tb_frame.f_code.co_filename)[1]

                    self.logger.error('{}: {}, {}, {}'.format(
                        url, exc_type, fname, exc_tb.tb_lineno))
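The stopping condition for the incremental scrape lives in the private __stop helper, which is not shown. A minimal sketch of what it might check, assuming the class keeps a self.min_date_review cutoff (mentioned in the comment above) and that each review carries an id_review field; both names are assumptions:

    def __stop(self, review, collection):
        # hypothetical implementation: stop once the review is older than the
        # requested cutoff, or once it is already stored from a previous run
        if review['timestamp'] < self.min_date_review:
            return True
        already_stored = collection.find_one({'id_review': review['id_review']}) is not None
        return already_stored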
Example no. 3
def submit(request):
    global file
    if request.method == "POST":
        login_form = LoginForm(request.POST)
        if login_form.is_valid():
            number = login_form.cleaned_data['number']
            url = login_form.cleaned_data['url']
            # kept as a module-level global, presumably reused by the save_file view
            file = login_form.cleaned_data['file']
            with GoogleMapsScraper() as scraper:
                error = scraper.sort_by_date(url)
                if error == -1:
                    # store reviews in CSV file
                    n = 0
                    rev = []
                    while n < number:
                        reviews = scraper.get_reviews(n)
                        if not reviews:
                            # stop when no more reviews come back, otherwise the loop never ends
                            break
                        rev.extend(reviews)
                        n += len(reviews)
                    df = pd.DataFrame(rev)
                    df.to_csv('polls/mydata/gm_reviews.csv', index=False)
    return HttpResponseRedirect(reverse('polls:save_file'))
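The view above expects a LoginForm exposing number, url and file fields. A minimal sketch of such a form, with field names taken from the cleaned_data lookups; the field classes themselves are assumptions:

from django import forms

class LoginForm(forms.Form):
    # field names come from the cleaned_data lookups in submit();
    # the field types below are assumptions
    number = forms.IntegerField(min_value=1)  # how many reviews to scrape
    url = forms.URLField()                    # Google Maps place URL
    file = forms.CharField()                  # output file name, kept for the save_file view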
Example no. 4
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Google Maps reviews scraper.')
    parser.add_argument('--N', type=int, default=100, help='Number of reviews to scrape')
    parser.add_argument('--i', type=str, default='urls.txt', help='target URLs file')
    parser.add_argument('--place', dest='place', action='store_true', help='Scrape place metadata')
    parser.add_argument('--debug', dest='debug', action='store_true', help='Run scraper using browser graphical interface')
    parser.add_argument('--source', dest='source', action='store_true', help='Add source url to CSV file (for multiple urls in a single file)')
    parser.set_defaults(place=False, debug=False, source=False)

    args = parser.parse_args()

    # store reviews in CSV file
    writer = csv_writer(args.source)

    with GoogleMapsScraper(debug=args.debug) as scraper:
        with open(args.i, 'r') as urls_file:
            for url in urls_file:

                if args.place:
                    print(scraper.get_account(url))
                else:
                    # alternative entry point kept from the original code:
                    # error = scraper.open_web(url)
                    error = scraper.sort_by_date(url)
                    if error == 0:
                        n = 0
                        m = 500
                        o = 0
                        while n < args.N:
Example no. 5
    parser.add_argument('--N',
                        type=int,
                        default=100,
                        help='Number of reviews to scrape')
    parser.add_argument('--i',
                        type=str,
                        default='urls.txt',
                        help='target URLs file')
    parser.add_argument('--place',
                        dest='place',
                        action='store_true',
                        help='Scrape place metadata')
    parser.set_defaults(place=False)

    args = parser.parse_args()

    with GoogleMapsScraper() as scraper:
        with open(args.i, 'r') as urls_file:
            for url in urls_file:

                if args.place:
                    print(scraper.get_account(url))
                else:
                    error = scraper.sort_by_date(url)
                    if error == 0:
                        # store reviews in CSV file
                        writer = csv_writer()

                        n = 0
                        while n < args.N:
                            reviews = scraper.get_reviews(n)
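Examples 3 to 5 call a csv_writer() helper that is not part of the excerpt. A minimal sketch of what such a helper might look like, assuming it opens the target CSV and returns a csv.writer with a header row already written; the output path and column names are placeholders, not the project's actual schema:

import csv

def csv_writer(source_field=False, outpath='data/gm_reviews.csv'):
    # hypothetical helper; the file handle is deliberately left open so the
    # caller can keep appending one row per review
    targetfile = open(outpath, mode='a', encoding='utf-8', newline='\n')
    writer = csv.writer(targetfile, quoting=csv.QUOTE_MINIMAL)
    header = ['id_review', 'caption', 'relative_date', 'retrieval_date',
              'rating', 'username', 'n_review_user', 'url_user']
    if source_field:
        writer.writerow(header + ['url_source'])
    else:
        writer.writerow(header)
    return writer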
Example no. 6
async def main():
    # conn = await asyncpg.connect(database = "propadvisor_db", user = "******", password = "******", host = "localhost", port = "5432")
    # query="SELECT project_id, project_name, google_review_link from project_details where google_review_link != '' "
    # rows = await conn.fetch(query)

    #for row in rows:
    # project_id = row[0]
    # project_name = row[1]
    # google_review_link = row[2]
    # print("************************************************************************************************\n")
    # print ("project_id : ",project_id )
    # print ("project_name : ",project_name )
    # print ("google_review_link :",google_review_link )

    parser = argparse.ArgumentParser(
        description='Google Maps reviews scraper.')
    parser.add_argument('--N',
                        type=int,
                        default=100,
                        help='Number of reviews to scrape')
    parser.add_argument('--place',
                        dest='place',
                        action='store_true',
                        help='Scrape place metadata')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='Run scraper using browser graphical interface')
    parser.add_argument(
        '--source',
        dest='source',
        action='store_true',
        help='Add source url to CSV file (for multiple urls in a single file)')
    parser.set_defaults(place=False, debug=False, source=False)

    args = parser.parse_args()

    #urls_file = [google_review_link]
    #urls_file = ['https://www.google.com/maps/place/Purva+Whitehall/@12.9170661,77.6700152,17z/data=!3m1!4b1!4m10!1m2!2m1!1sbangalore+apartments!3m6!1s0x3bae13724a6e7a53:0x64600b55324bd850!8m2!3d12.9170609!4d77.6722039!9m1!1b1']
    urls_file = [
        'https://www.google.com/maps/place/Bharat+Skyvistas+Bluez/@19.2235109,72.8630312,17z/data=!3m1!4b1!4m7!3m6!1s0x0:0x57cd3dc06e975425!8m2!3d19.2235109!4d72.8652199!9m1!1b1'
    ]
    #urls_file = ['https://www.google.com/maps/place/Alcove+Service+Apartments/@12.9285776,77.629614,17z/data=!4m13!1m2!2m1!1sapartment+near+bangalore,+karnataka!3m9!1s0x3bae1460b308194d:0x6c455e62e871d6b0!5m2!4m1!1i2!8m2!3d12.9285724!4d77.6318027!9m1!1b1']

    with GoogleMapsScraper(debug=args.debug) as scraper:
        for url in urls_file:
            print(
                "\n************************************************************************************************\n"
            )
            #print(url)
            if args.place:
                print(scraper.get_account(url))
            else:
                error = scraper.sort_by_date(url)
                if error == 0:
                    count_review = scraper.get_count_reviews()
                    print("Number Of Google Review: ", count_review)
                    print("\n")
                    n = 0
                    row_count = 1
                    try:
                        while n < int(count_review):
                            reviews = scraper.get_reviews(n)
                            if not reviews:
                                # no more reviews returned; avoid an infinite loop
                                break
                            for r in reviews:
                                row_data = list(r.values())
                                user_review = row_data[0].replace("'", "''")
                                # #print("Project Name: ",project_name)
                                print("Row Number: ", row_count)
                                print("Review:", user_review)
                                print("Review Date:", row_data[1])
                                print("Review Rating:", row_data[3])
                                print("Reviewer Name:", row_data[4])

                                # query = "SELECT project_id from google_reviews where  project_id = '"+str(project_id)+"' \
                                # AND  reviewer_name = '"+str(row_data[4])+"' AND review_date='"+str(row_data[1])+"' \
                                # AND rating='"+str(row_data[3])+"' AND review='"+user_review+"'"
                                # records = await conn.fetch(query)
                                # row_affected = len(records)

                                # if(row_affected == 0):
                                #     insert_query = "INSERT INTO google_reviews(project_id, reviewer_name, review_date, \
                                #     rating, review) VALUES ('"+str(project_id)+"', '"+str(row_data[4])+"', '"+str(row_data[1])+"', \
                                #     '"+str(row_data[3])+"', '"+user_review+"')"
                                #     await conn.execute(insert_query)
                                #     print("One Row Inserted...")
                                # else:
                                #     print("Information Already Exist...")

                                print("\n")
                                row_count += 1
                            n += len(reviews)

                    except Exception as e:
                        print("*********************************************")
                        print("Cannot Fetch reviews of project:", e)
                        print("*********************************************")
                        print("\n")
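Both Example 2 (__parse_relative_date) and Example 8 (convert_google_review_date) rely on a helper that turns Google's relative dates ("2 months ago") into absolute dates, but neither helper is shown. A minimal sketch of such a conversion, assuming English relative-date strings; the function name and the day counts per unit are assumptions, not the original implementation:

from datetime import datetime, timedelta

def convert_relative_date(relative_date, now=None):
    # hypothetical helper: maps strings such as "3 days ago" or "a month ago"
    # to an approximate absolute datetime; the unit table below is an assumption
    now = now or datetime.now()
    text = relative_date.strip().lower()
    if text in ("just now", "now"):
        return now
    parts = text.split()                        # e.g. ["3", "days", "ago"]
    amount = 1 if parts[0] in ("a", "an") else int(parts[0])
    unit = parts[1].rstrip("s")                 # "days" -> "day"
    approx_days = {"minute": 1 / 1440, "hour": 1 / 24, "day": 1,
                   "week": 7, "month": 30, "year": 365}
    return now - timedelta(days=amount * approx_days.get(unit, 0))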
Example no. 7
    def __init__(self, debug: bool = False):
        self.scraper = GoogleMapsScraper(debug=debug)
Example no. 8
async def main():
    # remove any previous ./data folder; errors (e.g. folder missing) are ignored
    shutil.rmtree('./data', ignore_errors=True)

    config_var = path.abspath(path.join(__file__, "../../Config_db/db.json"))
    with open(config_var, "r") as config_file:
        db_params = json.load(config_file)
    # print( db_params['database'])
    # print( db_params['user'])
    # print( db_params['host'])
    # print( db_params['port'])

    conn = await asyncpg.connect(database=db_params['database'],
                                 user=db_params['user'],
                                 password=db_params['password'],
                                 host=db_params['host'],
                                 port=db_params['port'])
    query = "SELECT project_id, project_name, google_review_link from project_details where google_review_link IS NOT NULL AND google_review_link != ''"
    #query = "SELECT project_id, project_name, google_review_link from project_details where project_id = '106'"
    rows = await conn.fetch(query)

    # command-line arguments only need to be parsed once, so the parser is built
    # before iterating over the project rows
    parser = argparse.ArgumentParser(
        description='Google Maps reviews scraper.')
    parser.add_argument('--N',
                        type=int,
                        default=100,
                        help='Number of reviews to scrape')
    parser.add_argument('--place',
                        dest='place',
                        action='store_true',
                        help='Scrape place metadata')
    parser.add_argument('--debug',
                        dest='debug',
                        action='store_true',
                        help='Run scraper using browser graphical interface')
    parser.add_argument(
        '--source',
        dest='source',
        action='store_true',
        help='Add source url to CSV file (for multiple urls in a single file)')
    parser.set_defaults(place=False, debug=False, source=False)

    args = parser.parse_args()

    for row in rows:
        project_id = row[0]
        project_name = row[1]
        google_review_link = row[2]
        print(
            "************************************************************************************************\n"
        )
        print("project_id : ", project_id)
        print("project_name : ", project_name)
        print("google_review_link :", google_review_link)

        urls_file = [google_review_link]
        #urls_file = ['https://www.google.com/maps/place/Golden+Blessings/@18.6092482,73.7593236,17z/data=!3m1!4b1!4m21!1m13!4m12!1m4!2m2!1d80.9539467!2d26.7547724!4e1!1m6!1m2!1s0x3bc2b9773c37683b:0xa4dc7ad3f6cd879a!2sgolden+blessings!2m2!1d73.7615123!2d18.6092431!3m6!1s0x3bc2b9773c37683b:0xa4dc7ad3f6cd879a!8m2!3d18.6092431!4d73.7615123!9m1!1b1']

        with GoogleMapsScraper(debug=args.debug) as scraper:
            for url in urls_file:
                print(
                    "\n************************************************************************************************\n"
                )
                #print(url)
                if args.place:
                    print(scraper.get_account(url))
                else:
                    error = scraper.sort_by_date(url)
                    if error == 0:
                        count_review = scraper.get_count_reviews()
                        print("Number Of Google Review: ", count_review)
                        print("\n")
                        n = 0
                        row_count = 1
                        try:
                            while n < int(count_review):
                                reviews = scraper.get_reviews(n)
                                if not reviews:
                                    # no more reviews returned; avoid an infinite loop
                                    break
                                for r in reviews:
                                    row_data = list(r.values())
                                    user_review = row_data[0].replace(
                                        "'", "''")
                                    print("Project Name: ", project_name)
                                    print("Row Number: ", row_count)
                                    print("Review:", user_review)
                                    print("Review Date:", row_data[1])
                                    print("Review Rating:", row_data[3])
                                    print("Reviewer Name:", row_data[4])
                                    from datetime import datetime
                                    now = datetime.now()
                                    print("now", now)

                                    row_data[1] = convert_google_review_date(
                                        row_data[1])

                                    query = " SELECT project_id FROM project_review where  project_id = '" + str(
                                        project_id
                                    ) + "' \
                                    AND reviewer_id = '0' AND reviewer_name = '" + str(
                                        row_data[4]
                                    ) + "' AND reviewer_type = 'google_reviewer' \
                                    AND overall_rating = '" + str(
                                        int(row_data[3])
                                    ) + "' AND review = '" + user_review + "'  AND review_date='" + str(
                                        row_data[1]) + "' "

                                    records = await conn.fetch(query)
                                    row_affected = len(records)
                                    print("row_affected", row_affected)

                                    if row_affected == 0:
                                        insert_query = "INSERT INTO public.project_review( project_id, reviewer_id, reviewer_name, reviewer_type, \
                                        location_rating, amenities_rating, layout_planning_rating, overall_rating, customer_service_rating, \
                                        vfm, review_title, review, review_date, status, email_varified, review_time) VALUES ( '" + str(
                                            project_id
                                        ) + "',\
                                        '0', '" + str(
                                            row_data[4]
                                        ) + "', 'google_reviewer', '0', '0', '0', '" + str(
                                            int(row_data[3])
                                        ) + "', '0', '0', \
                                        '', '" + user_review + "', '" + str(
                                            row_data[1]
                                        ) + "', 'approved', 'true', '" + str(
                                            now) + "' ) "

                                        await conn.execute(insert_query)
                                        print("----------One Row Inserted...")
                                    else:
                                        print(
                                            "----------Information Already Exist..."
                                        )

                                    print("\n")
                                    row_count += 1
                                n += len(reviews)
                        except Exception as e:
                            print(
                                "*********************************************"
                            )
                            print("Cannot Fetch reviews of project",
                                  project_name, ":", e)
                            print(
                                "*********************************************"
                            )
                            print("\n")
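The duplicate check and insert in Example 8 build SQL by string concatenation and hand-escape single quotes. asyncpg also accepts parameterized queries with $n placeholders, which removes the need for escaping. A minimal sketch of the same check-then-insert step written that way; the table and most column names are taken from the queries above, but the shortened column list, the helper's name, and its signature are assumptions, and the Python values must match the actual column types:

async def upsert_review(conn, project_id, reviewer_name, review_date, rating, review, now):
    # hypothetical rewrite of the check-then-insert step using asyncpg's $n
    # placeholders; values are passed separately, so no quote escaping is needed
    existing = await conn.fetch(
        """SELECT project_id FROM project_review
           WHERE project_id = $1 AND reviewer_id = '0' AND reviewer_name = $2
             AND reviewer_type = 'google_reviewer' AND overall_rating = $3
             AND review = $4 AND review_date = $5""",
        project_id, reviewer_name, rating, review, review_date)
    if existing:
        print("----------Information Already Exist...")
        return
    await conn.execute(
        """INSERT INTO public.project_review(
               project_id, reviewer_id, reviewer_name, reviewer_type,
               overall_rating, review, review_date, status, email_varified, review_time)
           VALUES ($1, '0', $2, 'google_reviewer', $3, $4, $5, 'approved', 'true', $6)""",
        project_id, reviewer_name, rating, review, review_date, now)
    print("----------One Row Inserted...")

Inside the scraping loop, the concatenated-SQL block would then collapse to a single call such as await upsert_review(conn, project_id, row_data[4], row_data[1], int(row_data[3]), user_review, now).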