Example #1
0
def get_results():
    """Fetch pt/br reviews for `app_name` and return the page that
    follows the first 10 newest ones.

    The first call exists only to obtain a continuation token; its own
    batch is discarded when the follow-up page is fetched.
    """
    _first_page, token = reviews(
        app_name,
        lang='pt',               # defaults to 'en'
        country='br',            # defaults to 'us'
        sort=Sort.NEWEST,        # defaults to Sort.MOST_RELEVANT
        count=10,                # defaults to 100
        filter_score_with=None,  # None means all scores
    )
    # Resume crawling right after the first batch.
    next_page, _ = reviews(app_name, continuation_token=token)
    return next_page
def scrape(appid):
    """Fetch up to 100 most-relevant English reviews for ``appid``,
    print each review body, append them to ``reviews_scraped.txt`` and
    return the list of review dicts.

    Fixes:
    * ``reviews()`` returns a ``(result_list, continuation_token)``
      tuple; the original iterated over the tuple itself, so
      ``i['content']`` would fail on the list/token elements.
    * The output file handle was never closed; a context manager now
      guarantees cleanup.
    """
    result, _ = reviews(
        appid,
        lang='en',                # defaults to 'en'
        country='us',             # defaults to 'us'
        sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
        count=100,                # defaults to 100
        # filter_score_with=5    # defaults to None (means all scores)
    )

    for review in result:
        print(review['content'], end="\n\n")

    with open('reviews_scraped.txt', 'a+') as out:
        for review in result:
            # str(...encode...) preserves the original file format
            # (b'...' repr per line).
            out.write(str(review['content'].encode('utf-8')))
            out.write('\n')

    return result

#scrape('camera1.themaestrochef.com.cameraappfordogs')

# hey Kedar, take yourself off mute
# I can't hear anything
Example #3
0
def extract_review_data(execution_date, app_id='com.tgc.sky.android'):
    """Collect newest reviews for ``app_id`` into a DataFrame, paging
    through continuation batches until ``get_continue_extracting_reviews``
    reports the ``execution_date`` window is covered.

    Fix: ``DataFrame.append`` was deprecated and removed in pandas 2.0;
    ``pd.concat`` is used instead (same result, version-safe).
    """
    results, continuation_token = reviews(
        app_id,
        sort=Sort.NEWEST,
        count=200
    )
    review_data = pd.DataFrame(results)

    continue_extracting_reviews = get_continue_extracting_reviews(
        execution_date,
        review_data
    )
    while continue_extracting_reviews:
        next_batch, continuation_token = extract_next_batch(
            continuation_token,
            app_id=app_id
        )
        continue_extracting_reviews = get_continue_extracting_reviews(
            execution_date,
            next_batch
        )
        review_data = pd.concat([review_data, next_batch], ignore_index=True)
        # Be polite to the Play Store endpoint between pages.
        time.sleep(1)

    return review_data
def _fetch_app_infos(app_packages):
    """Download store metadata for each package, dropping the bulky
    'comments' field which is unused downstream."""
    app_infos = []
    for ap in tqdm(app_packages):
        info = app(ap, lang="en", country="us")
        del info["comments"]
        app_infos.append(info)
    return app_infos


def _fetch_app_reviews(app_packages):
    """Download reviews per package, per score (1-5), per sort order.

    Score 3 is over-sampled (300 vs 150) -- presumably to balance the
    typically under-represented neutral class.
    """
    app_reviews = []
    for ap in tqdm(app_packages):
        for score in range(1, 6):
            for sort_order in [Sort.MOST_RELEVANT, Sort.NEWEST]:
                rvs, _ = reviews(
                    ap,
                    lang="en",
                    country="us",
                    sort=sort_order,
                    count=300 if score == 3 else 150,
                    filter_score_with=score,
                )
                for r in rvs:
                    r["sortOrder"] = ("most_relevant" if sort_order
                                      == Sort.MOST_RELEVANT else "newest")
                    r["appId"] = ap
                app_reviews.extend(rvs)
    return app_reviews


def scrape(apps_dataset_path, reviews_datase_path, show_debug: bool = False):
    """Scrape metadata and reviews for a fixed list of productivity apps
    and write them to the two CSV paths.

    NOTE(review): the ``reviews_datase_path`` name is a typo ("datase")
    kept for backward compatibility with existing callers.
    """
    app_packages = [
        "com.anydo",
        "com.todoist",
        "com.ticktick.task",
        "com.habitrpg.android.habitica",
        "cc.forestapp",
        "com.oristats.habitbull",
        "com.levor.liferpgtasks",
        "com.habitnow",
        "com.microsoft.todos",
        "prox.lab.calclock",
        "com.gmail.jmartindev.timetune",
        "com.artfulagenda.app",
        "com.tasks.android",
        "com.appgenix.bizcal",
        "com.appxy.planner",
    ]

    app_infos = _fetch_app_infos(app_packages)
    if show_debug:
        print_json(app_infos[0])

    app_infos_df = pd.DataFrame(app_infos)
    if show_debug:
        print(app_infos_df)
    app_infos_df.to_csv(apps_dataset_path, index=None, header=True)

    app_reviews = _fetch_app_reviews(app_packages)
    if show_debug:
        print(app_reviews[0])

    app_reviews_df = pd.DataFrame(app_reviews)
    app_reviews_df.to_csv(reviews_datase_path, index=None, header=True)
Example #5
0
def fetch_and_store_comments(app_id):
    """Fetch the 100 newest pt-BR reviews for ``app_id``, de-duplicate
    them against previously stored files, and write the new window to
    ``var/data/<app_id>/<since>_<until>.csv``.

    Fixes:
    * ``reviews()`` returns ``(list, continuation_token)``; the original
      passed the whole tuple to ``DataFrame.from_dict``.
    * The fallback branch referenced the undefined names
      ``df_comparison`` and ``until_comparison``, so it always raised.
    * Bare ``except:`` clauses narrowed to ``except Exception``.
    """
    try:
        comments, _ = reviews(
            app_id,
            lang='pt-BR',
            country='br',
            sort=Sort.NEWEST,
            count=100,
        )

        # Read dataframes and compare them
        df_comments = pd.DataFrame.from_dict(comments)
        try:
            df_folder = read_files_in_folder(app_id)
            df_comparison = pd.concat([df_comments, df_folder])
            df_comparison['at'] = pd.to_datetime(df_comparison['at'])
            since_comments = df_comments['at'].min()
            until_comparison = df_comparison['at'].max()
            df_comparison = df_comparison.drop_duplicates(['at', 'userName', 'content'])
            df = df_comparison[(df_comparison['at'] > since_comments) & (df_comparison['at'] < until_comparison)]
            # Store data
            df.to_csv(r'var/data/{0}/{1}_{2}.csv'.format(app_id, since_comments.strftime('%Y-%m-%d'), until_comparison.strftime('%Y-%m-%d')))
        except Exception:
            # No previous files to compare against: store the fresh batch alone.
            df_comments['at'] = pd.to_datetime(df_comments['at'])
            since_comments = df_comments['at'].min()
            until_comments = df_comments['at'].max()
            df = df_comments[(df_comments['at'] > since_comments) & (df_comments['at'] < until_comments)]
            df.to_csv(r'var/data/{0}/{1}_{2}.csv'.format(app_id, since_comments.strftime('%Y-%m-%d'), until_comments.strftime('%Y-%m-%d')))
    except Exception:
        # Best-effort scraping: failures are intentionally swallowed.
        pass
Example #6
0
def get_review(app_name):
    """Pull 75 most-relevant reviews for the app id embedded in a Play
    Store URL and label each with a TextBlob sentiment tag.

    Returns ``(data, app_id)`` where ``data`` is a list of
    ``(review_text, label)`` tuples.
    """
    app_id = app_name.split("/")[-1].split("=")[1]

    fetched = reviews(
        app_id,
        sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
        count=75)

    # fetched[0] is the list of review dicts; [1] is the continuation token.
    texts = [entry['content'] for entry in fetched[0]]
    polarities = [TextBlob(text).sentiment.polarity for text in texts]

    labels = []
    for polarity in polarities:
        if polarity > 0:
            labels.append("Positive")
        elif polarity == 0:
            labels.append("Neutral")
        else:
            labels.append("Negative")

    data = list(zip(texts, labels))

    return data, app_id
Example #7
0
def get_review(url_string):
    """Extract the package id from a Play Store URL, fetch its 100
    most-relevant reviews, stamp ``appId`` on each entry and return
    the list.

    Cleanup: a large commented-out dataframe experiment (a no-op bare
    string literal) was removed; the regex is now a raw string.
    """
    p = re.compile(r".+\bdetails\?id=([^&]+)")
    match = p.search(url_string)
    appid = match.group(1)

    result, continuation_token = reviews(
        appid,
        lang='en',                # defaults to 'en'
        country='us',             # defaults to 'us'
        sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
        count=100,                # defaults to 100
        filter_score_with=None,   # None means all scores
    )

    for element in result:
        element['appId'] = appid

    return result
Example #8
0
def create_appcloud(app_id: str, lang: str, country: str):
    """Render word clouds for an app: its description (top axis) and up
    to 500 review bodies (bottom axis).  Returns the matplotlib figure.
    """
    app_info = google_play_scraper.app(app_id, lang=lang, country=country)
    try:
        review_list, _token = google_play_scraper.reviews(
            app_id, count=500, lang=lang, country=country)
    except IndexError:
        print('no reviews found')
        review_list = []
    # NOTE(review): the trailing 'test' looks like leftover padding that
    # keeps the string non-empty -- confirm before removing.
    review_string = ' '.join(entry.get('content') for entry in review_list) + 'test'

    fig, ax = plt.subplots(2, 1)

    try:
        desc_cloud = WordCloud(stopwords=stopwords[lang]).generate(app_info.get('description'))
        ax[0].imshow(desc_cloud, interpolation='bilinear')
        ax[0].axis("off")
        ax[0].set_title('App Description')
    except ValueError:
        print('no content')

    try:
        review_cloud = WordCloud(stopwords=stopwords[lang]).generate(review_string)
        ax[1].imshow(review_cloud, interpolation='bilinear')
        ax[1].axis("off")
        ax[1].set_title('Comments')
    except ValueError:
        print('no content')

    return fig
Example #9
0
def get_app_reviews():
    """Collect Medium-app reviews for every star rating (1-5), tagging
    each review with its sort order and app id.

    Rating 3 is over-sampled (1500 vs 750).  Returns the accumulated
    list of review dicts.
    """
    packages = ['com.medium.reader']
    collected = []

    for pkg in packages:
        for star in range(1, 6):
            # Only MOST_RELEVANT is requested; the single-element loop
            # keeps the shape used by similar scrapers in this file.
            for order in [Sort.MOST_RELEVANT]:
                batch, _ = reviews(
                    pkg,
                    lang='en',
                    country='us',
                    sort=order,
                    count=1500 if star == 3 else 750,
                    filter_score_with=star)
                print(batch)

            for review in batch:
                review['sort_order'] = 'most_relevant' if order == Sort.MOST_RELEVANT else 'newest'
                review['app_id'] = pkg

            collected.extend(batch)
            print(len(collected))

    return collected
Example #10
0
def get_most_relevant_reviews(id):
    """Fetch the 100 most-relevant en/us reviews for app ``id`` and
    return five parallel lists:
    ``(review_ids, contents, ratings, user_names, dates)``.

    Idiom fix: five manual column-copy loops replaced with
    ``Series.tolist()``.  (``id`` shadows the builtin but is kept for
    caller compatibility.)
    """
    result, _ = reviews(
        id,
        lang='en',  # defaults to 'en'
        country='us',  # defaults to 'us'
        sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
        count=100,  # defaults to 100
    )
    df = pd.DataFrame(result)
    return (
        df['reviewId'].tolist(),
        df['content'].tolist(),
        df['score'].tolist(),
        df['userName'].tolist(),
        df['at'].tolist(),
    )
Example #11
0
def extract_next_batch(continuation_token, app_id):
    """Fetch the next review page for ``app_id`` from the given
    continuation token; return ``(DataFrame, new_token)``."""
    batch, new_token = reviews(app_id, continuation_token=continuation_token)
    return pd.DataFrame(batch), new_token
def get_reviews(app_id):
    """Crawl up to ~50k most-relevant reviews for ``app_id`` (lang 'en',
    country 'in') and save them as CSV under ``app_reviews/``.

    Improvements: dead commented-out flags removed; the isdir/mkdir
    pair replaced with ``os.makedirs(..., exist_ok=True)``.
    """
    rows = []
    result, continuation_token = reviews(
        app_id,
        lang='en',  # defaults to 'en'
        country='in',  # defaults to 'us'
        sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
        count=200,  # defaults to 100
    )
    rows += result

    # Keep paging while the scraper reports more data, capped at ~50k rows.
    iteration = 0
    while continuation_token.token and (len(rows) <= 50000):
        iteration += 1
        print(f"Iteration number : {iteration}")
        result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token
        )
        rows += result
        print(f"Total_reviews so far is {len(rows)}")

    df = pd.DataFrame(rows)

    result = app(
        app_id,
        lang='en',  # defaults to 'en'
        country='in'  # defaults to 'us'
    )

    print(f"retrived total of {len(rows)} for app -> {result['title']}")

    app_reviews_folder = "app_reviews"
    os.makedirs(app_reviews_folder, exist_ok=True)
    # Sanitize title and app id into a filesystem-safe CSV name.
    csv_file_name = result['title'].replace(" ", "_").replace(
        "/", "_") + "__" + app_id.replace(".", "_")
    csv_url = os.path.join(app_reviews_folder, csv_file_name + ".csv")
    df.to_csv(csv_url, index=False)
    print(f"saved reviews for {app_id} to {csv_url}")
Example #13
0
def id_check(id):
    """Return True when app ``id`` has at least one fetchable review."""
    batch, _token = reviews(
        id,
        count=1,
    )
    return bool(batch)
def crawl_app_reviews(app_id, **kwargs):
    """Seed a review crawl, fetch the follow-up page via the
    continuation token, and persist the new reviews against the stored
    App object.

    Keyword options: ``lang`` ('en'), ``country`` ('us'), ``count``
    (10), ``score`` (5).
    """
    _seed, continuation_token = reviews(
        app_id,
        lang=kwargs.get('lang', 'en'),
        country=kwargs.get('country', 'us'),
        sort=Sort.MOST_RELEVANT,
        count=kwargs.get('count', 10),
        filter_score_with=kwargs.get('score', 5),
    )

    app_reviews, _ = reviews(
        app_id,
        continuation_token=continuation_token,
    )

    app_object = App.objects.save_app(app_id)
    save_app_reviews(filter_new_reviews(app_reviews), app_object)
Example #15
0
 def init_scrape_site(self):
     """Start a scrape for this market: fetch the first 200 reviews and
     keep the continuation token for subsequent pages.

     Fix: the bare ``except:`` (which also swallowed KeyboardInterrupt)
     is narrowed to ``except Exception``.
     """
     print("scraping... " + self.market_tag)
     self.scrape_result = []
     self.cont_token = None
     try:
         self.scrape_result, self.cont_token = reviews(
             self.app_url,
             lang=self.lang_tag,  # defaults to 'en'
             count=200)
     except Exception:
         print(self.market_tag + " FAILED!! ")
         return
     self.fail_count = 0
Example #16
0
    def get_android_reviews(self, app_id, review_count=200, country='gb'):
        """Fetch the newest ``review_count`` reviews for ``app_id`` and
        return them as a DataFrame indexed by review id.

        Raises:
            ValueError: when fetching fails (the message assumes the
                common cause of fewer reviews than requested).

        Fixes: the bare ``except`` is narrowed to ``Exception`` and the
        original cause is chained onto the raised ValueError.
        """
        try:
            result, continuation_token = reviews(
                app_id,
                lang='en',
                country=country,
                sort=Sort.NEWEST,  # defaults to Sort.MOST_RELEVANT
                count=review_count
            )

            # Second call resumes after the first page; only this
            # continuation batch is kept (the scraper's token flow).
            result, _ = reviews(
                app_id,
                continuation_token=continuation_token  # None would load from the beginning
            )
        except Exception as exc:
            raise ValueError('the %a has less than %r reviews. Try entering a lower review count.' % (app_id, review_count)) from exc

        review_entries = []
        for item in result:
            review_entries.append({
                'updated': item['at'],
                'id': item['reviewId'],
                'content': item['content'],
                'rating': item['score'],
                'version': item['reviewCreatedVersion'],
                'author': item['userName'],
                'OS': 'Android',
                'country': country,
            })

        # NOTE(review): a 'title' column is declared but never populated,
        # so it comes out as NaN -- confirm whether it is still needed.
        df = pd.DataFrame(review_entries, columns=['updated', 'id', 'title', 'content',
                                                   'rating', 'version', 'author', 'OS', 'country']).set_index('id')

        return df
Example #17
0
 def scrape_iter(self):
     """Fetch one more page of reviews using the stored continuation
     token, restoring the token on failure so the page can be retried.

     Fix: bare ``except:`` narrowed to ``except Exception``.
     """
     print("scraping... " + self.len_str() + "\t== " +
           self.scrape_result[-1]['content'][0:30])
     clean_token = self.cont_token
     try:
         new_reviews, self.cont_token = reviews(
             self.app_url, continuation_token=self.cont_token)
         self.fail_count = 0
         self.scrape_result += new_reviews
     except Exception:
         self.fail_count += 1
         # Roll the token back so the next attempt retries this page.
         self.cont_token = clean_token
         print("fail number " + str(self.fail_count) + " count " +
               self.len_str())
Example #18
0
def etl_google(number_of_reviews, fetch_size=100):
    """Scrape Tidal reviews from Google Play in pages of ``fetch_size``
    and load them into ``reviews_table`` via SQLAlchemy, sleeping a
    random 1-3 seconds between requests.  Exits the process on a
    request failure.
    """
    continuation_token = None
    total_reviews_scraped = 0
    n_requests = ceil(number_of_reviews / fetch_size)

    with engine.connect() as conn:
        for request_idx in range(n_requests):
            logging.info("This is {}th request".format(int(request_idx + 1)))

            # fetch data from google play store
            try:
                result, continuation_token = reviews(
                    'com.aspiro.tidal',
                    lang='en',  # defaults to 'en'
                    country='us',  # defaults to 'us'
                    sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
                    continuation_token=continuation_token,
                    count=fetch_size)
                logging.info("Retrieved total {} reviews so far".format(
                    int(len(result))))
            except Exception as e:
                logging.error("Failed to request data:" + str(e))
                sys.exit(1)

            # transform each review into a row dict for the insert
            this_data = [
                {
                    'id': None,
                    'username': review["userName"],
                    'date': parse(str(review["at"])).date(),
                    'rating': review["score"],
                    'origin': 2,
                    'review': review["content"],
                }
                for review in result
            ]

            # move data to sqlite
            conn.execute(reviews_table.insert(), this_data)

            total_reviews_scraped += int(len(this_data))
            logging.info(
                "Total {} reviews has been moved to db so far!".format(
                    total_reviews_scraped))

            pause = random.uniform(1, 3)
            logging.info("Sleeping for {}".format(pause))
            sleep(pause)
Example #19
0
def fetch_most_relevant_comments(app_id):
    """Fetch pt-BR most-relevant reviews for ``app_id`` and dump them to
    ``var/data/<app_id>/MOST_RELEVANT.csv``.

    Fixes: ``reviews()`` returns ``(list, token)`` -- the original
    passed the tuple to ``DataFrame.from_dict`` and its length check on
    the 2-tuple was always true.  Bare ``except:`` narrowed.
    """
    try:
        comments, _ = reviews(
            app_id,
            lang='pt-BR',
            country='br',
            sort=Sort.MOST_RELEVANT
        )
        if len(comments) > 0:
            df = pd.DataFrame.from_dict(comments)
            return df.to_csv(r'var/data/{0}/MOST_RELEVANT.csv'.format(app_id))
        else:
            print('not found')
    except Exception:
        # Best-effort scraping: failures are intentionally ignored.
        pass
def get_reviews(app_to_scrape):
    """Download the newest ``c.NUM_REVIEWS`` reviews for the app
    described by ``app_to_scrape`` (a dict with 'id', 'store_lang' and
    'store_country' keys); returns an empty list on failure.

    Fix: bare ``except:`` narrowed to ``except Exception``.
    """
    try:
        # Random 0-9s jitter to avoid hammering the endpoint.
        time.sleep(random.randrange(10))
        print("Downloading reviews for: " + app_to_scrape['id'])

        result, continuation_token = reviews(
            app_to_scrape['id'],
            lang=app_to_scrape['store_lang'],
            country=app_to_scrape['store_country'],
            sort=Sort.NEWEST,
            count=c.NUM_REVIEWS)
    except Exception:
        print(
            'It seems like we had some problems in fetching the reviews for: '
            + app_to_scrape['id'])
        result = list()
    return result
def get_data_gp():
    """Retrieve reviews and dates from Google Play, one batch per score
    in ``scores``, and return the combined list.

    Fixes: docstring typo ("Retreiving") and the over-indented fallback
    assignment inside the ``except`` clause.
    """
    result = []
    for score in scores:
        try:
            result_current, continuation_token = reviews(
                application_name,
                lang=language,
                country=country,
                sort=Sort.NEWEST,  # defaults to Sort.MOST_RELEVANT
                count=int(count / len(scores)),  # defaults to 100
                filter_score_with=score,  # defaults to None(means all score)
            )
        except IndexError:
            # Placeholder entry so downstream date handling still works.
            result_current = [{'content': 'Empty content Пустой контент', 'at': datetime.datetime(2020, 6, 6, 13, 41, 46)}]
        result.extend(result_current)
    return result
Example #22
0
def get_data_gp():
    """Collect Google Play reviews, one batch of ``count_per_score`` per
    score, print a summary line, and return the combined list."""
    collected = []
    for score in scores:
        try:
            batch, _token = reviews(
                application_name,
                lang=language,
                country=country,
                sort=Sort.NEWEST,  # defaults to Sort.MOST_RELEVANT
                count=count_per_score,  # defaults to 100
                filter_score_with=score,  # defaults to None(means all score)
            )
        except IndexError:
            # Placeholder entry when the score bucket has no reviews.
            batch = [{
                'content': 'Empty content Пустой контент',
                'at': datetime.datetime(2020, 6, 6, 13, 41, 46)
            }]
        collected.extend(batch)
    print(scores, count, count_per_score, filename, "NUMBER: ", len(collected))
    return collected
Example #23
0
def download_reviews(file):
    """Scrape en/us reviews for a fixed set of popular apps (all five
    scores, both sort orders), keep only the score and content columns,
    write them to ``file`` as CSV and return the DataFrame."""
    app_ids = [
        "com.spotify.music", "us.zoom.videomeetings", "com.instagram.android",
        "com.alphainventor.filemanager", "com.facebook.lite", "com.whatsapp",
        "com.netflix.mediaclient", "com.paypal.android.p2pmobile"
    ]
    collected = []
    for app_id in tqdm(app_ids):
        for score in range(1, 6):
            for order in [gps.Sort.MOST_RELEVANT, gps.Sort.NEWEST]:
                # Score 3 is over-sampled (200 vs 100 per order).
                batch, _ = gps.reviews(app_id,
                                       lang="en",
                                       country="us",
                                       sort=order,
                                       count=200 if score == 3 else 100,
                                       filter_score_with=score)
                collected.extend(batch)
    frame = pd.DataFrame(collected)
    frame = pd.concat([frame["score"], frame["content"]], axis=1)
    frame.to_csv(file, index=None, header=True)
    return frame
# Mean installs per rating bucket, sorted ascending, then plotted.
rat_int = df_app.groupby(['ratings'])['installs'].mean().sort_values()
sns.barplot(x=rat_int, y=rat_int.index, data=df_app)

# Installs are clearly higher for games with higher reviews and ratings,
# so to increase installs one should concentrate on improving ratings.

# In[113]:

# scrapping the app reviews
# For every package: all five scores, both sort orders; score 3 is
# over-sampled (200 vs 100) -- presumably to balance the neutral class.
app_reviews = []

for ap in tqdm(app_packages):
    for score in range(1, 6):
        for sort_order in [Sort.MOST_RELEVANT, Sort.NEWEST]:
            rvs = reviews(ap,
                          lang='fr',
                          country='fr',
                          sort=sort_order,
                          count=200 if score == 3 else 100,
                          filter_score_with=score)[0]  # [0] drops the continuation token
            for r in rvs:
                # Tag each review with how it was sampled.
                r['sortOrder'] = 'most_relevant' if sort_order == Sort.MOST_RELEVANT else 'newest'
                r['appId'] = ap
            app_reviews.extend(rvs)

# In[116]:

#save app information into pandas
df_reviews = pd.DataFrame(app_reviews)
df_reviews.head(1)

# In[117]:
Example #25
0
    def lookup(  # type: ignore[override]
            self, config: PlayStoreScrapperConfig,
            **kwargs) -> List[TextPayload]:
        """Fetch Play Store reviews newer than each country's
        ``since_time`` checkpoint and return them as TextPayloads,
        optionally persisting updated state under ``kwargs["id"]``.
        """
        source_responses: List[TextPayload] = []

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Optional[Dict[str,
                             Any]] = (None if id is None or self.store is None
                                      else self.store.get_source_state(id))
        update_state: bool = True if id else False
        state = state or dict()

        if config.countries is None or len(config.countries) == 0:
            logger.warning("`countries` in config should not be empty or None")
            return source_responses

        for country in config.countries:
            country_stat: Dict[str, Any] = state.get(country, dict())
            # Short values (<= 5 chars, e.g. a relative period) go through
            # convert_utc_time; longer ones are parsed as absolute
            # DATETIME_STRING_PATTERN timestamps.
            lookup_period: str = country_stat.get("since_time",
                                                  config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period,
                                               DATETIME_STRING_PATTERN)

            last_since_time: datetime = since_time

            # since_id: Optional[str] = country_stat.get("since_id", None)
            # last_index = since_id
            # state[scrapper.country] = country_stat

            continuation_token = None
            while True:
                store_reviews, continuation_token = reviews(
                    app_id=config.package_name,
                    lang=config.language,
                    country=country,
                    sort=Sort.NEWEST,
                    filter_score_with=config.filter_score_with,
                    continuation_token=continuation_token,
                    count=config.max_count,
                )
                store_reviews = store_reviews or []

                for review in store_reviews:
                    source_responses.append(
                        TextPayload(
                            processed_text=review["content"],
                            meta=review,
                            source_name=self.NAME,
                        ))
                    # Reviews are sorted newest-first: stop scanning this
                    # batch once one older than the checkpoint appears.
                    review_time = review["at"].replace(tzinfo=timezone.utc)

                    if since_time > review_time:
                        break

                    if last_since_time is None or last_since_time < review_time:
                        last_since_time = review_time
                    # if last_index is None or last_index < review.id:
                    #    last_index = review.id

                if (continuation_token is None
                        or continuation_token.token is None
                        or continuation_token.count <= len(source_responses)):
                    break

            # NOTE(review): country_stat is never written back into
            # `state`, so this checkpoint may not actually persist -- confirm.
            country_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN)
            # country_stat["since_id"] = last_index

        if update_state and self.store is not None:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses
Example #26
0
# print_json(app_infos[0])
# app_infos_df = pd.DataFrame(app_infos)
# app_infos_df.to_csv('apps_info.csv', index=None, header=True)


# scraping app reviews
# Scrape reviews for every package: all five scores, both sort orders,
# 300 reviews per combination.
app_reviews = []

for package in tqdm(app_packages):
    for star in range(1, 6):
        for order in [Sort.MOST_RELEVANT, Sort.NEWEST]:
            batch, _ = reviews(
                package,
                lang='en',
                country='us',
                sort=order,
                # count= 200 if score == 3 else 100,
                count=300,
                filter_score_with=star,
            )
            for entry in batch:
                entry['sortOrder'] = 'most_relevant' if order == Sort.MOST_RELEVANT else 'newest'
                entry['appId'] = package
            app_reviews.extend(batch)

# print_json(app_reviews[0])

# saving as csv
app_reviews_df = pd.DataFrame(app_reviews)
app_reviews_df.to_csv('../data/b_paid_raw.csv', index=None, header=True)
Example #27
0
def get_crawl_google(id, country_id):
    """Crawl up to 2000 reviews for app ``id`` in ``country_id``, clean
    them up and return a DataFrame with columns
    reviewId/review/version/rating/at.

    Fixes: removed the unreachable duplicate block after the first
    ``return`` and the no-op ``appinfo['title']`` expression.
    (``id`` shadows the builtin but is kept for caller compatibility.)
    """
    BATCH_SIZE = 50
    MAX_REVIEWS = 2000
    appinfo = app(id, lang='en', country=country_id)

    avail_reviews = appinfo.get('reviews')
    tofetch_reviews = min(avail_reviews, MAX_REVIEWS)
    batch_indices = list(range(max(tofetch_reviews // BATCH_SIZE, 1)))
    t = tqdm(total=tofetch_reviews)
    for i in batch_indices:
        # First iteration seeds `result` and the continuation token.
        if i == 0:
            result, continuation_token = reviews(id,
                                                 count=BATCH_SIZE,
                                                 country=country_id)
        res, continuation_token = reviews(
            id, count=BATCH_SIZE, continuation_token=continuation_token)
        result.extend(res)
        t.update(BATCH_SIZE)
    t.close()

    dfp = pd.DataFrame(result)
    dfp.drop_duplicates('reviewId', inplace=True)

    # creating the dataframe
    data = [
        dfp['reviewId'], dfp['content'], dfp['reviewCreatedVersion'],
        dfp['score'], dfp['at']
    ]
    headers = ['reviewId', 'review', 'version', 'rating', 'at']
    df_google = pd.concat(data, axis=1, keys=headers)

    # fill missing versions from the following row
    df_google['version'].fillna("null", inplace=True)
    for idx in range(len(df_google) - 1):
        if df_google['version'][idx] == 'null':
            df_google.loc[idx, 'version'] = df_google['version'][idx + 1]

    # drop version strings that lead to errors later (ex: '334280'):
    # keep only rows whose version has a '.' at index 1 or 2
    for i in range(len(df_google)):
        if "." in df_google['version'][i][1]:
            pass
        elif "." in df_google['version'][i][2]:
            pass
        else:
            df_google.drop(index=i, inplace=True)

    df_google.reset_index(drop=True, inplace=True)
    df_google['at'] = pd.to_datetime(df_google['at'])  # 'at' as datetime
    return df_google
Example #28
0
from google_play_scraper import Sort, reviews
# First call: 100 most-relevant one-star reviews for Penguin Isle; its
# continuation token is then reused to page a different app's reviews.
result, continuation_token = reviews(
    'com.fantome.penguinisle',
    lang='en',  # defaults to 'en'
    country='in',  # defaults to 'us'
    sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
    count=100,  # defaults to 100
    filter_score_with=1  # defaults to None(means all score)
)

# Passing `continuation_token` resumes crawling after the previous batch
# (None would load from the beginning).
result, _ = reviews(
    'com.zhiliaoapp.musically',
    continuation_token=continuation_token
)

for entry in result:
    print(entry["content"])
    print()
Example #29
0
    #     config['app_id']['via_google'],
    #     config['app_id']['getaround_google'],
    #     config['app_id']['olacabs_google'],
    config['app_id']['taxieu_google'],
    #     config['app_id']['freenow_google'],
    #     config['app_id']['yandexgo_google']
]

output_path = config['output_path']

# ## App Reviews

# For each app id: fetch up to 20000 most-relevant reviews, then fetch
# one follow-up page via the continuation token, and dump that page to
# a per-app CSV under output_path.
for app_id in list_of_app_id:
    result, continuation_token = reviews(
        app_id,
        lang='en',  # defaults to 'en'
        sort=Sort.MOST_RELEVANT,  # defaults to Sort.MOST_RELEVANT
        count=20000,  # defaults to 100
        filter_score_with=None  # defaults to None(means all score)
    )

    # Only the continuation batch is kept and written out.
    result, _ = reviews(
        app_id,
        continuation_token=continuation_token
    )

    df = pd.json_normalize(result)
    df.to_csv(output_path + app_id + '_google_playstore_review.csv')
Example #30
0
    def lookup(
        self, config: PlayStoreScrapperConfig, **kwargs
    ) -> List[AnalyzerRequest]:
        """Fetch Play Store reviews newer than each country's
        ``since_time`` checkpoint and return them as AnalyzerRequests,
        optionally persisting updated state under ``kwargs["id"]``.
        """
        source_responses: List[AnalyzerRequest] = []

        # Get data from state
        id: str = kwargs.get("id", None)
        state: Dict[str, Any] = None if id is None else self.store.get_source_state(id)
        update_state: bool = True if id else False
        state = state or dict()

        for country in config.countries:
            country_stat: Dict[str, Any] = state.get(country, dict())
            # Short values (<= 5 chars, e.g. a relative period) go through
            # convert_utc_time; longer values are parsed as absolute
            # DATETIME_STRING_PATTERN timestamps.
            lookup_period: str = country_stat.get("since_time", config.lookup_period)
            lookup_period = lookup_period or DEFAULT_LOOKUP_PERIOD
            if len(lookup_period) <= 5:
                since_time = convert_utc_time(lookup_period)
            else:
                since_time = datetime.strptime(lookup_period, DATETIME_STRING_PATTERN)

            last_since_time: datetime = since_time

            # since_id: Optional[str] = country_stat.get("since_id", None)
            # last_index = since_id
            # state[scrapper.country] = country_stat

            continuation_token: Optional[ContinuationToken] = None
            while True:
                store_reviews, continuation_token = reviews(
                    app_id=config.package_name,
                    lang=config.language,
                    country=country,
                    sort=Sort.NEWEST,
                    filter_score_with=config.filter_score_with,
                    continuation_token=continuation_token,
                    count=config.max_count,
                )
                store_reviews = store_reviews or []

                for review in store_reviews:
                    source_responses.append(
                        AnalyzerRequest(
                            processed_text=review["content"],
                            meta=review,
                            source_name=self.NAME,
                        )
                    )

                    # Reviews are sorted newest-first: stop scanning this
                    # batch once one older than the checkpoint appears.
                    if since_time > review["at"]:
                        break

                    if last_since_time is None or last_since_time < review["at"]:
                        last_since_time = review["at"]
                    # if last_index is None or last_index < review.id:
                    #    last_index = review.id

                if (
                    continuation_token is None
                    or continuation_token.token is None
                    or continuation_token.count <= len(source_responses)
                ):
                    break

            # NOTE(review): country_stat is never written back into
            # `state`, so this checkpoint may not actually persist -- confirm.
            country_stat["since_time"] = last_since_time.strftime(
                DATETIME_STRING_PATTERN
            )
            # country_stat["since_id"] = last_index

        if update_state:
            self.store.update_source_state(workflow_id=id, state=state)

        return source_responses