def get_app_details(app_id):
    """Scrape the Google Play store page for *app_id* and return its details.

    Parameters
    ----------
    app_id : str
        Package id of the app (e.g. "com.example.app").

    Returns
    -------
    tuple
        ``(app_id, details)`` where *details* is a dict of scraped fields,
        or ``(app_id, None)`` when the page could not be fetched.
    """
    details = {}
    app_page_url = "https://play.google.com/store/apps/details?id=" + app_id + "&hl=en_GB"
    raw_html = simple_get(app_page_url)
    if not raw_html:
        # Fetch failed; signal it with a None payload so callers can skip.
        return app_id, None
    soup = BeautifulSoup(raw_html, 'html.parser')

    def _text(tag):
        # Safe text extraction: the original chained ``.text`` directly and
        # raised AttributeError whenever an expected element was missing.
        return tag.text.strip() if tag else None

    details['title'] = _text(soup.find("div", {"class": "id-app-title"}))
    details['genre'] = _text(soup.find("span", {"itemprop": "genre"}))
    desc = soup.find("div", {"itemprop": "description"})
    if desc:
        # Kept from the original: normalise literal '<br>' remnants.
        details['description'] = desc.text.replace('<br>', '\n').strip()
    else:
        details['description'] = None
    score = soup.find("div", {"class": "score"})
    details['score'] = score.text.strip() if score else None
    for info in soup.findAll("div", {"class": "meta-info"}):
        info_title = info.find("div", {"class": "title"})
        info_content = info.find(attrs={"class": "content"})
        # Skip malformed meta-info cards instead of crashing.
        if info_title is not None and info_content is not None:
            details[info_title.text.strip()] = info_content.text.strip()
    print("App details retrieved for app with id '{}'".format(app_id))
    return app_id, details
def get_html(url):
    """Fetch *url* via ``get_url.simple_get``.

    Parameters
    ----------
    url : str
        URL to download.

    Returns
    -------
    The raw HTML on success, or ``None`` when the request fails.
    """
    print('Getting URL...')
    try:
        raw_html = get_url.simple_get(url)
        print('Retrieved URL!')
    except Exception:
        # The original bare ``except:`` also swallowed SystemExit and
        # KeyboardInterrupt; Exception keeps the best-effort behaviour
        # without hiding interpreter-level signals.
        return None
    return raw_html
def get_similar_properties(Zoopla_url):
    """Return the anchor tags of the 'similar properties' cards on a
    Zoopla listing page (only anchors that carry an href)."""
    page_source = simple_get(Zoopla_url)
    parsed = BeautifulSoup(page_source, 'html.parser')
    return parsed.find_all("a", class_="ui-property-card__link", href=True)
def get_apps_on_page(url):
    """Collect the set of app package ids linked from a Play-store page.

    Parameters
    ----------
    url : str
        Page to scrape.

    Returns
    -------
    set
        Unique app ids taken from "/store/apps/details?id=..." links.
    """
    app_ids = set()
    signature = "/store/apps/details?id="
    raw_html = simple_get(url, use_proxies=False)
    soup = BeautifulSoup(raw_html, 'html.parser')
    # 'a[href]' skips anchors without an href attribute; the original
    # indexed a['href'] unconditionally and raised KeyError on such anchors.
    for a in soup.select('a[href]'):
        href = a['href']
        if href.startswith(signature):
            app_ids.add(href[len(signature):])
    return app_ids
def get_sub_categorys(url):
    """Map sub-category names to their URLs from a Play-store category page.

    Skips the "See more" and "Recommended for you" pseudo-categories.

    Parameters
    ----------
    url : str
        Category page to scrape.

    Returns
    -------
    dict
        Sub-category name -> href.
    """
    sub_categories = {}
    skip = ("See more", "Recommended for you")
    raw_html = simple_get(url, use_proxies=False)
    html = BeautifulSoup(raw_html, 'html.parser')
    for h2 in html.findAll('h2'):
        for a in h2.select('a'):
            name = a.text.strip()
            # The original rebound the ``url`` parameter inside this loop;
            # a distinct local name avoids the shadowing.
            link = a['href']
            if name not in skip:
                sub_categories[name] = link
    return sub_categories
def get_house_views(Zoopla_url):
    """Return page-view counts for a Zoopla house listing.

    For a given house URL on Zoopla, returns a tuple of
    (views in the last 30 days, views since the house was listed).
    """
    listing_html = simple_get(Zoopla_url)
    soup = BeautifulSoup(listing_html, 'html.parser')
    counters = soup.find_all("span", class_="dp-view-count__value")
    # First counter is the 30-day figure, second is the all-time figure.
    return (get_int_from_string(counters[0].text),
            get_int_from_string(counters[1].text))
def get_houses_in_postcode_segment(postcode):
    """Return the average 30-day view count across listings in *postcode*.

    Queries Zoopla's for-sale search (page size 100) and averages the
    last-30-day view counts of every listing found; returns 0 when the
    search yields no listings.
    """
    zoopla_url = "https://www.zoopla.co.uk/for-sale/property/{}/?page_size=100".format(
        postcode)
    results_page = BeautifulSoup(simple_get(zoopla_url), 'html.parser')
    view_counts = []
    for heading in results_page.find_all("h2", class_="listing-results-attr"):
        relative_url = heading.find('a', href=True)['href']
        listing_url = "https://www.zoopla.co.uk{}".format(relative_url)
        last_30, _all_time = get_house_views(listing_url)
        # Fetched as in the original (result unused; kept for identical
        # request side effects).
        _address = get_address(listing_url)
        view_counts.append(last_30)
    if not view_counts:
        return 0
    return sum(view_counts) / len(view_counts)
# NOTE(review): collapsed fragment. It begins with the tail of a helper —
# presumably ``check_ignore_list(str1, time)``, whose ``def`` line is outside
# this view — followed by the script's main loop, which fetches a menu page
# per location/time (times offset by 1173) and prints each station's meal
# items unless the station is in the ignore list. Code left byte-identical;
# reconstruct only once the missing ``def`` line is visible.
if str(int(time) - 1173) in ignore_list[str1]: return True return False for location in location_list: print("============================") print(location['loc'] + ':') print("============================") date = datetime.datetime.now().strftime("%m%%2F%d%%2F%Y") for time in location['time']: print("----------------------------") print("Time: " + time) print("----------------------------") time = str(int(time) + 1173) #print(base_url.format(location['loc'], time, date)) raw_html = simple_get(base_url.format(location['loc'], time, date)) html = BeautifulSoup(raw_html, 'html.parser') for station in html.find_all(class_="menu__station"): station_name = station.find('h2').text if not check_ignore_list(station_name, time): print(station_name + ":") for meal_item in station.find_all("li"): name = meal_item.find(class_="item__name") name_pre = "\t- " if name and not name.find("a"): print(name_pre + name.string.strip()) elif name and name.find("a"): print(name_pre + name.find("a").string.strip())
# NOTE(review): collapsed fragment. It begins with the tail of a function
# body equivalent to ``get_apps_on_page`` (its ``def`` line is outside this
# view) and continues into the script's ``__main__`` entry point, which
# fetches the Play-store front page, prints the categories, and starts a
# per-category crawl; the loop body is truncated at the end of this view.
# Code left byte-identical pending sight of the surrounding lines.
raw_html = simple_get(url, use_proxies=False) soup = BeautifulSoup(raw_html, 'html.parser') for a in soup.select('a'): signature = "/store/apps/details?id=" if a['href'].startswith(signature): app_id = a['href'][len(signature):] app_ids.add(app_id) return app_ids if __name__ == "__main__": main_page_url = "https://play.google.com" # get main categories app_store_page = main_page_url + "/store/apps" raw_html = simple_get(app_store_page, use_proxies=False) categories = get_app_categories(raw_html) pprint(categories) # get sub categories (test) # url = "https://play.google.com/store/apps/category/COMMUNICATION" # sub_cats = get_sub_categorys(url) # get all app ids from page (test) # url = "https://play.google.com/store/apps/collection/promotion_cp_messaging_apps?clp=SjIKIQobcHJvbW90aW9uX2NwX21lc3NhZ2luZ19hcHBzEAcYAxINQ09NTVVOSUNBVElPTg%3D%3D:S:ANO1ljIyLhU" # print(get_apps_on_page(url)) # get app ids from multiple pages all_app_ids = set() for category in categories: c_url = main_page_url + category
def get_address(Zoopla_url):
    """Return the display address of a Zoopla house listing."""
    soup = BeautifulSoup(simple_get(Zoopla_url), 'html.parser')
    matches = soup.find_all("h2", class_="ui-property-summary__address")
    return matches[0].text