def get_spots():
    """Collect spot links for every city in the "city" sheet into the "spot" sheet.

    For each city row, the city's listing is opened page by page
    (``?page=1``, ``?page=2``, ...) with a fresh driver per page.  Every
    ``<li><a>`` found under ``<main>`` whose link text is not already a known
    spot name is appended as a new row with the keys ``city``, ``spot``,
    ``page`` and ``href``.

    Paging for a city ends at the first exception.  In particular, a page
    without a ``<main>`` element leaves ``main_tag`` as ``None``, so the
    ``find_all`` call raises ``AttributeError`` and the loop stops.

    The combined rows are written back to the "spot" sheet.
    """
    response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "city", "FORMULA")
    _, city_list = gspread.convert_to_dict_data(response)
    response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "spot", "FORMULA")
    label_list, spot_list = gspread.convert_to_dict_data(response)
    spot_names = {spot.get('spot') for spot in spot_list}
    # (name, href) pairs appended during this run, so a link that shows up on
    # more than one page is only recorded once.
    added = set()

    for city in city_list:
        num = 1
        new_num = 0
        while True:
            # Reset per page so `finally` never touches an unbound name or a
            # driver that an earlier iteration has already quit.
            driver = None
            try:
                driver = get_driver()
                page = '?page=%s' % (num)
                print(page)
                driver.get(BASE_URL + city['href'] + page)
                sleep(1)
                soup = BeautifulSoup(driver.page_source, "lxml")
                main_tag = soup.find("main")
                for li in main_tag.find_all("li"):
                    a_tag = li.find("a")
                    if not a_tag:
                        continue
                    spot = a_tag.text
                    if spot in spot_names:
                        continue
                    href = a_tag.get('href')
                    if (spot, href) in added:
                        continue
                    added.add((spot, href))
                    spot_list.append({
                        'city': city['city'],
                        'spot': spot,
                        'page': num,
                        'href': href,
                    })
                    print("NEW!", spot)
                    new_num += 1
                num += 1
            except Exception as e:
                # Any failure (including running past the last page) ends
                # paging for this city; the next city is still processed.
                pprint(e)
                break
            finally:
                if driver is not None:
                    driver.quit()
        print("NEW", new_num)

    # NOTE(review): the sheet is written once, after all cities have been
    # scanned -- confirm this is the intended save cadence.
    values = gspread.convert_to_sheet_values(label_list, spot_list)
    body = {'values': values}
    gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'spot', body)
    print("SUCCESS!! get_spots")
def update_languages():
    """Fill the ``languages`` column of the "hashtag" sheet.

    Each row's ``name`` is passed to ``detect_langs``; the ``lang`` attribute
    of every returned candidate is joined with commas and stored in the row's
    ``languages`` field.  Rows for which detection raises are reported and
    left unchanged.  The whole sheet is then written back.
    """
    sheet = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "hashtag", "FORMULA")
    labels, rows = gspread.convert_to_dict_data(sheet)

    for row in rows:
        tag_name = row['name']
        print(tag_name)
        try:
            languages = [candidate.lang for candidate in detect_langs(tag_name)]
            print(languages)
        except Exception as e:
            # Detection failed for this name: report it and move on.
            print(e)
            continue
        row['languages'] = ','.join(languages)

    sheet_values = gspread.convert_to_sheet_values(labels, rows)
    gspread.update_sheet_values(
        SHEET_ID_INSTAGRAM, 'hashtag', {'values': sheet_values})
    print("SUCCESS!! update_languages")
def update_hashtag():
    """Merge the result of ``get_hashtag()`` into the "hashtag" sheet.

    Items whose ``name`` already exists in the sheet update that row in
    place (the first row with that name wins); unknown names are appended as
    new rows.  The rows are then sorted by ``num`` in descending order
    (missing or falsy ``num`` counts as 0) and written back.
    """
    response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "hashtag", "FORMULA")
    label_list, hashtag_list = gspread.convert_to_dict_data(response)

    # name -> index of the first row carrying that name, so each incoming item
    # is matched with one dict lookup instead of a scan over every row.
    index_by_name = {}
    for index, hashtag in enumerate(hashtag_list):
        index_by_name.setdefault(hashtag.get('name'), index)

    new_num = 0
    for d in get_hashtag():
        name = d['name']
        index = index_by_name.get(name)
        if index is None:
            # Register the new row so a later item with the same name
            # updates it instead of being appended again.
            index_by_name[name] = len(hashtag_list)
            hashtag_list.append(d)
            print("NEW!!:", d.get('page'), d.get('name'))
            new_num += 1
            continue
        hashtag_list[index].update(d)
    print("new:", new_num)

    hashtag_list = sorted(hashtag_list,
                          key=lambda k: k.get('num', 0) or 0,
                          reverse=True)
    body = {
        'values': gspread.convert_to_sheet_values(label_list, hashtag_list)
    }
    gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'hashtag', body)
    print("SUCCESS!! update_hashtag")
def get_sheet_values_list(sheet_name):
    """Return the rows of ``sheet_name`` as a list of dicts, cached in Redis.

    The Redis key is the sheet name itself.  On a hit the cached JSON is
    decoded and returned; on a miss the sheet is read, converted with
    ``gspread.convert_to_dict_data``, stored as JSON with an expiry of
    ``EXPIRE`` and returned.
    """
    client = redis.from_url(REDIS_URL)
    cached = client.get(sheet_name)
    if cached:
        print("cache HIT!! %s" % (sheet_name))
        return json.loads(cached.decode())

    sheet = gspread.get_sheet_values(SHEET_ID, sheet_name)
    _, rows = gspread.convert_to_dict_data(sheet)
    client.set(sheet_name, json.dumps(rows), ex=EXPIRE)
    return rows
def get_location_japan():
    """Collect city links from ``/explore/locations/JP/`` into the "city" sheet.

    The listing is opened page by page (``?page=1``, ``?page=2``, ...) with
    a fresh driver per page.  Every ``<li><a>`` found under ``<main>`` whose
    link text is not already a known city name is appended as a new row with
    the keys ``city``, ``page`` and ``href``.

    Paging ends at the first exception.  In particular, a page without a
    ``<main>`` element leaves ``main_tag`` as ``None``, so the ``find_all``
    call raises ``AttributeError`` and the loop stops.  The combined rows are
    then written back to the "city" sheet.
    """
    response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "city", "FORMULA")
    label_list, city_list = gspread.convert_to_dict_data(response)
    city_names = {city.get('city') for city in city_list}
    # (name, href) pairs appended during this run, so a link that shows up on
    # more than one page is only recorded once.
    added = set()

    url = "/explore/locations/JP/"
    print(url)
    num = 1
    while True:
        # Reset per page so `finally` never touches an unbound name or a
        # driver that an earlier iteration has already quit.
        driver = None
        try:
            driver = get_driver()
            page = '?page=%s' % (num)
            print("page:", num)
            driver.get(BASE_URL + url + page)
            sleep(1)
            soup = BeautifulSoup(driver.page_source, "lxml")
            main_tag = soup.find("main")
            for li in main_tag.find_all("li"):
                a_tag = li.find("a")
                if not a_tag:
                    continue
                city = a_tag.text
                if city in city_names:
                    continue
                href = a_tag.get('href')
                if (city, href) in added:
                    continue
                added.add((city, href))
                city_list.append({
                    'city': city,
                    'page': num,
                    'href': href,
                })
                print("NEW!", city)
            num += 1
        except Exception as e:
            # Any failure (including running past the last page) ends paging.
            pprint(e)
            break
        finally:
            if driver is not None:
                driver.quit()

    body = {'values': gspread.convert_to_sheet_values(label_list, city_list)}
    gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'city', body)
    print("SUCCESS!! get_location_japan")
def add_hashtag_detail():
    """Fetch details for every "hashtag" row that has no ``num`` yet.

    Logs in through ``login_url``, then walks the "hashtag" sheet.  Rows
    that already have a truthy ``num`` are skipped; every other row is
    updated with the dict returned by ``get_hashtag_detail(driver, name)``.
    The sheet is saved after every 100 processed rows.  At the end the rows
    are sorted by ``num`` descending and saved once more.

    Any exception is printed and swallowed; the driver is always quit if it
    was created.
    """
    # Bound before `try` so `finally` is safe even if get_driver() fails.
    driver = None
    try:
        driver = get_driver()

        # Login
        print("LOGIN START!!")
        driver.get(login_url)
        usernameField = driver.find_element_by_xpath(usernamePath)
        usernameField.send_keys(INSTAGRAM_USERNAME)
        passwordField = driver.find_element_by_xpath(passwordPath)
        passwordField.send_keys(INSTAGRAM_PASSWORD)
        passwordField.send_keys(Keys.RETURN)
        sleep(30)
        print("LOGIN FINISH!!")

        response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "hashtag", "FORMULA")
        label_list, hashtag_list = gspread.convert_to_dict_data(response)

        processed = 0
        for index, hashtag in enumerate(hashtag_list):
            # Show progress
            if index % 100 == 0:
                print("index:", index)

            if hashtag.get('num'):
                continue

            hashtag.update(get_hashtag_detail(driver, hashtag['name']))
            processed += 1

            # Save every 100 processed rows.  The check sits after the
            # increment so skipped rows cannot re-trigger a save while the
            # counter rests on a multiple of 100.
            if processed % 100 == 0:
                body = {
                    'values': gspread.convert_to_sheet_values(label_list, hashtag_list)
                }
                gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'hashtag', body)
                print("count:", processed)

        print("new:", processed)
        hashtag_list = sorted(hashtag_list,
                              key=lambda k: k.get('num', 0) or 0,
                              reverse=True)
        body = {
            'values': gspread.convert_to_sheet_values(label_list, hashtag_list)
        }
        gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'hashtag', body)
        print("SUCCESS!! add_hashtag_detail")
    except Exception as e:
        pprint(e)
    finally:
        if driver is not None:
            driver.quit()
def add_hashtag_list(limit=5):
    """Grow the "hashtag" sheet with tags related to its Japanese hashtags.

    Logs in through ``login_url``, then looks at the first ``limit`` rows of
    the "hashtag" sheet.  For each row whose ``languages`` contains ``'ja'``,
    ``get_hashtag_detail(driver, name)`` is called and every entry of its
    ``hashtag_set`` that is not yet in the sheet is appended as a new row
    (``name``, ``update_at``).  The sheet is saved after every 100 processed
    rows.  At the end the rows are sorted by ``num`` descending and saved.

    Args:
        limit: Number of leading rows to examine.  Defaults to 5, the value
            that used to be hard-coded; pass ``None`` to examine every row.

    Any exception is printed and swallowed; the driver is always quit if it
    was created.
    """
    # Bound before `try` so `finally` is safe even if get_driver() fails.
    driver = None
    try:
        driver = get_driver()

        # Login
        print("LOGIN START!!")
        driver.get(login_url)
        usernameField = driver.find_element_by_xpath(usernamePath)
        usernameField.send_keys(INSTAGRAM_USERNAME)
        passwordField = driver.find_element_by_xpath(passwordPath)
        passwordField.send_keys(INSTAGRAM_PASSWORD)
        passwordField.send_keys(Keys.RETURN)
        # NOTE(review): unlike add_hashtag_detail there is no wait after
        # submitting the form -- confirm the login has completed by the time
        # the first hashtag page is requested.
        print("LOGIN FINISH!!")

        response = gspread.get_sheet_values(SHEET_ID_INSTAGRAM, "hashtag", "FORMULA")
        label_list, hashtag_list = gspread.convert_to_dict_data(response)

        # Names already in the sheet (kept current as rows are appended), so
        # each candidate tag is checked with one set lookup.
        known_names = {hashtag.get('name') for hashtag in hashtag_list}

        processed = 0
        new_num = 0
        # The slice is a snapshot, so appending to hashtag_list below does not
        # affect the iteration.
        for index, hashtag in enumerate(hashtag_list[:limit]):
            # Show progress
            if index % 100 == 0:
                print("index:", index)

            # Rows without a languages value are treated as non-Japanese
            # instead of aborting the whole run.
            if 'ja' not in (hashtag.get('languages') or ''):
                continue

            data = get_hashtag_detail(driver, hashtag['name'])
            for new_tag in data.get('hashtag_set', set()):
                if new_tag in known_names:
                    continue
                known_names.add(new_tag)
                hashtag_list.append({
                    'name': new_tag,
                    'update_at': data.get('update_at'),
                })
                print(new_tag)
                new_num += 1
            processed += 1

            # Save every 100 processed rows.  The check sits after the
            # increment so skipped rows cannot re-trigger a save while the
            # counter rests on a multiple of 100.
            if processed % 100 == 0:
                body = {
                    'values': gspread.convert_to_sheet_values(label_list, hashtag_list)
                }
                gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'hashtag', body)
                print("count:", processed)

        hashtag_list = sorted(hashtag_list,
                              key=lambda k: k.get('num', 0) or 0,
                              reverse=True)
        body = {
            'values': gspread.convert_to_sheet_values(label_list, hashtag_list)
        }
        gspread.update_sheet_values(SHEET_ID_INSTAGRAM, 'hashtag', body)
        print("new:", new_num)
        print("SUCCESS!! add_hashtag_list")
    except Exception as e:
        pprint(e)
    finally:
        if driver is not None:
            driver.quit()
def get_users_by_chache(params, sheet_name, expire=EXPIRE):
    """Return one page of users from ``sheet_name``, cached in Redis.

    Args:
        params: Mapping of request options.  Keys read here:

            * ``sort`` -- field name; users are ordered by its integer value,
              largest first.
            * ``options`` -- iterable of filter tokens.  ``'0'``, ``'1'`` and
              ``'2'`` are matched against each user's ``gender``;
              ``'バッジなし'`` keeps users with no ``custom_verify``; any
              other non-empty token is matched against ``custom_verify``.
            * ``page`` -- 1-based page number; when missing or falsy the
              first ``PER_PAGE`` users are returned.
        sheet_name: Sheet to read on a cache miss.
        expire: Expiry passed as ``ex`` when the response is cached.

    Returns:
        dict with ``paging`` (the result of ``create_paging_data``) and
        ``user_list`` (the users of the requested page, reduced to the keys
        in ``allowed_keys``).  On a cache hit the decoded cached value is
        returned instead.
    """
    print(params)
    # NOTE(review): the cache key is built from params alone; sheet_name is
    # not part of it, so identical params for two sheets share one entry.
    # Confirm callers never do that.
    key = str(params)
    r = redis.from_url(REDIS_URL)
    rcache = r.get(key)
    if rcache:
        print("cache HIT!! %s" % (key))
        return json.loads(rcache.decode())

    response = gspread.get_sheet_values(SHEET_ID, sheet_name)
    _, person_list = gspread.convert_to_dict_data(response)

    # Only users with a share URL and at least one post are listed.
    person_list = [
        user for user in person_list
        if user.get('share_url') and int(user.get('aweme_count') or 0) > 0
    ]

    sort_field = params.get('sort')
    if sort_field:
        person_list = sorted(person_list,
                             key=lambda k: int(k.get(sort_field, 0) or 0),
                             reverse=True)

    # Split the option tokens into gender codes, badge names and the
    # "no badge" flag.
    gender = []
    account = []
    is_none = False
    for option in params.get('options') or []:
        if not option:
            # Empty tokens carry no filter.
            continue
        if option in ('0', '1', '2'):
            gender.append(option)
        elif option == 'バッジなし':
            # Exact match: a substring test would also fire for fragments of
            # the label and for the empty string.
            is_none = True
        else:
            account.append(option)

    if gender:
        person_list = [
            user for user in person_list if user.get('gender') in gender
        ]
    if account or is_none:
        # Keep users whose badge is one of the requested ones, plus users
        # without any badge when "no badge" was requested.
        person_list = [
            user for user in person_list
            if user.get('custom_verify') in account
            or (is_none and not user.get('custom_verify'))
        ]

    # Rank within the filtered, sorted list.
    for index, person in enumerate(person_list):
        person['index'] = index

    page = int(params['page']) if params.get('page') else None
    # Page 1 starts at index 0, so the no-page default must too; starting at
    # 1 silently dropped the first user.
    start_num = PER_PAGE * (page - 1) if page else 0
    end_num = start_num + PER_PAGE

    result = []
    for user in person_list[start_num:end_num]:
        # Return only the allowed keys
        data = {k: v for k, v in user.items() if k in allowed_keys}
        thumb = data.get('avatar_thumb')
        if isinstance(thumb, str):
            data['avatar_thumb'] = thumb.replace('.webp', '.jpeg')
        result.append(data)

    response = {
        'paging': create_paging_data(len(person_list), page),
        'user_list': result,
    }
    r.set(key, json.dumps(response), ex=expire)
    return response