import time

from twitter_scraper import Profile

# `tool` and `mysql` are project-specific helper modules.


def crawler_item(browser, user_name: str, media_id: int, media_name: str, mt, xpath):
    """Crawl profile information for a single account.

    :param user_name: <str> account name
    :param media_id: <int> media ID
    :param media_name: <str> media name
    :return: <None> results are written to the database
    """
    # Fetch the account profile with the twitter-scraper package
    # (its followers/following counts may be wrong)
    try:
        profile = Profile(user_name)
    except Exception:
        print("Account does not exist!")
        return
    writing_item = profile.to_dict()
    writing_item["media_id"] = media_id
    writing_item["media_name"] = media_name

    # Scrape the account's followers and following counts (Selenium crawler)
    browser.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    following_count = None
    followers_count = None
    try:
        following_count = browser.find_element_by_xpath(
            xpath["following_count"][0]).get_attribute("title")
        followers_count = browser.find_element_by_xpath(
            xpath["followers_count"][0]).get_attribute("title")
    except Exception:
        # Fall back to the alternate XPath variants.
        try:
            following_count = browser.find_element_by_xpath(
                xpath["following_count"][1]).get_attribute("title")
            followers_count = browser.find_element_by_xpath(
                xpath["followers_count"][1]).get_attribute("title")
        except Exception:
            print("Selenium failed to scrape the followers/following counts!")

    # Correct the scraped profile using the Selenium results
    if following_count is not None:
        following_count = following_count.replace(",", "")
        print("Corrected following count:",
              writing_item["following_count"], "→", following_count)
        writing_item["following_count"] = following_count
    if followers_count is not None:
        followers_count = followers_count.replace(",", "")
        print("Corrected followers count:",
              writing_item["followers_count"], "→", followers_count)
        writing_item["followers_count"] = followers_count

    # Write the record to the database
    writing_list = [writing_item]
    write_num = mysql.insert_pure(mt, writing_list)
    print("Records stored:", write_num)
    print(writing_list)
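# A minimal invocation sketch for crawler_item, assuming a Chrome WebDriver.
# The XPath strings and the "example_table" target below are hypothetical
# placeholders; the real values come from the project's configuration.
from selenium import webdriver

xpath_config = {
    "following_count": [
        "//a[contains(@href, '/following')]/span/span",  # hypothetical primary XPath
        "//a[contains(@href, '/following')]//span",      # hypothetical fallback XPath
    ],
    "followers_count": [
        "//a[contains(@href, '/followers')]/span/span",  # hypothetical primary XPath
        "//a[contains(@href, '/followers')]//span",      # hypothetical fallback XPath
    ],
}

driver = webdriver.Chrome()
crawler_item(driver, "jack", 1, "Twitter", "example_table", xpath_config)
driver.quit()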
from twitter_scraper import Profile


def get_user_data(username: str) -> dict:
    """Gets user data.

    Args:
        username (str): Twitter username of the account.

    Returns:
        dict: dictionary containing user info.
    """
    profile = Profile(username=username)
    return profile.to_dict()
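# Usage sketch for get_user_data; the keys follow twitter_scraper's
# Profile.to_dict() output (e.g. "name", "biography", "followers_count").
user = get_user_data("jack")
print(user["name"], user["followers_count"])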
from twitter_scraper import Profile


def twitter_scraper(username):
    try:
        profile = Profile(username)
        data = profile.to_dict()
        # Rename and normalize fields for downstream use.
        data['pfp_url'] = data.pop('profile_photo')
        data['website'] = 'https://' + data['website'] if data['website'] else ''
        data['url'] = f'https://twitter.com/{username}'
        data.pop('username')
        return data
    except Exception:
        return "User not found"
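# Usage sketch for twitter_scraper: on failure it returns the string
# "User not found" rather than a dict, so callers should check the type.
result = twitter_scraper("jack")
if isinstance(result, dict):
    print(result["url"], result["pfp_url"])
else:
    print(result)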
import sys

from twitter_scraper import Profile, get_tweets
from twitter_functions import process_tweets

try:
    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
    profile_name = args[0]
    tweets = {}
    if "--profilename" in opts or "-p" in opts:
        profile = Profile(profile_name)
        profile.to_dict()
        for tweet in get_tweets(profile_name, pages=1):
            if tweet['entries']['photos']:
                # Keep only the text before the " pic" photo-URL suffix.
                sep = ' pic'
                tweet_text = tweet['text'].split(sep, 1)[0]
                tweets.update([(tweet_text, tweet['entries']['photos'][0])])
        process_tweets(tweets)
except Exception:
    raise SystemExit(f"Usage: {sys.argv[0]} -p twitter_handle")
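# twitter_functions is a project-local module; a minimal sketch of what its
# process_tweets might look like, assuming it consumes the
# {tweet_text: photo_url} mapping built above (hypothetical implementation).
def process_tweets(tweets: dict) -> None:
    # Print each tweet's caption alongside its photo URL.
    for text, photo_url in tweets.items():
        print(f"{text}: {photo_url}")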
from twitter_scraper import Profile


def scrap_bio(username):
    # Return the account's biography field from the scraped profile.
    profile = Profile(username)
    return profile.to_dict()['biography']
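# Usage sketch for scrap_bio, assuming "jack" is an existing public account.
print(scrap_bio("jack"))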