def watch_twitter(self, account: dict) -> None:
    # Poll in a loop rather than recursing at the end of each pass, which
    # would eventually overflow the stack on a long-running watcher.
    while True:
        current_data = Profile(account["username"]).to_dict()
        del current_data["username"]

        # Rename '*_count' keys, e.g. 'likes_count' -> 'likes'
        for key, value in current_data.copy().items():
            if key.endswith("_count"):
                current_data[key.split("_")[0]] = value
                del current_data[key]

        last_data = (models.Twitter.select()
                     .where(models.Twitter.username == account["username"])
                     .order_by(-models.Twitter.timestamp)
                     .limit(1))
        if last_data:
            _changed_datas = self.__compare_datas(
                current_data, last_data[0], account.get("ignored_metrics", []))
            if _changed_datas:
                table = self.__create_table("twitter", account["username"],
                                            _changed_datas)
                self.send_message(f"```\n {table} \n```")
            elif not account.get("only_if_changed", True):
                self.send_message(
                    f"Nothing changed for twitter/**{account['username']}**")

        with self.lock:
            models.Twitter.create(username=account["username"],
                                  data=current_data)

        time.sleep(account.get("interval", self.default_interval))
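# Usage sketch for watch_twitter(), with the account dict shape inferred from
# the .get() lookups above; the optional keys' values and the Watcher class
# name below are assumptions, not confirmed API.
example_account = {
    "username": "jack",            # required: the handle to watch
    "ignored_metrics": ["likes"],  # keys excluded from change detection
    "only_if_changed": True,       # skip "nothing changed" messages
    "interval": 300,               # seconds between polls
}
# Watcher().watch_twitter(example_account)  # hypothetical host class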
def crawler_item(browser, user_name: str, media_id: int, media_name: str, mt, xpath):
    """Scrape the profile information of a single account.

    :param user_name: <str> account name
    :param media_id: <int> media ID
    :param media_name: <str> media name
    :return: <None> results are written to the database
    """
    # Fetch account info with the twitter-scraper package
    # (followers/following counts may be wrong)
    try:
        profile = Profile(user_name)
    except Exception:
        print("Account does not exist!")
        return
    writing_item = profile.to_dict()
    writing_item["media_id"] = media_id
    writing_item["media_name"] = media_name

    # Fetch the followers and following counts with a Selenium crawler
    browser.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    following_count = None
    followers_count = None
    try:
        following_count = browser.find_element_by_xpath(
            xpath["following_count"][0]).get_attribute("title")
        followers_count = browser.find_element_by_xpath(
            xpath["followers_count"][0]).get_attribute("title")
    except Exception:
        try:
            following_count = browser.find_element_by_xpath(
                xpath["following_count"][1]).get_attribute("title")
            followers_count = browser.find_element_by_xpath(
                xpath["followers_count"][1]).get_attribute("title")
        except Exception:
            print("Selenium failed to fetch the followers/following counts!")

    # Correct the scraped results with the Selenium values
    if following_count is not None:
        following_count = following_count.replace(",", "")
        print("Corrected following count:",
              writing_item["following_count"], "→", following_count)
        writing_item["following_count"] = following_count
    if followers_count is not None:
        followers_count = followers_count.replace(",", "")
        print("Corrected followers count:",
              writing_item["followers_count"], "→", followers_count)
        writing_item["followers_count"] = followers_count

    # Write the record to the database
    writing_list = [writing_item]
    write_num = mysql.insert_pure(mt, writing_list)
    print("Records stored:", write_num)
    print(writing_list)
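# Shape of the `xpath` argument crawler_item() expects, inferred from the
# lookups above: two candidate XPath expressions per metric, tried in order.
# The expressions below are illustrative placeholders, not verified selectors.
example_xpath = {
    "following_count": [
        '//a[contains(@href, "/following")]//span[@title]',  # primary guess
        '//a[contains(@href, "/following")]/span',           # fallback guess
    ],
    "followers_count": [
        '//a[contains(@href, "/followers")]//span[@title]',
        '//a[contains(@href, "/followers")]/span',
    ],
}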
def get_user_data(username: str) -> dict:
    """Gets user data.

    Args:
        username (str): Twitter username.

    Returns:
        dict: dictionary containing user info.
    """
    profile = Profile(username=username)
    return profile.to_dict()
def twitter_scraper(username):
    try:
        profile = Profile(username)
        data = profile.to_dict()
        data['pfp_url'] = data.pop('profile_photo')
        data['website'] = 'https://' + data['website'] if data['website'] else ''
        data['url'] = f'https://twitter.com/{username}'
        data.pop('username')
        return data
    except Exception:
        return "User not found"
def crawler(driver, user_name: str, template):
    """Scrape Twitter user information.

    Fills the name, username, birthday, biography, website, profile_photo,
    likes_count, tweets_count, followers_count and following_count fields of
    the data template.

    :param driver: <selenium.webdriver.chrome.webdriver.WebDriver> Chrome browser instance
    :param user_name: <str> Twitter username
    :param template: <dict> data template for the return value
    :return: <dict> the template filled with the scraped data
    """
    # Fetch account info with the twitter-scraper package
    # (followers/following counts may be wrong)
    try:
        profile = Profile(user_name).to_dict()
    except Exception:
        print("Account does not exist!")
        return
    print(profile)
    for key, value in profile.items():
        template[key] = value

    # Fetch the followers and following counts with a Selenium crawler
    driver.get("https://twitter.com/" + user_name)
    time.sleep(tool.get_scope_random(12))
    try:
        following_count = tool.fetch.number(
            driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[0]).text)
        followers_count = tool.fetch.number(
            driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[0]).text)
    except Exception:
        try:
            following_count = tool.fetch.number(
                driver.find_element_by_xpath(XPATH_FOLLOWING_COUNT[1]).text)
            followers_count = tool.fetch.number(
                driver.find_element_by_xpath(XPATH_FOLLOWERS_COUNT[1]).text)
        except Exception:
            print("Selenium failed to fetch the followers/following counts!")
            return template

    # Correct the scraped results with the Selenium values
    if abs(template["following_count"] - following_count) > 1000:
        print("Corrected following count:",
              template["following_count"], "→", following_count)
        template["following_count"] = following_count
    if abs(template["followers_count"] - followers_count) > 1000:
        print("Corrected followers count:",
              template["followers_count"], "→", followers_count)
        template["followers_count"] = followers_count
    return template
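# A sketch of the `template` dict crawler() fills, listing exactly the fields
# named in its docstring; the empty/zero defaults are assumptions.
blank_template = {
    "name": "", "username": "", "birthday": "", "biography": "",
    "website": "", "profile_photo": "",
    "likes_count": 0, "tweets_count": 0,
    "followers_count": 0, "following_count": 0,
}
# result = crawler(driver, "jack", blank_template)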
def searcher(hashes, pgcount):
    # Each page has 20 tweets
    global df
    for tweet in get_tweets(hashes, pages=pgcount):
        dic = {}
        dic['Link'] = "twitter.com/anyuser/status/" + tweet['tweetId']
        dic['Retweet'] = tweet['isRetweet']
        dic['Text'] = tweet['text']
        dic['Time'] = str(tweet['time'])
        dic['Replies'] = tweet['replies']
        dic['Retweets'] = tweet['retweets']
        dic['Likes'] = tweet['likes']
        dic['Hashtags'] = ''.join(tweet['entries']['hashtags'])
        dic['Photos'] = ''.join(tweet['entries']['photos'])
        dic['Urls'] = ''.join(tweet['entries']['urls'])
        dic['Videos'] = str(tweet['entries']['videos'])
        # Resolve the real username by following the status URL redirect
        page = urlopen("https://twitter.com/anyuser/status/" + tweet['tweetId'])
        uname = page.geturl()[20:].split("/", 1)[0]
        dic['Username'] = uname
        profile = Profile(uname)
        dic['P_Location'] = profile.location
        dic['P_Name'] = profile.name
        dic['P_Followers'] = profile.followers_count
        dic['P_Photo'] = profile.profile_photo
        df = df.append(dic, ignore_index=True)
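# searcher() appends to a module-level DataFrame, so `df` must exist with the
# matching columns before the first call; the column list is taken from the
# keys assigned above.
import pandas as pd

df = pd.DataFrame(columns=[
    'Link', 'Retweet', 'Text', 'Time', 'Replies', 'Retweets', 'Likes',
    'Hashtags', 'Photos', 'Urls', 'Videos', 'Username', 'P_Location',
    'P_Name', 'P_Followers', 'P_Photo'
])
# searcher('#python', pgcount=2)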
def getTopTweets(username, startDate, endDate, noTopTweets=5):
    # Scrape tweets
    tweetCriteria = got.manager.TweetCriteria().setUsername(username).setSince(
        startDate).setUntil(endDate)
    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    df = pd.DataFrame([tweet.__dict__ for tweet in tweets])
    originalDF = df
    df = df.drop(['author_id', 'date', 'geo', 'username'], axis=1)

    # Get keyword count
    df['keywordCount'] = df.apply(lambda x: getkeywordCount(x["text"]), axis=1)

    # Normalize counts
    for feature in ["retweets", "favorites", "replies", 'keywordCount']:
        df[feature + "NormalisedCount"] = df[feature] / (
            df[feature].max())  # - df[feature].min()

    # Find out if the tweet is a retweet of some other person's tweet.
    # We do this because retweeted tweets get a far greater reach and
    # thus skew the scores.
    df["ifTo"] = df.apply(lambda x: 0.6 if x["to"] else 1, axis=1)

    # Get the time elapsed since every tweet.
    # We hypothesize that older tweets get more exposure.
    df["formatted_date"] = df.apply(
        lambda x: x["formatted_date"][:-10] + x["formatted_date"][-4:], axis=1)
    df["timeElapsedHours"] = df.apply(
        lambda x: (pd.Timestamp(endDate) - pd.to_datetime(x["formatted_date"])
                   ).total_seconds() // 3600,
        axis=1)
    df["timeElapsedScore"] = 1 + 0.04 * (7 * 12 - df["timeElapsedHours"]) / 7 / 24

    # Assign a score to each tweet based on the normalized counts of
    # retweets, favorites, replies and keywords
    df["score"] = (1.5 * df["retweetsNormalisedCount"] +
                   1.2 * df["repliesNormalisedCount"] +
                   1 * df["favoritesNormalisedCount"] +
                   2 * df["keywordCount"]) * df["ifTo"] * df["timeElapsedScore"]
    df = df.drop([
        'id', 'keywordCount', 'retweetsNormalisedCount',
        'favoritesNormalisedCount', 'repliesNormalisedCount',
        'keywordCountNormalisedCount', 'ifTo', 'timeElapsedHours',
        'timeElapsedScore'
    ], axis=1)
    sortedDF = df.sort_values(by=['score'], inplace=False, ascending=False)

    # Get profile username, name and image link
    profile = Profile(username).to_dict()
    profileInformation = {}
    profileInformation["username"] = profile["username"]
    profileInformation["name"] = profile["name"]
    profileInformation["profile_photo"] = profile["profile_photo"]
    return sortedDF.head(noTopTweets), profileInformation
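# Example call, assuming GetOldTweets3 is imported as `got` and that the date
# arguments are "YYYY-MM-DD" strings as setSince()/setUntil() expect:
# topTweets, profileInformation = getTopTweets("jack", "2020-01-01",
#                                              "2020-01-08", noTopTweets=5)
# print(profileInformation["name"], topTweets[["text", "score"]])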
def refresh(
    self,
    info,
    username: str = None,
    user_id: strawberry.ID = None,
) -> User:
    if (not username and not user_id) or (username and user_id):
        raise ValueError("You must specify either a username or a user id!")

    kwargs = {}
    if username:
        kwargs["username"] = username
    elif user_id:
        kwargs["user_id"] = user_id

    user = models.User.objects(**kwargs).first()
    if not user:
        try:
            twitter_user = Profile(username)
        except IndexError:
            raise Exception("User does not exist!")
        else:
            user = models.User(
                username=username,
                user_id=twitter_user.user_id,
                bio=twitter_user.biography,
                profile_photo=twitter_user.profile_photo,
            )
    else:
        # Use the stored username so a lookup by user_id also works
        twitter_user = Profile(user.username)

    # Users that already have threads go to the low-priority queue
    if user.threads:
        queue_type = "low"
    else:
        queue_type = "high"
    q = Queue(queue_type, connection=conn)

    user.status = "Pending"
    user.bio = twitter_user.biography
    user.profile_photo = twitter_user.profile_photo
    user.save()
    q.enqueue(jobs.refresh_user_threads, user.username, job_timeout="3h")
    return user
def get_potential_followers(tag, pages=2):
    pfollowers = []
    for tweets in get_tweets(tag, pages):
        user = Profile(tweets['username'])
        try:
            # Users following at least as many accounts as follow them
            if user.following_count >= user.followers_count:
                print(tweets['username']
                      + " followers_count: " + str(user.followers_count)
                      + " following_count: " + str(user.following_count))
                pfollowers.append(tweets['username'])
        except Exception:
            pass
    return pfollowers
def fetch(username):
    # (label, Profile attribute, whether to show a numeric diff)
    values = (('Name', 'name', 0),
              ('Website', 'website', 0),
              ('Birthday', 'birthday', 0),
              ('Username', 'username', 0),
              ('Biography', 'biography', 0),
              ('Private', 'is_private', 0),
              ('Profile Photo', 'profile_photo', 0),
              ('Banner Photo', 'banner_photo', 0),
              ('Location', 'location', 0),
              ('Likes', 'likes_count', 1),
              ('Tweets', 'tweets_count', 1),
              ('Following', 'following_count', 1),
              ('Followers', 'followers_count', 1))
    profile = Profile(username)
    db = json_rw(1)
    str_list = ['birthday', 'is_private', 'website']
    if not db:
        # First run: seed the database without sending a message
        for _, name, __ in values:
            value = getattr(profile, name)
            if name in str_list:
                value = str(value)
            db[name] = value
    else:
        msg = ''
        for template, name, is_diff in values:
            value = getattr(profile, name)
            if name in str_list:
                value = str(value)
            if value == db[name]:
                continue
            if is_diff:
                diff = str(value - db[name])
                if '-' not in diff:
                    diff = f'+{diff}'
                diff = f'{value} ({diff})'
            db[name] = value
            msg += f'`{template}:` {diff}\n' if is_diff else f'`{template}:` {value}\n'
        if msg:
            telegram(msg)
    json_rw(0, db)
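# fetch() relies on a json_rw(mode, data=None) helper: mode 1 reads the stored
# snapshot (falsy on first run) and mode 0 writes it back. A minimal sketch
# under that assumption, with a hypothetical db.json path:
import json

def json_rw(read, data=None, path='db.json'):
    if read:
        try:
            with open(path) as f:
                return json.load(f)
        except FileNotFoundError:
            return {}  # first run: no snapshot yet
    with open(path, 'w') as f:
        json.dump(data, f)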
def handle_search(username_):
    # Check whether the username is available
    availability = ""
    background = ""
    padding = 50
    try:
        Profile(str(username_))  # succeeds only if the account exists
        availability = "Not Available"
        background = "red"
    except Exception:
        availability = "Available"
        background = "green"
        padding = 65
    display = Label(root, text=availability, bg=background, padx=padding)
    display.grid(row=4, column=1)
def add_user_twitter_scraper(username):
    """Add a user and their tweets to the database."""
    try:
        # Get user profile
        user_profile = Profile(username)

        # Add to User table (or fetch the existing row)
        db_user = (User.query.get(user_profile.user_id)
                   or User(id=user_profile.user_id, username=username))
        db.session.add(db_user)

        # Get tweets, ignoring retweets and replies
        tweets = list(get_tweets(username, pages=25))
        original_tweets = [
            tweet for tweet in tweets if tweet['username'] == username
        ]

        # # Add newest_tweet_id to the User table
        # if tweets:
        #     db_user.newest_tweet_id = tweets[0].id

        # Loop over the original tweets, get embeddings and add to Tweet table
        for tweet in original_tweets:
            # Get a Basilica embedding for the tweet text
            embedding = BASILICA.embed_sentence(tweet['text'], model='twitter')
            # Add tweet info to Tweet table
            db_tweet = Tweet(id=tweet['tweetId'],
                             text=tweet['text'][:300],
                             embedding=embedding)
            db_user.tweet.append(db_tweet)
            db.session.add(db_tweet)
    except Exception as e:
        print('Error processing {}: {}'.format(username, e))
        raise e
    else:
        db.session.commit()
import sys

from twitter_scraper import Profile, get_tweets
from twitter_functions import process_tweets

try:
    opts = [opt for opt in sys.argv[1:] if opt.startswith("-")]
    args = [arg for arg in sys.argv[1:] if not arg.startswith("-")]
    profile_name = args[0]
    tweets = {}
    if "--profilename" in opts or "-p" in opts:
        profile = Profile(profile_name)
        profile.to_dict()
        for tweet in get_tweets(profile_name, pages=1):
            if tweet['entries']['photos']:
                # Strip the trailing " pic.twitter.com/..." link from the text
                sep = ' pic'
                tweet_text = tweet['text'].split(sep, 1)[0]
                tweets.update([(tweet_text, tweet['entries']['photos'][0])])
        process_tweets(tweets)
except Exception:
    raise SystemExit(f"Usage: {sys.argv[0]} -p twitter_handle")
def scrap_bio(username):
    profile = Profile(username)
    return profile.to_dict()['biography']
def profile():
    user = request.args.get('user', default='crypto_rand', type=str)
    return json.dumps(Profile(user).to_dict(), default=default)
def scrape_user_id(username: str) -> Optional[int]:
    try:
        return int(Profile(username).user_id)
    except IndexError:
        # TODO: Rewrite Profile for better error handling.
        return None
from twitter_scraper import get_tweets, Profile
import twint
import sys
import io

if __name__ == '__main__':
    # for tweets in get_tweets('basadaplanta', pages=1):
    #     print(tweets['text'])

    # user = Profile('basadaplanta')
    # print(user.to_dict())

    # user = twint.Config()
    # user.Username = "******"
    # twint.run.Following(user)

    for tweets in get_tweets('#vegano'):
        user = Profile(tweets['username'])
        try:
            # Users following at least as many accounts as follow them
            if user.following_count >= user.followers_count:
                print(tweets['username']
                      + " followers_count: " + str(user.followers_count)
                      + " following_count: " + str(user.following_count))
        except Exception:
            pass
def twitterToRSS(user):
    items = []  # Collected feed items
    profile = Profile(user)  # Profile info for the feed header

    for tweet in get_tweets(user, pages=1):
        # Title creation: truncate long tweets
        if len(tweet['text']) < 50:
            title = tweet['text']
        else:
            title = f"{tweet['text'][:50]}..."

        itemExtensions = []  # Media extensions for this item
        if tweet['entries']['photos']:
            itemExtensions.append(
                MediaItem(
                    tweet['entries']['photos'][0],
                    'image/png',
                    'image',
                    True,
                ))
            description = (f"<img class='webfeedsFeaturedVisual' "
                           f"src='{tweet['entries']['photos'][0]}'>"
                           f"<p>{tweet['text']}</p>")
        else:
            itemExtensions.append(
                MediaItem(
                    profile.profile_photo,
                    'image/png',
                    'image',
                    True,
                ))
            description = tweet['text']

        if tweet['isRetweet']:
            author = f"@{tweet['username']}"
            title = f"(Retweet) {title}"
        else:
            author = f'{profile.name} (@{profile.username})'

        # An article for every tweet
        items.append(
            Item(title=title,
                 link=f"https://twitter.com{tweet['tweetUrl']}",
                 description=description,
                 author=author,
                 guid=Guid(f"https://twitter.com{tweet['tweetUrl']}"),
                 pubDate=tweet['time'],
                 extensions=itemExtensions))

    # Create the feed
    feed = Feed(
        title=(f"Tweets from {profile.name} (@{profile.username}) "
               f"- Twitter RSS by Kyle Williams"),
        link=f"https://twitter.com/{profile.username}",
        description=(f"{profile.biography} [Tweets Gathered by Twitter Scraper "
                     f"[twitter-scraper], Feed Generated by rfeed, "
                     f"Hosted By Repl.it]"),
        items=items,
        image=Image(profile.profile_photo,
                    f"Profile picture of {profile.name}",
                    f"https://twitter.com/{profile.username}"),
        extensions=[
            MediaContent(),
            Webfeeds(),
            WebfeedsIcon(profile.profile_photo),
            WebfeedsCover(get_banner(profile.username))
        ])

    # Tell Flask that the response is XML
    response = make_response(feed.rss())
    response.headers.set('Content-Type', 'application/rss+xml')
    return response  # Return the RSS feed
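# twitterToRSS() already returns a Flask response object, so wiring it up only
# needs a route; the `app` instance and URL pattern here are assumptions.
# @app.route('/rss/<user>')
# def rss(user):
#     return twitterToRSS(user)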